In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.linear_model import Perceptron
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,roc_auc_score

In [2]:
# Read the five input CSVs from the working directory, in this order.
event_data, log_data, resource_data, severity_data, train_data = (
    pd.read_csv(csv_name)
    for csv_name in (
        'event_type.csv',
        'log_feature.csv',
        'resource_type.csv',
        'severity_type.csv',
        'train.csv',
    )
)

In [3]:
# Show the last five rows of the event-type table.
event_data.tail(n=5)

Unnamed: 0,id,event_type
31165,3761,event_type 11
31166,8720,event_type 11
31167,6488,event_type 11
31168,878,event_type 11
31169,4464,event_type 11


In [4]:
# Show the last five rows of the log-feature table.
log_data.tail(5)

Unnamed: 0,id,log_feature,volume
58666,8720,feature 209,1
58667,6488,feature 54,3
58668,878,feature 62,1
58669,4464,feature 209,1
58670,4464,feature 87,2


In [5]:
# Show the last five rows of the resource-type table.
resource_data.tail(5)

Unnamed: 0,id,resource_type
21071,3761,resource_type 8
21072,8720,resource_type 8
21073,6488,resource_type 8
21074,878,resource_type 8
21075,4464,resource_type 8


In [6]:
# Show the last five rows of the severity-type table.
severity_data.tail(5)

Unnamed: 0,id,severity_type
18547,3761,severity_type 1
18548,8720,severity_type 1
18549,6488,severity_type 2
18550,878,severity_type 2
18551,4464,severity_type 1


In [7]:
# Show the last five rows of the training table (id, location, fault_severity).
train_data.tail(5)

Unnamed: 0,id,location,fault_severity
7376,870,location 167,0
7377,18068,location 106,0
7378,14111,location 1086,2
7379,15189,location 7,0
7380,17067,location 885,0


In [8]:
# Join event types to log features on the shared 'id' key (default inner join).
event_log = event_data.merge(log_data, on='id')

In [9]:
# Continue joining on 'id': first the resource types, then the severity types.
resource_severity = event_log.merge(resource_data, on='id')
data = resource_severity.merge(severity_data, on='id')

In [10]:
# Show the last five rows of the joined table.
data.tail(5)

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type
146418,8720,event_type 11,feature 209,1,resource_type 8,severity_type 1
146419,6488,event_type 11,feature 54,3,resource_type 8,severity_type 2
146420,878,event_type 11,feature 62,1,resource_type 8,severity_type 2
146421,4464,event_type 11,feature 209,1,resource_type 8,severity_type 1
146422,4464,event_type 11,feature 87,2,resource_type 8,severity_type 1


In [11]:
# Reduce labels such as "event_type 11" or "feature 209" to their trailing token
# ("11", "209"). Taking the LAST whitespace-separated token (rather than index 1)
# makes the cell safe to re-run: an already-stripped value like "11" maps to itself
# instead of raising IndexError. The values stay strings, so they are still treated
# as categorical labels downstream.
label_columns = ['event_type', 'log_feature', 'resource_type', 'severity_type']
data = data.assign(
    **{name: data[name].str.split().str[-1] for name in label_columns}
)

In [12]:
# Reduce "location 167" to "167". Using the last token keeps the cell idempotent:
# re-running it on an already-stripped value returns the same value instead of
# raising IndexError.
train_data = train_data.assign(location=train_data['location'].str.split().str[-1])

In [13]:
# List any fully duplicated rows in the joined table (the boolean mask is used directly).
data[data.duplicated()]

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type


In [14]:
# Attach location and fault_severity from the training table; the default inner
# join keeps only ids present in both frames.
data = data.merge(train_data, on='id')
data.head(5)

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,15,68,7,8,2,1,0
1,2588,15,82,9,8,1,1,0
2,2588,15,201,5,8,1,1,0
3,2588,15,80,15,8,1,1,0
4,2588,15,203,5,8,1,1,0


In [15]:
# Show the first two rows of the merged table.
data.head(2)

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,15,68,7,8,2,1,0
1,2588,15,82,9,8,1,1,0


In [16]:
# Columns passed to the encoding step: the id key, the feature columns, and the label.
columns = [
    'id',
    'event_type',
    'log_feature',
    'volume',
    'resource_type',
    'severity_type',
    'location',
    'fault_severity',
]

In [17]:
# One-hot encode the string-valued columns; drop_first=True omits the first level
# of each encoded column. Integer columns are passed through as they are.
X = pd.get_dummies(
    data[columns],
    drop_first=True,
)
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 61839 entries, 0 to 61838
Columns: 1322 entries, id to location_999
dtypes: int64(3), uint8(1319)
memory usage: 79.7 MB


In [18]:
# Build one label per id by taking the maximum fault_severity over that id's rows.
# After the groupby, 'id' is the index of target, not a column.
columns2 = ['id', 'fault_severity']
target = X[columns2].groupby('id').max()

In [25]:
# Show the first five per-id labels.
target.head(5)

Unnamed: 0_level_0,fault_severity
id,Unnamed: 1_level_1
1,1
5,0
6,1
8,0
13,0


In [19]:
# Collapse to one row per id by summing every column. Note that fault_severity is
# summed here as well, so it is not a valid label in this frame.
X = X.groupby('id').sum()

In [20]:
# Preview a bounded number of rows instead of rendering the whole feature frame.
X.head(10)

Unnamed: 0_level_0,volume,fault_severity,event_type_10,event_type_11,event_type_12,event_type_13,event_type_14,event_type_15,event_type_18,event_type_19,...,location_987,location_989,location_99,location_990,location_991,location_994,location_995,location_996,location_998,location_999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,20,12,0.0,6.0,0.0,6.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,34,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,2,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,32,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,14,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,18,0,6.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,12,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
24,2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
26,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# groupby('id') moved the ids into target's index, so there is no 'id' attribute
# or column; read the ids from the index instead.
target.index

In [22]:
# The ids are target's index (set by the groupby on 'id'), not a column.
idt = target.index
# Keep only the label column, as a Series indexed by id.
target = target['fault_severity']
# X and target were both grouped by 'id'; confirm the rows line up before modelling.
assert X.index.equals(target.index), "X and target are not aligned on id"
# Remove the label from the features. Reassigning avoids in-place mutation.
X = X.drop(columns=['fault_severity'])

In [None]:
# Display the (rows, columns) dimensions of the feature frame.
X.shape

In [None]:
# Display the dimensions of the label object.
target.shape

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state makes the split repeatable.
Y = target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.20,
    random_state=0,
)

In [None]:
# Show the first five training feature rows.
X_train.head(5)

In [None]:
# Show the first five rows of the full feature frame.
X.head(5)

In [None]:
# Show the first five training feature rows.
X_train.head(n=5)

In [None]:
# Show the first five held-out feature rows.
X_test.head(n=5)

In [None]:
# Show the first five training labels.
y_train.head(n=5)

In [None]:
# Logistic Regression: fit on the training split and report accuracy (in percent,
# rounded to two decimals) on the held-out split.
logreg = LogisticRegression()
logreg.fit(X_train, y_train)
y_pred = logreg.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): pass the true labels first.
acc_logreg = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_logreg)

In [None]:
# Random Forest: fit on the training split and report held-out accuracy in percent.
# random_state is fixed so the forest, and therefore the score, is reproducible
# across runs; 0 matches the seed used for the train/test split.
randomforest = RandomForestClassifier(random_state=0)
randomforest.fit(X_train, y_train)
y_pred = randomforest.predict(X_test)
# accuracy_score's signature is (y_true, y_pred): pass the true labels first.
acc_randomforest = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_randomforest)

In [None]:
# Gradient Boosting: fit on the training split, keep both hard predictions and
# class probabilities, and report held-out accuracy in percent.
# random_state is fixed so the result is reproducible across runs.
gbk = GradientBoostingClassifier(
    learning_rate=0.1,
    n_estimators=100,
    subsample=1.0,
    random_state=0,
)
gbk.fit(X_train, y_train)
y_pred_gbk = gbk.predict(X_test)
y_pred_proba_gbk = gbk.predict_proba(X_test)
# Score this model's own predictions (y_pred_gbk). Scoring the shared y_pred
# would report whichever model last assigned that name, not gradient boosting.
acc_gbk = round(accuracy_score(y_test, y_pred_gbk) * 100, 2)
print(acc_gbk)

In [None]:
# K-Nearest Neighbours: fit on the training split, keep both hard predictions and
# class probabilities, and report held-out accuracy in percent.
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)
y_pred_kn = kn.predict(X_test)
y_pred_proba_kn = kn.predict_proba(X_test)
# Score this model's own predictions (y_pred_kn). Scoring the shared y_pred
# would report whichever model last assigned that name, not KNN.
acc_kn = round(accuracy_score(y_test, y_pred_kn) * 100, 2)
print(acc_kn)

In [None]:
# Decision tree: fit on the training split, keep predictions and class
# probabilities, and report held-out accuracy in percent.
# random_state is fixed so the tree, and therefore the score, is reproducible across runs.
decisiontree = DecisionTreeClassifier(random_state=0)
decisiontree.fit(X_train, y_train)
y_pred = decisiontree.predict(X_test)
y_pred_proba = decisiontree.predict_proba(X_test)
# accuracy_score's signature is (y_true, y_pred): pass the true labels first.
acc_decisiontree = round(accuracy_score(y_test, y_pred) * 100, 2)
print(acc_decisiontree)

In [None]:
# Collect each model's accuracy into one table and show it best-first.
model_names = [
    'KNN',
    'Logistic Regression',
    'Random Forest',
    'Decision Tree',
    'Gradient Boosting Classifier',
]
model_scores = [acc_kn, acc_logreg, acc_randomforest, acc_decisiontree, acc_gbk]
metrics = pd.DataFrame({'Metric': model_names, 'Score': model_scores})
metrics.sort_values(by='Score', ascending=False)

In [None]:
# Wrap the current y_pred array in a single-column DataFrame. y_pred holds the
# predictions of whichever model cell assigned it most recently.
result = pd.DataFrame(y_pred)

In [None]:
# X was grouped by 'id', so the ids of the held-out rows are X_test's index;
# there is no 'id' column to access as an attribute.
X_test.index

In [None]:
# The ids are target's index (set by the groupby on 'id'), so read them from there.
target.index

In [None]:
# Assemble the KNN predictions for the held-out rows. The ids must come from
# X_test's index: predictions exist only for the test split, so using the ids of
# the full label set would not match the prediction arrays in length.
result_columns = [
    'id',
    'Predicted fault_severity',
    'Prediction probability 0',
    'Prediction probability 1',
    'Prediction probability 2',
]
result = pd.DataFrame(
    {
        "id": X_test.index,
        "Predicted fault_severity": y_pred_kn,
        "Prediction probability 0": y_pred_proba_kn[:, 0],
        "Prediction probability 1": y_pred_proba_kn[:, 1],
        "Prediction probability 2": y_pred_proba_kn[:, 2],
    },
    columns=result_columns,
)

In [None]:
# Show the last five rows of the prediction table.
result.tail(5)

In [None]:
# Count how many prediction rows carry id 4582.
result.loc[result['id'] == 4582, 'id'].value_counts()

In [None]:
# Count occurrences of each id in the prediction table.
result['id'].value_counts()