## Import Requirements and Data

In [185]:
# Mount Google Drive inside the Colab runtime; the CSV paths used below live under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [186]:
import pandas as pd
import numpy as np
import re
from sklearn import preprocessing
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# NOTE(review): this silences every warning for the rest of the session, not just
# known-noisy ones; consider narrowing the filter by category or module.
warnings.filterwarnings("ignore")
# Allow wide frames to show up to 100 columns when displayed.
pd.set_option("display.max_columns",100)

#Classification
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from xgboost import XGBClassifier

from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from sklearn.model_selection import train_test_split

In [187]:
def classification_(X, y, test_size=.2, random_state=101):
    """Fit six stock classifiers on one hold-out split and rank them by accuracy.

    Parameters
    ----------
    X : array-like or DataFrame
        Feature matrix; passed unchanged to ``train_test_split``.
    y : array-like or Series
        Target labels aligned with ``X``.
    test_size : float or int, default 0.2
        Hold-out size forwarded to ``train_test_split``.
    random_state : int, default 101
        Seed for the split and for every estimator that accepts one, so that
        repeated calls on the same data give the same table.

    Returns
    -------
    pandas.DataFrame
        One row per model (index = model name) with the columns
        ``AccuracyScore``, ``PrecisionScore``, ``RecallScore`` and ``f1_Score``
        (the last three use ``average='weighted'``), sorted by
        ``AccuracyScore`` in descending order.
    """
    # Name and estimator are kept together so the two can never get out of sync.
    # KNeighborsClassifier has no random_state parameter.
    models = {
        'XGBClassifier': XGBClassifier(random_state=random_state),
        'KNeighbors': KNeighborsClassifier(),
        'DecisionTree': DecisionTreeClassifier(random_state=random_state),
        'LogisticReg': LogisticRegression(random_state=random_state),
        'GradientBoosting': GradientBoostingClassifier(random_state=random_state),
        'RandomForest': RandomForestClassifier(random_state=random_state),
    }

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    score_rows = []
    for model in models.values():
        predicted = model.fit(X_train, y_train).predict(X_test)
        score_rows.append([
            accuracy_score(y_test, predicted),
            precision_score(y_test, predicted, average='weighted'),
            recall_score(y_test, predicted, average='weighted'),
            f1_score(y_test, predicted, average='weighted'),
        ])

    # Build the table in one step instead of filling an empty frame through
    # attribute assignment, which would silently do nothing useful on a typo.
    result = pd.DataFrame(
        score_rows,
        index=list(models),
        columns=['AccuracyScore', 'PrecisionScore', 'RecallScore', 'f1_Score'],
    )

    return result.sort_values('AccuracyScore', ascending=False)

In [188]:
# Directory on the mounted Drive that holds the service-disruption CSVs.
# Defined once so the location can be changed in a single place.
DATA_DIR = '/content/drive/Othercomputers/My Laptop/0 Yaz Kampı/PBL1/4 - Service Disruption'

# One table per feature family.
df1 = pd.read_csv(f'{DATA_DIR}/event_type.csv')
df2 = pd.read_csv(f'{DATA_DIR}/log_feature.csv')
df3 = pd.read_csv(f'{DATA_DIR}/resource_type.csv')
df4 = pd.read_csv(f'{DATA_DIR}/severity_type.csv')

In [189]:
# Preview the first rows of the table read from event_type.csv.
df1.head()

Unnamed: 0,id,event_type
0,6597,event_type 11
1,8011,event_type 15
2,2597,event_type 15
3,5022,event_type 15
4,5022,event_type 11


In [190]:
# Preview the first rows of the table read from log_feature.csv.
df2.head()

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1


In [191]:
# Preview the first rows of the table read from resource_type.csv.
df3.head()

Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8


In [192]:
# Preview the first rows of the table read from severity_type.csv.
df4.head()

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1


### Merge Data

In [193]:
# Inner-join the four feature tables on their shared `id` column. The key is
# spelled out rather than inferred from whichever column names happen to overlap.
# An id can appear on several rows of a table (e.g. 5022 in the previews above),
# so the result holds one row per combination of matching rows.
df5 = (
    df1.merge(df2, on='id')
       .merge(df3, on='id')
       .merge(df4, on='id')
)

In [194]:
# Preview the merged feature table.
df5.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type
0,6597,event_type 11,feature 68,6,resource_type 8,severity_type 2
1,8011,event_type 15,feature 68,7,resource_type 8,severity_type 2
2,2597,event_type 15,feature 68,1,resource_type 8,severity_type 2
3,5022,event_type 15,feature 172,2,resource_type 8,severity_type 1
4,5022,event_type 15,feature 56,1,resource_type 8,severity_type 1


In [195]:
# Drop the textual prefixes from every cell so only the trailing code remains
# (e.g. "event_type 11" -> "11", "feature 68" -> "68").
prefix_to_blank = {'event_type ':'', 'resource_type ':'' , 'severity_type ':'', 'feature ': ''}
df5 = df5.replace(to_replace=prefix_to_blank, regex=True)
df5

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type
0,6597,11,68,6,8,2
1,8011,15,68,7,8,2
2,2597,15,68,1,8,2
3,5022,15,172,2,8,1
4,5022,15,56,1,8,1
...,...,...,...,...,...,...
146418,8720,11,209,1,8,1
146419,6488,11,54,3,8,2
146420,878,11,62,1,8,2
146421,4464,11,209,1,8,1


In [196]:
# Labelled training rows, read from train.csv on the mounted Drive.
train_csv_path = '/content/drive/Othercomputers/My Laptop/0 Yaz Kampı/PBL1/4 - Service Disruption/train.csv'
df6 = pd.read_csv(train_csv_path)

In [197]:
# Remove the "location " prefix wherever it occurs, leaving just the location code.
df6 = df6.replace(to_replace={'location ':''}, regex=True)

In [198]:
# Attach the merged feature rows to the labelled training rows. The join key and
# join type are explicit instead of relying on pandas' defaults (shared column
# names, inner join). Each training id is repeated once per matching feature row.
df = df6.merge(df5, on='id', how='inner')
df

Unnamed: 0,id,location,fault_severity,event_type,log_feature,volume,resource_type,severity_type
0,14121,118,1,34,312,19,2,2
1,14121,118,1,34,232,19,2,2
2,14121,118,1,35,312,19,2,2
3,14121,118,1,35,232,19,2,2
4,9320,91,0,34,315,200,2,2
...,...,...,...,...,...,...,...,...
61834,15189,7,0,11,55,10,8,1
61835,15189,7,0,11,70,1,8,1
61836,17067,885,0,11,81,1,8,1
61837,17067,885,0,11,191,1,8,1


In [199]:
# Collapse the merged frame to one row per id, keeping the first occurrence.
# NOTE(review): the other merged rows for that id (its additional event_type /
# log_feature combinations) are discarded here rather than aggregated.
df = df.drop_duplicates(subset=['id'], keep='first')
df

Unnamed: 0,id,location,fault_severity,event_type,log_feature,volume,resource_type,severity_type
0,14121,118,1,34,312,19,2,2
4,9320,91,0,34,315,200,2,2
8,14394,152,1,35,221,1,2,2
12,8218,931,1,15,80,9,8,1
18,14804,120,0,34,134,1,2,1
...,...,...,...,...,...,...,...,...
61824,870,167,0,34,232,1,2,2
61828,18068,106,0,20,219,1,2,1
61830,14111,1086,2,15,82,21,8,2
61832,15189,7,0,11,191,3,8,1


## Prediction

In [200]:
# Target: the fault severity label.
y = df['fault_severity']
# Features: every other column, with non-numeric columns one-hot encoded
# (drop_first=True drops the first level of each encoded column).
# NOTE(review): `id` stays in x and is therefore fed to the models as a feature;
# a later cell also reads x_test.id, so removing it needs a coordinated change.
x = pd.get_dummies(df.drop(columns='fault_severity'), drop_first=True)

In [201]:
# Compare the candidate classifiers on a single hold-out split; displays the ranked score table.
classification_(x,y)

Unnamed: 0,AccuracyScore,PrecisionScore,RecallScore,f1_Score
GradientBoosting,0.694651,0.686729,0.694651,0.642413
XGBClassifier,0.693974,0.697087,0.693974,0.632081
RandomForest,0.686527,0.663732,0.686527,0.666286
DecisionTree,0.65606,0.644588,0.65606,0.649358
LogisticReg,0.654028,0.600993,0.654028,0.564633
KNeighbors,0.59174,0.511548,0.59174,0.527482


In [202]:
# Hold out 20% of the rows for the final model; fixed seed for a repeatable split.
holdout_split = train_test_split(x, y, test_size=0.2, random_state=4)
x_train, x_test, y_train, y_test = holdout_split

In [203]:
# Train a default XGBoost classifier, predict the hold-out rows, and report
# (accuracy, weighted F1) as the cell output.
xgb = XGBClassifier()
xgb.fit(x_train, y_train)
pred_xgb = xgb.predict(x_test)
(
    accuracy_score(y_test, pred_xgb),
    f1_score(y_test, pred_xgb, average='weighted'),
)

(0.6980365605958023, 0.6376705544313918)

In [204]:
# Per-class probabilities for the hold-out rows; the columns are indexed 0..2 in the next cell.
y_pred_proba_xgb = xgb.predict_proba(x_test)

In [205]:
# Assemble the output table: id, predicted class, then one probability column
# per class (columns 0, 1 and 2 of the predict_proba matrix).
proba_column_names = ['Prediction_probability_0','Prediction_probability_1','Prediction_probability_2']
result_data = {'id': x_test.id, 'Predicted fault_severity': pred_xgb}
# Transposing the matrix yields one 1-D array per class, in column order.
result_data.update(zip(proba_column_names, y_pred_proba_xgb.T))
result = pd.DataFrame(result_data)

In [206]:
# Preview the first rows of the prediction table.
result.head()

Unnamed: 0,id,Predicted fault_severity,Prediction_probability_0,Prediction_probability_1,Prediction_probability_2
33268,13885,0,0.723092,0.23621,0.040698
12656,17633,0,0.482095,0.114953,0.402952
55395,1367,0,0.835674,0.088127,0.076199
29880,6935,0,0.497152,0.416615,0.086233
13006,9329,0,0.480078,0.457216,0.062705


In [207]:
# Write the predictions to the working directory. index=True (the pandas default)
# keeps the frame's row labels as the first CSV column.
result.to_csv('service_disruptions.csv', index=True)