# Predicting the severity of service disruption

In [1]:
# import data from csv into pandas dataFrame
import pandas as pd
# All six input tables live in one directory. The default below is the
# location this notebook was originally run against; set the
# SERVICE_DISRUPTION_DATA_DIR environment variable to read the same files
# from somewhere else instead of editing six separate absolute paths.
import os

DATA_DIR = os.environ.get(
    'SERVICE_DISRUPTION_DATA_DIR',
    '/Users/sumantbhandari/Desktop/Data Analysis',
)

# header=0: the first line of every file holds the column names
severity_type = pd.read_csv(os.path.join(DATA_DIR, 'severity_type.csv'), header=0)
log_feature = pd.read_csv(os.path.join(DATA_DIR, 'log_feature.csv'), header=0)
train = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'), header=0)
test = pd.read_csv(os.path.join(DATA_DIR, 'test.csv'), header=0)
event_type = pd.read_csv(os.path.join(DATA_DIR, 'event_type.csv'), header=0)
resource_type = pd.read_csv(os.path.join(DATA_DIR, 'resource_type.csv'), header=0)

In [2]:
# One-hot encode the categorical columns of each lookup table.
# pd.get_dummies is applied to the whole frame, once per table, and the
# results are bound to the four *_dummies names in the same order.
severity_type_dummies, log_feature_dummies, event_type_dummies, resource_type_dummies = (
    pd.get_dummies(raw_table)
    for raw_table in (severity_type, log_feature, event_type, resource_type)
)

In [3]:
# An id can appear on several rows of these tables; group on id and add the
# rows together so each table ends up with a single row per id.
event_type_dummies, resource_type_dummies, severity_type_dummies = (
    indicator_frame.groupby(indicator_frame["id"]).sum()
    for indicator_frame in (event_type_dummies, resource_type_dummies, severity_type_dummies)
)

In [4]:
# Use volume as a weight: multiply every log_feature indicator column by the
# row's volume, then collapse to one row per id.

# Indicator columns = every column except the id key and the volume weight.
# Selected by name so the result does not depend on column position.
columns = log_feature_dummies.columns.drop(["id", "volume"])
# Row-wise multiplication of each indicator column by that row's volume
log_feature_dummies_mulVol = log_feature_dummies[columns].multiply(log_feature_dummies["volume"], axis='index')
# Put the id back so the weighted rows can be grouped
log_feature_dummies_mulVol["id"] = log_feature_dummies.id
# One row per id: sum of the volume-weighted indicators
log_feature_dummies_mulVol = log_feature_dummies_mulVol.groupby("id").sum()

# Feature: AggregateVol = total volume logged for an id.
# "volume" is selected explicitly so the sum never touches the string
# log_feature column; summing the whole frame only yields a single column on
# pandas versions that silently drop non-numeric columns.
Df_withAggVol = log_feature.groupby("id")[["volume"]].sum()
Df_withAggVol.columns = ["AggregateVol"]

# Combine the weighted indicators and AggregateVol, keeping ids present in both
log_feature_dummies_sumVol = pd.concat([log_feature_dummies_mulVol, Df_withAggVol], axis=1, join="inner")

In [14]:
# Feature: time_seq.
# Stack train on top of test, join that onto severity_type by id, and number
# the joined rows 0, 1, 2, ... within each location in the order they appear.
# NOTE(review): treating this counter as a time sequence presumes the row
# order of severity_type is chronological -- confirm against the data source.
trainPlusTestDF_Sev = pd.concat((train, test), axis=0)
severity_with_location = severity_type.merge(trainPlusTestDF_Sev, on="id")
severity_with_location["time_seq"] = severity_with_location.groupby("location").cumcount()
merged_SevTestdf = severity_with_location.set_index("id")
time_seq = merged_SevTestdf["time_seq"]

In [17]:
# Preview only the first rows; displaying the whole merged frame bloats the notebook
merged_SevTestdf.head(10)

Unnamed: 0_level_0,severity_type,fault_severity,location,time_seq
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
6597,severity_type 2,,location 1,0
8011,severity_type 2,0.0,location 1,1
2597,severity_type 2,,location 1,2
5022,severity_type 1,,location 1,3
6852,severity_type 1,,location 1,4
5611,severity_type 2,,location 1,5
14838,severity_type 1,,location 1,6
2588,severity_type 1,0.0,location 1,7
4848,severity_type 1,0.0,location 1,8
6914,severity_type 1,0.0,location 1,9


In [6]:
# Feature: volume_per_location.
# Step 1: for each location, take the MEAN log volume over all log rows that
#         belong to train+test ids at that location.
# Step 2: attach that mean to every log row and sum it per id.
# NOTE(review): because step 2 sums a per-location constant over an id's log
# rows, the value is (location mean) x (number of log rows for the id),
# presuming each id maps to one location -- confirm this weighting is intended.
trainPlusTestDF = pd.concat([train, test], axis=0)
merged_LogTestdf = pd.merge(log_feature, trainPlusTestDF, on='id')
# Select "volume" explicitly: averaging the whole frame would also try to
# average the string log_feature column, which recent pandas refuses to do.
# reset_index() turns location into an ordinary column; keeping it as both
# the index name and a column makes the merge key ambiguous.
grouped_LogTestdf = (
    merged_LogTestdf.groupby("location")[["volume"]]
    .mean()
    .rename(columns={"volume": "volume_per_location"})
    .reset_index()
)
volume_per_location_df = pd.merge(grouped_LogTestdf, merged_LogTestdf, on='location')
# Sum only the numeric columns, one row per id
volume_per_location_df = volume_per_location_df.groupby("id")[
    ["volume_per_location", "volume", "fault_severity"]
].sum()
volume_per_location = volume_per_location_df["volume_per_location"]


In [7]:
# Append the engineered per-id features (time_seq, then volume_per_location)
# as new columns; each inner join keeps only the ids found on both sides.
for engineered_feature in (time_seq, volume_per_location):
    log_feature_dummies_sumVol = pd.concat(
        [log_feature_dummies_sumVol, engineered_feature], axis=1, join="inner"
    )

In [8]:
# Build the model matrices: put every per-id feature table side by side,
# then keep the ids that occur in train (resp. test).
mergedAttributes_df = pd.concat([log_feature_dummies_sumVol, event_type_dummies,resource_type_dummies, severity_type_dummies], axis=1)
# set_index("id") consumes the id column, so re-running this cell would raise
# a KeyError; only re-index while id is still a regular column.
if "id" in train.columns:
    train = train.set_index("id")
if "id" in test.columns:
    test = test.set_index("id")
# Inner joins on the id index: features + (location, fault_severity) for
# train, features + location for test
finaltrain_df = pd.concat([mergedAttributes_df, train], axis=1, join ="inner")
finaltest_df = pd.concat([mergedAttributes_df, test], axis=1, join ="inner")

In [9]:
# Report (rows, columns) of the train matrix, then of the test matrix
for model_frame in (finaltrain_df, finaltest_df):
    print(model_frame.shape)

(7381, 459)
(11171, 458)


In [10]:
# True if any cell of the test matrix is null (reduce per column, then overall)
finaltest_df.isnull().any().any()

False

In [11]:
# Split the target off the training matrix: keep fault_severity as Y_train
# and remove that column from finaltrain_df in place.
Y_train = finaltrain_df['fault_severity']
del finaltrain_df['fault_severity']


In [12]:
# Encode location as a numeric feature: "location 123" -> 123.0.
# Slice off the full prefix, including the space, by its length rather than a
# bare number. The parsed value is unchanged because float parsing ignores
# leading whitespace.
LOCATION_PREFIX = "location "
finaltrain_df['location_id'] = finaltrain_df['location'].str[len(LOCATION_PREFIX):].astype(float)
# The raw string column cannot be fed to the model; drop it in place
finaltrain_df.pop('location')
print("ok")

ok


In [15]:
#XGBoost
import xgboost as xg
# Final model: gradient-boosted trees fitted on the complete training matrix.
# Only max_depth is overridden; every other setting is the library default.
xgb_params = {"max_depth": 8}
clf = xg.XGBClassifier(**xgb_params)
clf.fit(finaltrain_df, Y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=1,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=8,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='multi:softprob', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [16]:
# Score the test set with the fitted classifier.
# First derive location_id from the location string and drop the raw column,
# so the test matrix has a numeric location feature instead of text.
finaltest_df['location_id'] = finaltest_df['location'].str[8:].astype(float)
del finaltest_df['location']
# One probability column per class, rows in the same order as finaltest_df
a = clf.predict_proba(finaltest_df)
finaltestsubmit_df = pd.DataFrame(
    a,
    index=pd.Index(finaltest_df.index, name="id"),
    columns=["predict_0", "predict_1", "predict_2"],
)
finaltestsubmit_df.head()

Unnamed: 0_level_0,predict_0,predict_1,predict_2
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11066,0.983432,0.015949,0.000618
18000,0.086721,0.139683,0.773596
16964,0.989321,0.010137,0.000542
4795,0.942497,0.054193,0.00331
3392,0.380555,0.594,0.025445


In [17]:
# generate csv
# Writes the prediction table (id index plus predict_0..predict_2) as CSV
# text to a file named "final_submission" in the current working directory.
# NOTE(review): the file name has no ".csv" extension -- confirm that is intended.
finaltestsubmit_df.to_csv("final_submission")

# Below is the code used for tuning parameters and experimenting with other algorithms

In [13]:
# Create validation set: hold out 25% of the training rows (fixed seed).
# sklearn.cross_validation was removed from scikit-learn; the same function
# now lives in sklearn.model_selection. Fall back to the old module only on
# installations that predate model_selection.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(finaltrain_df, Y_train, test_size=0.25, random_state=22)



In [None]:
# Normalize - did not improve output for Tree based models. 
#location = finaltrain_df.location
#import numpy as np
#Normalized_df = finaltrain_df.ix[:, finaltrain_df.columns != "location"].apply(lambda x: ((x - np.min(x)) / (np.max(x) - np.min(x)))\
#                                                                               if np.max(x) != 0 else 0)
#Normalized_df['location'] = location
#finaltrain_df = Normalized_df

In [14]:
import sklearn.linear_model as l
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Baseline: logistic regression with default settings, fitted on the training
# part of the split and scored on the held-out part.
clf = l.LogisticRegression()
clf.fit(X_train, y_train)
a = clf.predict(X_test)

# Plain lists of true and predicted labels, shared by both metrics
held_out_labels = list(np.array(y_test))
predicted_labels = list(a)
print(confusion_matrix(held_out_labels, predicted_labels))
print(accuracy_score(held_out_labels, predicted_labels))


[[1083   83   17]
 [ 342  122   13]
 [  73   14   99]]
0.70639219935


In [27]:
#for validation purpose
#RandomForest
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# A random forest is stochastic; pin its seed (same value as the validation
# split) so the confusion matrix and accuracy below can be reproduced.
clf = RandomForestClassifier(n_estimators=1000, max_depth=30, random_state=22)
clf.fit(X_train, y_train)
a=clf.predict(X_test)
# Rows of the confusion matrix are true classes, columns are predicted classes
print(confusion_matrix(list(np.array(y_test)), list(a)))
print(accuracy_score(list(np.array(y_test)), list(a)))

[[1063  108   12]
 [ 217  236   24]
 [  37   36  113]]
0.764897074756


In [20]:
#Validation
#xgboost
import xgboost as xg
import numpy as np
from matplotlib import pyplot
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

# Boosted trees with max_depth=9, fitted on the training part of the split
clf = xg.XGBClassifier(max_depth=9)
clf.fit(X_train, y_train)
#pyplot.bar(range(len(clf.feature_importances_)), clf.feature_importances_)
#pyplot.show()
a = clf.predict(X_test)

# Score the held-out part: confusion matrix first, then plain accuracy
held_out_labels = list(np.array(y_test))
predicted_labels = list(a)
print(confusion_matrix(held_out_labels, predicted_labels))
print(accuracy_score(held_out_labels, predicted_labels))

[[1073  100   10]
 [ 180  271   26]
 [  30   23  133]]
0.800108342362


In [93]:
#Validation - this one with cross validation
#RandomForest
import numpy as np
from sklearn.ensemble import RandomForestClassifier
# sklearn.cross_validation was removed from scikit-learn. Prefer
# sklearn.model_selection, whose KFold takes n_splits and no sample count;
# fall back to the old module and its old signature only where the new one
# is unavailable. Both build 5 shuffled folds from seed 0.
try:
    from sklearn.model_selection import KFold, cross_val_score
except ImportError:
    from sklearn.cross_validation import KFold, cross_val_score
    k_fold = KFold(len(finaltrain_df), n_folds=5, shuffle=True, random_state=0)
else:
    k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
# Seed the forest as well, otherwise the fold scores change on every run
clf = RandomForestClassifier(n_estimators=1000, max_depth=30, random_state=0)
# Leaves clf fitted on the full training set; cross_val_score below fits its
# own copies of the estimator per fold and does not depend on this fit.
clf.fit(finaltrain_df, Y_train)
# One accuracy value per fold
print(cross_val_score(clf, finaltrain_df, Y_train, cv=k_fold, n_jobs=1))

[ 0.80974949  0.79403794  0.78794038  0.79674797  0.79200542]


In [94]:
#Validation - this one with cross validation
#xgboost
import xgboost as xg
import numpy as np
# sklearn.cross_validation was removed from scikit-learn. Prefer
# sklearn.model_selection, whose KFold takes n_splits and no sample count;
# fall back to the old module and its old signature only where the new one
# is unavailable. Both build 5 shuffled folds from seed 0.
try:
    from sklearn.model_selection import KFold, cross_val_score
except ImportError:
    from sklearn.cross_validation import KFold, cross_val_score
    k_fold = KFold(len(finaltrain_df), n_folds=5, shuffle=True, random_state=0)
else:
    k_fold = KFold(n_splits=5, shuffle=True, random_state=0)
clf = xg.XGBClassifier(max_depth=8)
# Leaves clf fitted on the full training set; cross_val_score below fits its
# own copies of the estimator per fold and does not depend on this fit.
clf.fit(finaltrain_df, Y_train)
# One accuracy value per fold
print(cross_val_score(clf, finaltrain_df, Y_train, cv=k_fold, n_jobs=1))

[ 0.81719702  0.81300813  0.79878049  0.8096206   0.79945799]
