## Load all necessary libraries

In [64]:
import pandas as pd
import numpy as np 
import warnings
warnings.filterwarnings("ignore")
from tqdm import tqdm


## Load the Data sets

In [65]:
train_raw= pd.read_csv('Data/train.csv')
test_raw= pd.read_csv('Data/test.csv')
print(train_raw.shape)
print(test_raw.shape)

(10000, 12)
(2500, 11)


## get rid of outliers


In [66]:
from scipy import stats
train_raw=train_raw[(np.abs(stats.zscore(train_raw.drop(['Severity','Accident_ID'], axis=1))) < 3).all(axis=1)]

train_raw.shape

(9323, 12)

## Declare the dependant and Independent Variables 

In [67]:
y = train_raw['Severity'].values
X = train_raw.drop(['Severity','Accident_ID'], axis=1)
X_test =test_raw.drop('Accident_ID',axis=1)

# Vectoring the Outputs by label encoding them 

In [68]:
from sklearn.preprocessing import LabelEncoder

output_encoder=LabelEncoder()
y=output_encoder.fit_transform(y)

y.shape

(9323,)

## Split data Into train and Validation



In [69]:
from sklearn.model_selection import train_test_split

X_train,X_val,y_train, y_val=train_test_split(X, y, test_size=0.1, random_state=0)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)
print(y_val.shape)


(8390, 10)
(933, 10)
(2500, 10)
(8390,)
(933,)


In [70]:
from sklearn.preprocessing import StandardScaler


for column in X_train.columns:
    if column !='Accident_Type_Code':
        print(column)
        scaler = StandardScaler()
        scaler.fit(X_train[column].values.reshape(-1,1))

        # we use the fitted CountVectorizer to convert the text to vector
        train_column=scaler.transform(X_train[column].values.reshape(-1,1))
        val_column=scaler.transform(X_val[column].values.reshape(-1,1))
        test_column=scaler.transform(X_test[column].values.reshape(-1,1))
    
        X_train[column]= train_column
        X_val[column]= val_column
        X_test[column]=test_column




Safety_Score
Days_Since_Inspection
Total_Safety_Complaints
Control_Metric
Turbulence_In_gforces
Cabin_Temperature
Max_Elevation
Violations
Adverse_Weather_Metric


In [71]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# integer encode
label_encoder = LabelEncoder()
label_encoder.fit(X_train['Accident_Type_Code'].values.reshape(-1, 1))

X_train_label=label_encoder.transform(X_train['Accident_Type_Code'].values.reshape(-1, 1))
X_val_label=label_encoder.transform(X_val['Accident_Type_Code'].values.reshape(-1, 1))
X_test_label=label_encoder.transform(X_test['Accident_Type_Code'].values.reshape(-1, 1))

# binary encode
onehot_encoder = OneHotEncoder(sparse=False)
X_train_label = X_train_label.reshape(len(X_train_label), 1)
X_val_label = X_val_label.reshape(len(X_val_label), 1)
X_test_label = X_test_label.reshape(len(X_test_label), 1)


onehot_encoder.fit(X_train_label)

X_train_one=onehot_encoder.transform(X_train_label)
X_val_one=onehot_encoder.transform(X_val_label)
X_test_one=onehot_encoder.transform(X_test_label)

print(X_train_one.shape)
print(X_val_one.shape)
print(X_test_one.shape)



(8390, 7)
(933, 7)
(2500, 7)


In [72]:
X_train=np.hstack((X_train,X_train_one))
X_val=np.hstack((X_val,X_val_one))
X_test=np.hstack((X_test,X_test_one))


print(X_train.shape)
print(X_val.shape)
print(X_test.shape)
print(y_train.shape)
print(y_val.shape)

(8390, 17)
(933, 17)
(2500, 17)
(8390,)
(933,)


## Applying Logistic Regression with HyperParameter Tuning

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import RandomizedSearchCV

summary=[]

#hyperparameters
C=[0.00001,0.0001,0.001,0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10,100,200,300,400,500,600,700,800,900,1000,10000]
penalty=['l1','l2','elasticnet','none']
dual=[True,False]
fit_intercept=[True,False]
class_weight=['balanced',None]
solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']

# Create hyperparameter options
hyperparameters = dict(C=C, penalty=penalty,dual=dual,fit_intercept=fit_intercept,class_weight=class_weight,solver=solver)

# create instance of model
lr=LogisticRegression()

# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(lr, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)

# fit the data
best_lr=clf.fit(X_train,y_train)

# predict the output
y_pred=best_lr.predict(X_val)

#evaluate f1 score 
f1=f1_score(y_val,y_pred,average='weighted')

#add into list to view in table 
summary.append(['logistic Regression',best_lr.best_estimator_.get_params(),f1])

## Applying SVM Regression with HyperParameter Tuning

In [74]:
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import RandomizedSearchCV


C=[0.00001,0.0001,0.001,0.01,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9,1,10,100,200,300,400,500,600,700,800,900,1000,10000]
penalty=['l1','l2','elasticnet']
fit_intercept=[True,False]
shuffle=[True,False]
class_weight=['balanced',None]


# Create hyperparameter options
hyperparameters = dict(alpha=C, penalty=penalty,fit_intercept=fit_intercept,shuffle=shuffle,class_weight=class_weight)    

# create instance of model
sgd=SGDClassifier(loss='hinge')

# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(sgd, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)

# fit the data
best_sgd=clf.fit(X_train,y_train)

# predict the output
y_pred=best_sgd.predict(X_val)

#evaluate f1 score 
f1=f1_score(y_val,y_pred,average='weighted')

#add into list to view in table 
summary.append(['SVM',best_sgd.best_estimator_.get_params(),f1])

## Applying KNN Model

In [75]:
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import RandomizedSearchCV


n_neighbors=range(1,100,5)
weights=['uniform','distance']
algorithm=['auto', 'ball_tree', 'kd_tree','brute']


# Create hyperparameter options
hyperparameters = dict(n_neighbors=n_neighbors, weights=weights,algorithm=algorithm)    

# create instance of model
knn=KNeighborsClassifier()

# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(knn, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)

# fit the data
best_knn=clf.fit(X_train,y_train)

# predict the output
y_pred=best_knn.predict(X_val)

#evaluate f1 score 
f1=f1_score(y_val,y_pred,average='weighted')

#add into list to view in table 
summary.append(['knn',best_knn.best_estimator_.get_params(),f1])

## Applying Descision Tree Model

In [76]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import RandomizedSearchCV

min_samples_split=range(2,100,5)
min_samples_leaf=range(1,100,5)
criterion = ['gini', 'entropy']
splitter=['best', 'random']
max_features=['auto', 'sqrt', 'log2']
class_weight=['balanced',None]





# Create hyperparameter options
hyperparameters = dict(min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,criterion=criterion,splitter=splitter,max_features=max_features,class_weight=class_weight)    

# create instance of model
DT=DecisionTreeClassifier()

# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(DT, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)

# fit the data
best_dt=clf.fit(X_train,y_train)

# predict the output
y_pred=best_dt.predict(X_val)

#evaluate f1 score 
f1=f1_score(y_val,y_pred,average='weighted')

#add into list to view in table 
summary.append(['DT',best_dt.best_estimator_.get_params(),f1])

## Applying Random Forest  Model

In [77]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

min_samples_split=range(2,100,5)
min_samples_leaf=range(1,100,5)
criterion = ['gini', 'entropy']
n_estimators=range(1,1000,25)
bootstrap=[True,False]
class_weight=['balanced',None]





# Create hyperparameter options
hyperparameters = dict(min_samples_split=min_samples_split, min_samples_leaf=min_samples_leaf,criterion=criterion,
                       n_estimators=n_estimators,bootstrap=bootstrap,class_weight=class_weight)    

# create instance of model
RF=RandomForestClassifier()

# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(RF, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)

# fit the data
best_rf=clf.fit(X_train,y_train)

# predict the output
y_pred=best_rf.predict(X_val)

#evaluate f1 score 
f1=f1_score(y_val,y_pred,average='weighted')

#add into list to view in table 
summary.append(['RF',best_rf.best_estimator_.get_params(),f1])

## Applying Gradient boosted Descision Tree Model



In [78]:
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV

max_depth=range(2,100,5)
learning_rate =np.arange(0,1,0.1)
n_estimators=range(1,1000,25)
min_child_weight = [ 1, 3, 5, 7 ]
gamma  = [ 0.0, 0.1, 0.2 , 0.3, 0.4 ]
colsample_bytree = [ 0.3, 0.4, 0.5 , 0.7 ]






# Create hyperparameter options
hyperparameters = dict(max_depth=max_depth, learning_rate=learning_rate,n_estimators=n_estimators,min_child_weight=min_child_weight,
                      gamma=gamma,colsample_bytree=colsample_bytree)    

# create instance of model
GBDT=XGBClassifier()

# Create randomized search 5-fold cross validation and 100 iterations
clf = RandomizedSearchCV(GBDT, hyperparameters, random_state=1, n_iter=100, cv=5, verbose=0, n_jobs=-1)

# fit the data
best_gb=clf.fit(X_train,y_train)

# predict the output
y_pred=best_gb.predict(X_val)

#evaluate f1 score 
f1=f1_score(y_val,y_pred,average='weighted')

#add into list to view in table 
summary.append(['XGDT',best_gb.best_estimator_.get_params(),f1])

In [79]:
result=pd.DataFrame(summary,columns=['Model','Best Hyper parameters','f1-score'])
result.to_csv('model_result.csv',index=False)

In [80]:
result

Unnamed: 0,Model,Best Hyper parameters,f1-score
0,logistic Regression,"{'C': 800, 'class_weight': 'balanced', 'dual':...",0.647304
1,SVM,"{'alpha': 0.001, 'average': False, 'class_weig...",0.631023
2,knn,"{'algorithm': 'brute', 'leaf_size': 30, 'metri...",0.721164
3,DT,"{'ccp_alpha': 0.0, 'class_weight': None, 'crit...",0.828199
4,RF,"{'bootstrap': False, 'ccp_alpha': 0.0, 'class_...",0.906947
5,XGDT,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.965696


## saving DT,RF and XGDT models

In [85]:
import pickle

pickle.dump(best_gb,open('xgboost_96.pkl','wb'))


# # load the model from disk
# loaded_model = pickle.load(open(filename, 'rb'))

## predict with the best model


In [91]:
best_gb.fit(X_train,y_train)
y_pred=best_gb.predict(X_test)

In [92]:
output=pd.DataFrame(test_raw['Accident_ID'],columns=['Accident_ID'])


In [93]:
output['Severity']=list(output_encoder.inverse_transform(y_pred))

In [94]:
output.to_csv('Result.csv',index=False)
output

Unnamed: 0,Accident_ID,Severity
0,1,Highly_Fatal_And_Damaging
1,10,Significant_Damage_And_Fatalities
2,14,Significant_Damage_And_Serious_Injuries
3,17,Highly_Fatal_And_Damaging
4,21,Significant_Damage_And_Fatalities
...,...,...
2495,12484,Highly_Fatal_And_Damaging
2496,12487,Significant_Damage_And_Serious_Injuries
2497,12488,Significant_Damage_And_Serious_Injuries
2498,12491,Significant_Damage_And_Serious_Injuries
