In [1]:
import pandas as pd
import numpy as np

In [2]:
modeldata = pd.read_csv("model_Data.csv")

In [3]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [4]:
modeldata['Traffic_Signal'] = modeldata["Traffic_Signal"].apply(lambda x: True if x else False)
modeldata['Station'] = modeldata["Station"].apply(lambda x: True if x else False)
modeldata['Junction'] = modeldata["Junction"].apply(lambda x: True if x else False)
modeldata['Crossing'] = modeldata["Crossing"].apply(lambda x: True if x else False)
modeldata['Sunrise_Sunset'] = modeldata["Sunrise_Sunset"].apply(lambda x: True if x == 'Day' else False)
weather_categories = pd.get_dummies(modeldata['weather_keyword'],drop_first=True)
modeldata.drop(['weather_keyword'],axis=1,inplace=True)
modeldata = pd.concat([modeldata,weather_categories],axis=1)



In [5]:
modeldata.columns

Index(['Start_Lat', 'Start_Lng', 'Cluster', 'Weekday', 'Hour', 'Date', 'Month',
       'Sunrise_Sunset', 'Traffic_Signal', 'Station', 'Junction', 'Crossing',
       'Temperature(F)', 'Wind_Speed(mph)', 'Wind_Direction', 'Pressure(in)',
       'Visibility(mi)', 'Humidity(%)', 'Accident', 'cloudy', 'fog',
       'overcast', 'rain', 'snow', 'thunderstorm', 'wind'],
      dtype='object')

In [6]:
y = modeldata['Accident']
#X = modeldata.drop(labels=['Wind_Direction'], axis = 1, inplace=True)
#y = modeldata['Accident']
modeldata.drop(labels=['Wind_Direction','Accident'], axis = 1, inplace=True)
X = modeldata


In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)

In [10]:
rfc = RandomForestClassifier(n_estimators=300)
rfc.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [11]:
pred = rfc.predict(X_test)

In [18]:
rfc.n_estimators

300

In [17]:

rfc.

24

In [16]:
from sklearn.metrics import classification_report,confusion_matrix

In [65]:
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

[[32427  4460]
 [ 6242 18316]]
              precision    recall  f1-score   support

           0       0.84      0.88      0.86     36887
           1       0.80      0.75      0.77     24558

    accuracy                           0.83     61445
   macro avg       0.82      0.81      0.82     61445
weighted avg       0.82      0.83      0.82     61445



In [9]:
from sklearn.model_selection import GridSearchCV

In [11]:
# tune models GridsearchCV 

parameters = {
    'n_estimators'      : [250,300,350],
    'max_depth'         : [25, 30, 35],
    'random_state'      : [0],
    #'max_features': ['auto'],
    #'criterion' :['mae']
}


gs = GridSearchCV(RandomForestClassifier(),parameters,scoring='neg_mean_absolute_error',cv=3)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [18]:
gs.best_estimator_.fit(X_train,y_train)
pred = gs.predict(X_test)

In [19]:
print(confusion_matrix(y_test,pred))
print(classification_report(y_test,pred))

[[32185  4651]
 [ 5933 18676]]
              precision    recall  f1-score   support

           0       0.84      0.87      0.86     36836
           1       0.80      0.76      0.78     24609

    accuracy                           0.83     61445
   macro avg       0.82      0.82      0.82     61445
weighted avg       0.83      0.83      0.83     61445



In [60]:
from sklearn.model_selection import validation_curve
train_scoreNum, test_scoreNum = validation_curve(
                                RandomForestClassifier(),
                                X = X_train, y = y_train, 
                                param_name = 'n_estimators', 
                                param_range = [100,200,300,400,500], cv = 3)

In [62]:
test_scoreNum

array([[0.80421   , 0.80446109, 0.80060265],
       [0.80902262, 0.8115754 , 0.80719412],
       [0.81224498, 0.81310289, 0.80968424],
       [0.81435835, 0.81203574, 0.80922388],
       [0.81406541, 0.81488146, 0.80932851]])

In [22]:
train_scoreNum, test_scoreNum = validation_curve(
                                RandomForestClassifier(),
                                X = X_train, y = y_train, 
                                param_name = 'max_depth', 
                                param_range = [35,40,45,50], cv = 3)



In [23]:
test_scoreNum

array([[0.73775397, 0.7255702 , 0.73785311],
       [0.74714904, 0.75379787, 0.74942457],
       [0.73277395, 0.72266164, 0.73216154],
       [0.74032768, 0.74023854, 0.74308433]])

In [8]:
import pickle
# Creating a pickle file for the classifier
filename = 'accident-prediction-rfc-model.pkl'
pickle.dump(gs.best_estimator_, open(filename, 'wb'))

NameError: name 'gs' is not defined

In [11]:
X_test.iloc[1].values

array([27.872295, -82.745537, 815, 3, 7, 2, 8, False, True, False, False,
       False, 62.1, 4.6, 30.09, 10.0, 93.0, 0, 0, 0, 0, 0, 0, 0],
      dtype=object)

In [13]:
y_test.iloc[1]

0