In [1]:
import pyodbc
import pandas as pd
import numpy as np
import matplotlib as mpl, matplotlib.pyplot as plt
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE
import warnings
warnings.filterwarnings('ignore')

In [2]:
DWConnect = pyodbc.connect('Driver={SQL Server};'
                      'Server=SAMET;'
                      'Database=Projet3;'
                      'Trusted_Connection=yes;')
cursor_Dim = DWConnect.cursor()

In [3]:
SQL_Query = pd.read_sql_query('''select * FROM [dbo].[DimConditions]''', DWConnect)
DimConditions = pd.DataFrame(SQL_Query)
SQL_Query = pd.read_sql_query('''select * FROM [dbo].[DimTypeAccidentSeverity]''', DWConnect)
DimTypeAccidentSeverity = pd.DataFrame(SQL_Query)
SQL_Query = pd.read_sql_query('''select * FROM [dbo].[FactAccident]''', DWConnect)
FactAccident = pd.DataFrame(SQL_Query)

In [4]:
def decomposefact(fact,liste_1,liste_2):
    for i,j in enumerate(liste_1):
        fact= pd.merge(fact,j,on=liste_2[i])
    return fact

In [5]:
listetable=[DimConditions,DimTypeAccidentSeverity]
listeid=['code_condition','code_severity']

In [6]:
Accidents=decomposefact(FactAccident,listetable,listeid)

In [7]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [8]:
Accidents_class = Accidents
Accidents_class=Accidents_class[[ 'Light_Conditions'
      ,'Weather_Conditions'
      ,'Road_Surface_Conditions'
      ,'Special_Conditions_at_Site','Number_of_Vehicles','Accident_Severity']]
le = LabelEncoder()

for i in Accidents_class.columns:
    if i != 'Accident_Severity':
        Accidents_class[i] = le.fit_transform(Accidents_class[i])

In [9]:
Accidents_class

Unnamed: 0,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Number_of_Vehicles,Accident_Severity
0,4,6,5,4,0,2
1,4,6,5,4,0,2
2,4,6,5,4,0,2
3,4,6,5,4,0,2
4,4,6,5,4,1,2
...,...,...,...,...,...,...
1504044,0,9,1,7,0,1
1504045,4,6,2,6,0,1
1504046,3,2,2,4,2,1
1504047,0,8,4,6,1,1


In [10]:
#train_frame=Accidents_class
#test_frame=Accidents_class
#X = train_frame.drop("Accident_Severity",axis=1)
#over_X_test = test_frame.drop("Accident_Severity",axis=1)
#y = train_frame["Accident_Severity"]
#over_y_test = test_frame["Accident_Severity"]

In [11]:
X=pd.DataFrame(Accidents_class.iloc[:,:-1])

In [12]:
X


Unnamed: 0,Light_Conditions,Weather_Conditions,Road_Surface_Conditions,Special_Conditions_at_Site,Number_of_Vehicles
0,4,6,5,4,0
1,4,6,5,4,0
2,4,6,5,4,0
3,4,6,5,4,0
4,4,6,5,4,1
...,...,...,...,...,...
1504044,0,9,1,7,0
1504045,4,6,2,6,0
1504046,3,2,2,4,2
1504047,0,8,4,6,1


In [13]:
y=pd.DataFrame(Accidents_class.iloc[:,-1])

In [14]:
y

Unnamed: 0,Accident_Severity
0,2
1,2
2,2
3,2
4,2
...,...
1504044,1
1504045,1
1504046,1
1504047,1


In [15]:
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split



# Oversample the minority class using SMOTE
smote = SMOTE()
X_oversampled, y_oversampled = smote.fit_resample(X, y)

# Split the oversampled dataset into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_oversampled, y_oversampled, test_size=0.2)

# Train a Random Forest model on the training set
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Use the model to make predictions on the test set
predictions = model.predict(X_test)

In [16]:
accuracy_score(y_test, predictions)

0.4461717130685665

In [28]:
from collections import Counter

import numpy as np

from sklearn.metrics import precision_score, recall_score, classification_report, roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=np.random.randint(100)) # 70% training and 30% test


In [23]:
#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)

In [24]:
#Import scikit-learn metrics module for accuracy calculation
from sklearn import metrics
# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8500482031847345


In [34]:
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline
smote = SMOTE()
model = RandomForestClassifier()

pipeline = make_pipeline(smote, model)

In [35]:
data_x=pd.DataFrame(Accidents_class.iloc[:,:-1])
data_y=pd.DataFrame(Accidents_class.iloc[:,-1])

In [38]:
from imblearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
train_x, test_x, train_y, test_y = train_test_split(
    data_x, data_y, test_size=0.3, shuffle=True
)

# HYPERPARAMETER TUNING

pipeline = Pipeline([("smote", SMOTE()), ("rf", RandomForestClassifier())])

grid = {
    "rf__n_estimators": [100],
}

kf = StratifiedKFold(n_splits=5)

# Just applies smote to the k-1 training folds, and not to the validation fold
search = RandomizedSearchCV(
    pipeline, grid, scoring="f1", n_iter=1, n_jobs=-1, cv=kf
).fit(train_x, train_y)

best_score = search.best_score_
best_params = {
    key.replace("rf__", ""): value for key, value in search.best_params_.items()
}

print(f"Best Tuning F1 Score: {best_score}")
print(f"Best Tuning Params:   {best_params}")

# EVALUTING BEST MODEL ON TEST SET

best_model = RandomForestClassifier(**best_params).fit(train_x, train_y)

accuracy = best_model.score(test_x, test_y)

test_pred = best_model.predict(test_x)
tn, fp, fn, tp = confusion_matrix(test_y, test_pred).ravel()
conf_mat = pd.DataFrame(
    {"Model (0)": [tn, fn], "Model (1)": [fp, tp]}, index=["Actual (0)", "Actual (1)"],
)

classif_report = classification_report(test_y, test_pred)

feature_importance = pd.DataFrame(
    {"feature": list(train_x.columns), "importance": best_model.feature_importances_}
).sort_values("importance", ascending=False)

print(f"Accuracy: {round(accuracy * 100, 2)}%")
print("")

print(conf_mat)
print("")

print(classif_report)
print("")

pd.set_option("display.max_rows", len(feature_importance))
print(feature_importance)
pd.reset_option("display.max_rows")

Best Tuning F1 Score: nan
Best Tuning Params:   {'n_estimators': 100}


ValueError: too many values to unpack (expected 4)

In [36]:
print(train_x.shape)
print(train_y.shape)

(2688072, 5)
(2688072,)


In [33]:
random_forest_classifier = RandomForestClassifier(random_state=42)
random_forest_classifier.fit(train_x,train_y)
# Testing the model
y_pred=random_forest_classifier.predict(X_test)


In [34]:
# Model evaluvation
accuracy_score(y_test, y_pred)

0.5710246142246695

In [13]:
from joblib import  dump

In [14]:
dump(random_forest_classifier,'./../savedModeles/model.joblib')

FileNotFoundError: [Errno 2] No such file or directory: './../savedModeles/model.joblib'