# Models training

Import the libraries needed for the model-training notebook.

In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.metrics import confusion_matrix
import sklearn as sk
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
import pickle

## Data collection

Load the data from the file `dataset_final.csv` and split the dataset into two parts: a training set (70%) and a testing set (30%).

In [2]:
# Load the final dataset (features plus the TARGET label column) from the Data folder
df_trimed = pd.read_csv(filepath_or_buffer="../Data/dataset_final.csv")

In [3]:
# Hold out 30% of the rows for evaluation. Every column except TARGET is a
# feature; TARGET is the label. The fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    df_trimed.drop(columns='TARGET'),
    df_trimed['TARGET'],
    test_size=0.3,
    random_state=50,
)

## XGBoost model training, confusion matrix and classification reports

Create an XGBoost classifier, train it on the training dataset, and make predictions on the testing dataset

In [4]:
# Binary classifier for TARGET. 'binary:logistic' is XGBoost's objective for
# two-class classification; the earlier 'reg:logistic' is the regression-named
# logistic objective and is not the intended setting for an XGBClassifier.
# NOTE(review): the recorded reports show the positive class is under 10% of
# the test labels and is almost never predicted -- consider scale_pos_weight
# or a tuned decision threshold; left unchanged here.
xg_reg = xgb.XGBClassifier(objective='binary:logistic',
                           use_label_encoder=False)

# Fit on the training split only
xg_reg.fit(x_train, y_train)

# Class labels predicted for the held-out test split
predictionsXG = xg_reg.predict(x_test)

In [5]:
# Rows are the true classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=predictionsXG)

array([[69098,    46],
       [ 6481,    17]], dtype=int64)

In [6]:
# Per-class precision / recall / F1 for the XGBoost predictions on the test split
print(classification_report(y_true=y_test, y_pred=predictionsXG))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95     69144
           1       0.27      0.00      0.01      6498

    accuracy                           0.91     75642
   macro avg       0.59      0.50      0.48     75642
weighted avg       0.86      0.91      0.87     75642



## Gradient Boosting model training, confusion matrix and classification reports

Create a Gradient Boosting classifier, train it on the training dataset, and make predictions on the testing dataset

In [7]:
# Gradient Boosting classifier with default hyper-parameters. The seed is
# pinned (same value as the train/test split) so that re-running the notebook
# from a fresh kernel reproduces the same fitted model.
# NOTE(review): the recorded confusion matrix shows the positive class is
# essentially never predicted -- the class imbalance is not addressed here.
GradientBoosting = GradientBoostingClassifier(random_state=50)

# Fit on the training split only
GradientBoosting.fit(x_train, y_train)

# Class labels predicted for the held-out test split
predictionsGB = GradientBoosting.predict(x_test)

In [8]:
# Rows are the true classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=predictionsGB)

array([[69143,     1],
       [ 6498,     0]], dtype=int64)

In [9]:
# Per-class precision / recall / F1 for the Gradient Boosting predictions on the test split
print(classification_report(y_true=y_test, y_pred=predictionsGB))

              precision    recall  f1-score   support

           0       0.91      1.00      0.96     69144
           1       0.00      0.00      0.00      6498

    accuracy                           0.91     75642
   macro avg       0.46      0.50      0.48     75642
weighted avg       0.84      0.91      0.87     75642



## Random Forest model training, confusion matrix and classification reports

Create a Random Forest classifier, train it on the training dataset, and make predictions on the testing dataset

In [10]:
# Random Forest classifier with default hyper-parameters. Bootstrapping and
# feature sampling are random, so the seed is pinned (same value as the
# train/test split) to make the fitted model reproducible across re-runs.
# NOTE(review): the recorded confusion matrix shows only a handful of positive
# predictions -- consider class_weight='balanced' to address the imbalance;
# left unchanged here.
RandomForest = RandomForestClassifier(random_state=50)

# Fit on the training split only
RandomForest.fit(x_train, y_train)

# Class labels predicted for the held-out test split
predictionsRF = RandomForest.predict(x_test)

In [11]:
# Rows are the true classes, columns the predicted classes
confusion_matrix(y_true=y_test, y_pred=predictionsRF)

array([[69144,     0],
       [ 6495,     3]], dtype=int64)

In [12]:
# Per-class precision / recall / F1 for the Random Forest predictions on the test split
print(classification_report(y_true=y_test, y_pred=predictionsRF))

              precision    recall  f1-score   support

           0       0.91      1.00      0.96     69144
           1       1.00      0.00      0.00      6498

    accuracy                           0.91     75642
   macro avg       0.96      0.50      0.48     75642
weighted avg       0.92      0.91      0.87     75642



## Saving models

Use the `pickle` library to save each trained model in `.pkl` format

In [14]:
# Serialize each fitted estimator to its own pickle file under ../Model/,
# in the same order as they were trained above.
for model_path, fitted_model in (
    ('../Model/XGBoost.pkl', xg_reg),
    ('../Model/GradientBoosting.pkl', GradientBoosting),
    ('../Model/RandomForest.pkl', RandomForest),
):
    with open(model_path, 'wb') as file:
        pickle.dump(fitted_model, file)