# 4.Modeling

### 4.1.Import Libaries

In [16]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate,cross_val_score, train_test_split, GridSearchCV, learning_curve
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix,ConfusionMatrixDisplay
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,accuracy_score, recall_score, precision_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import datetime


### 4.2.Load Data( Test set and training set )

In [2]:
datafilepath = "../data/interim/train_test_split.pkl"

with open(datafilepath, 'rb') as file:
    X,y,X_train, X_test, y_train, y_test = pickle.load(file)

### Initiation

In [3]:
#Calculate the mean of `y_train`
#train_mean = y_train.mean()
#train_mean

In [4]:
#Calculate the medium of `y_train`
X_defaults_median = X_train.median()
X_defaults_median

VICT_AGE              -0.002785
latitude               0.013320
longitude             -0.019968
MTH_RPTD              -0.127860
DAY_RPTD               0.032557
YEAR_RPTD              0.123388
MTH_OCC               -0.123719
DAY_OCC               -0.050370
YEAR_OCC               0.134830
WEEKDAY_OCC_Encoded    0.026682
AREA_NAME_Encoded      0.029688
dtype: float64

##### Impute NaN Data

In [5]:
# handling NaN values
# Replace 'mean' with 'median', 'most_frequent', or a constant value if needed

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imputer.fit_transform(X_train)

In [6]:
# Check for NaN values
print("NaN values in X_test:", X_test.isna().sum().sum())

# Replace NaN values with the mean (you can choose other methods)
X_test = X_test.fillna(X_test.mean())

# Check for infinity values
print("Infinity values in X_test:", np.isinf(X_test).sum().sum())

# Replace infinity values
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(X_test.mean())

NaN values in X_test: 0
Infinity values in X_test: 0


In [7]:
# Normalize or Scale your data
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)
X_test_scaled

array([[-2.36081361e-01,  1.32988822e-01, -5.53067227e-02, ...,
        -3.73444436e-01,  5.23831269e-01,  1.18393023e+00],
       [ 8.93392949e-01, -4.83791241e-02, -3.62480539e-03, ...,
        -1.64134548e+00,  1.51770253e+00, -1.33851924e-01],
       [-1.45967853e+00,  1.14374487e-01, -3.25565177e-02, ...,
        -1.38776527e+00, -1.46391126e+00, -2.98574693e-01],
       ...,
       [-4.24327079e-01,  2.05204646e-01, -5.14962045e-02, ...,
        -8.80604855e-01,  2.68956370e-02, -1.12218854e+00],
       [ 3.75717224e-01,  2.16509687e-01, -7.46133483e-02, ...,
         1.65519724e+00,  1.51770253e+00, -1.28691131e+00],
       [ 1.36400725e+00,  1.42153173e-03, -4.17864396e-02, ...,
        -8.80604855e-01,  5.23831269e-01,  1.34865299e+00]])

### 4.3.Models

Lets try different models. 
Decision tree-based models with different criteria (Gini impurity and entropy).

#### Decision Tree Classifier (gini)

In [8]:
dt_gini_model = DecisionTreeClassifier(criterion='gini')
dt_gini_model.fit(X_train, y_train)
# Making predictions
dt_gini_pred = dt_gini_model.predict(X_test)

<b> Decision Tree Classifier (gini) :Evaluation</b></br>

In [9]:

dt_gini_ac_score =  accuracy_score(y_test, dt_gini_pred),
dt_gini_re_score =  recall_score(y_test, dt_gini_pred, average='macro'),
dt_gini_pre_score =  precision_score(y_test, dt_gini_pred, average='macro'),
dt_gini_f1_score =  f1_score(y_test, dt_gini_pred, average='macro')


# Cross-validation for hyperparameter tuning
dt_gini_cv_scores = np.mean(cross_val_score(dt_gini_model, X, y, cv=5))



print( "Decision Tree Classifier (gini) CV Score", dt_gini_cv_scores)
print( "Decision Tree Classifier (gini) Accuracy", dt_gini_ac_score)
print( "Decision Tree Classifier (gini) Recall", dt_gini_re_score)
print( "Decision Tree Classifier (gini) Precision", dt_gini_pre_score)
print( "Decision Tree Classifier (gini) F1 Score", dt_gini_f1_score)


dt_gini_cof = confusion_matrix(y_test, dt_gini_pred)
dt_gini_cof

Decision Tree Classifier (gini) CV Score 0.11685887662629477
Decision Tree Classifier (gini) Accuracy (0.2658039044859406,)
Decision Tree Classifier (gini) Recall (0.24457562664973675,)
Decision Tree Classifier (gini) Precision (0.24783766486950398,)
Decision Tree Classifier (gini) F1 Score 0.24608533315584893


array([[ 5064,  5748,  2024,  2867,  3401,  1074,  1505,  2486,  2323,
          403],
       [ 5997, 13027,  4258,  5613,  5585,  2044,  3485,  5500,  4214,
          283],
       [ 2234,  4184,  8098,  4602,  2526,  2455,  3948,  3904,  3794,
         2898],
       [ 3014,  5661,  4608,  9380,  3977,  2440,  3635,  4533,  3840,
          456],
       [ 3610,  5531,  2683,  3708,  5704,  1372,  2219,  2774,  2893,
          139],
       [ 1083,  1957,  2374,  2511,  1345,  2698,  1527,  1999,  2025,
         5838],
       [ 1878,  3537,  4182,  3878,  2486,  1774, 10395,  3703,  2857,
          238],
       [ 2733,  5640,  3997,  4386,  2834,  1979,  3466,  7426,  3318,
         1673],
       [ 2458,  4227,  3644,  3699,  2746,  1987,  2481,  3294,  3938,
         2653],
       [  590,   312,  3578,   564,   191,  6800,   292,  2015,  3104,
        30680]])

#### Decision Tree Classifier (entropy)

In [10]:
dt_entropy_model = DecisionTreeClassifier(criterion='entropy')
dt_entropy_model.fit(X_train, y_train)
# Making predictions
dt_en_pred = dt_entropy_model.predict(X_test)

<b> Decision Tree Classifier (entropy) :Evaluation</b></br>

In [11]:
dt_en_ac_score =  accuracy_score(y_test, dt_en_pred),
dt_en_score =  recall_score(y_test, dt_en_pred, average='macro'),
dt_en_pre_score =  precision_score(y_test, dt_en_pred, average='macro'),
dt_en_f1_score =  f1_score(y_test, dt_en_pred, average='macro')


# Cross-validation for hyperparameter tuning
dt_en_cv_scores = np.mean(cross_val_score(dt_entropy_model, X, y, cv=5))


# Print results

print("Decision Tree Classifier (entropy) CV Score",  dt_en_cv_scores)
print("Decision Tree Classifier (entropy) Accuracy", dt_en_ac_score)
print("Decision Tree Classifier (entropy) Recall", dt_en_score)
print("Decision Tree Classifier (entropy) Precision", dt_en_pre_score)
print("Decision Tree Classifier (entropy) F1 Score", dt_en_f1_score)


dt_en_cof = confusion_matrix(y_test, dt_en_pred)
dt_en_cof

Decision Tree Classifier (entropy) CV Score 0.11347932651615197
Decision Tree Classifier (entropy) Accuracy (0.26537381000300514,)
Decision Tree Classifier (entropy) Recall (0.24404198287694037,)
Decision Tree Classifier (entropy) Precision (0.24685026604664087,)
Decision Tree Classifier (entropy) F1 Score 0.245347358378012


array([[ 4951,  5712,  2107,  2807,  3336,  1102,  1573,  2533,  2345,
          429],
       [ 5982, 12872,  4367,  5650,  5586,  2037,  3323,  5669,  4219,
          301],
       [ 2217,  4347,  7923,  4488,  2579,  2548,  3915,  3799,  3848,
         2979],
       [ 3030,  5831,  4558,  9296,  3905,  2593,  3553,  4464,  3839,
          475],
       [ 3646,  5497,  2646,  3757,  5636,  1458,  2241,  2745,  2862,
          145],
       [ 1112,  1985,  2433,  2337,  1389,  2742,  1548,  1939,  2020,
         5852],
       [ 1822,  3713,  4067,  3973,  2522,  1790, 10398,  3603,  2767,
          273],
       [ 2659,  5834,  3866,  4425,  2798,  2049,  3403,  7410,  3253,
         1755],
       [ 2504,  4179,  3577,  3738,  2737,  1969,  2507,  3229,  4001,
         2686],
       [  537,   354,  3481,   627,   165,  6597,   288,  2038,  3014,
        31025]])

#### Random Forest

In [12]:
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)
# Making predictions
rf_pred = rf_model.predict(X_test)

<b> Random Forest:Evaluation</b></br>

In [14]:
rf_ac_score =  accuracy_score(y_test, rf_pred),
rf_score =  recall_score(y_test, rf_pred, average='macro'),
rf_pre_score =  precision_score(y_test, rf_pred, average='macro'),
rf_f1_score =  f1_score(y_test, rf_pred, average='macro')


# Cross-validation for hyperparameter tuning
rf_cv_scores = np.mean(cross_val_score(rf_model, X, y, cv=5))


# Print results
print("Random Forest CV Score", rf_cv_scores)
print("Random Forest Accuracy", rf_ac_score)
print("Random Forest Recall", rf_score)
print("Random Forest Precision", rf_pre_score)
print("Random Forest F1 Score", rf_f1_score)



rf_cof = confusion_matrix(y_test, rf_pred)
rf_cof

Random Forest CV Score 0.118987292913642
Random Forest Accuracy (0.3753622029659977,)
Random Forest Recall (0.3331109278538909,)
Random Forest Precision (0.3221226924900893,)
Random Forest F1 Score 0.31424482330881653


array([[ 5840,  8904,  1629,  2842,  3566,   176,   880,  1284,  1146,
          628],
       [ 4212, 22382,  4011,  6170,  5381,   322,  1873,  3376,  1826,
          453],
       [ 1243,  5445, 11024,  5702,  2230,   422,  3480,  2041,  1515,
         5541],
       [ 1849,  7587,  4606, 15307,  3646,   621,  2689,  2828,  1633,
          778],
       [ 3034,  8214,  2038,  4764,  7882,   294,  1413,  1446,  1352,
          196],
       [  620,  2750,  2119,  3527,  1371,   739,  1309,  1103,   767,
         9052],
       [  782,  3659,  3612,  4037,  1700,   304, 17623,  1901,   834,
          476],
       [ 1547,  8227,  3589,  5697,  2399,   403,  3768,  7159,  1209,
         3454],
       [ 1883,  6098,  3483,  4848,  3039,   413,  2179,  1900,  2347,
         4937],
       [   36,    42,   699,    53,    22,   703,    13,   312,   401,
        45845]])

### 4.4.Comparison on the models

Based on these metrics, the Random Forest model performs slightly better in terms of accuracy, recall, precision, and F1 score compared to the Decision Tree with the Gini criterion and the entropy criterion. <br/>
However,to choose the best model,not only these metrics but also factors such as computational complexity, scalability, and maintenance costs should be considered.Random Forest model typically involves more computational cost due to the ensemble of trees but might generalize better when provided with more data. On the other hand, Decision Trees are simpler and faster but may not perform as well on unseen data.

Plotting Confustion Matrix