# 4.Modeling

### Import Libaries

In [19]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
from sklearn.model_selection import cross_validate,cross_val_score, train_test_split, GridSearchCV, learning_curve
from sklearn.decomposition import PCA
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,accuracy_score, recall_score, precision_score, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
import datetime


### Load Data( Test set and training set )

In [2]:
datafilepath = "../data/interim/train_test_split.pkl"

with open(datafilepath, 'rb') as file:
    X,y,X_train, X_test, y_train, y_test = pickle.load(file)

### Initiation

In [3]:
#Calculate the mean of `y_train`
#train_mean = y_train.mean()
#train_mean

In [4]:
#Calculate the medium of `y_train`
X_defaults_median = X_train.median()
X_defaults_median

VICT_AGE              -0.002785
latitude               0.013320
longitude             -0.019968
MTH_RPTD              -0.127860
DAY_RPTD               0.032557
YEAR_RPTD              0.123388
MTH_OCC               -0.123719
DAY_OCC               -0.050370
YEAR_OCC               0.134830
WEEKDAY_OCC_Encoded    0.026682
AREA_NAME_Encoded      0.029688
dtype: float64

##### Impute NaN Data

In [5]:
# handling NaN values
# Replace 'mean' with 'median', 'most_frequent', or a constant value if needed

imputer = SimpleImputer(missing_values=np.nan, strategy='mean')
X_train = imputer.fit_transform(X_train)

In [6]:
# Check for NaN values
print("NaN values in X_test:", X_test.isna().sum().sum())

# Replace NaN values with the mean (you can choose other methods)
X_test = X_test.fillna(X_test.mean())

# Check for infinity values
print("Infinity values in X_test:", np.isinf(X_test).sum().sum())

# Replace infinity values
X_test = X_test.replace([np.inf, -np.inf], np.nan).fillna(X_test.mean())

NaN values in X_test: 0
Infinity values in X_test: 0


In [7]:
# Normalize or Scale your data
scaler = StandardScaler()
X_test_scaled = scaler.fit_transform(X_test)
X_test_scaled

array([[-2.36081361e-01,  1.32988822e-01, -5.53067227e-02, ...,
        -3.73444436e-01,  5.23831269e-01,  1.18393023e+00],
       [ 8.93392949e-01, -4.83791241e-02, -3.62480539e-03, ...,
        -1.64134548e+00,  1.51770253e+00, -1.33851924e-01],
       [-1.45967853e+00,  1.14374487e-01, -3.25565177e-02, ...,
        -1.38776527e+00, -1.46391126e+00, -2.98574693e-01],
       ...,
       [-4.24327079e-01,  2.05204646e-01, -5.14962045e-02, ...,
        -8.80604855e-01,  2.68956370e-02, -1.12218854e+00],
       [ 3.75717224e-01,  2.16509687e-01, -7.46133483e-02, ...,
         1.65519724e+00,  1.51770253e+00, -1.28691131e+00],
       [ 1.36400725e+00,  1.42153173e-03, -4.17864396e-02, ...,
        -8.80604855e-01,  5.23831269e-01,  1.34865299e+00]])

### Models

Lets try different models. 
Decision tree-based models with different criteria (Gini impurity and entropy).

#### Decision Tree Classifier (gini)

In [17]:
dt_gini_model = DecisionTreeClassifier(criterion='gini')
dt_gini_model.fit(X_train, y_train)
# Making predictions
dt_gini_pred = dt_gini_model.predict(X_test)

In [None]:
<b> Decision Tree Classifier (gini) :Evaluation</b></br>

In [22]:

dt_gini_ac_score =  accuracy_score(y_test, dt_gini_pred),
dt_gini_re_score =  recall_score(y_test, dt_gini_pred, average='macro'),
dt_gini_pre_score =  precision_score(y_test, dt_gini_pred, average='macro'),
dt_gini_f1_score =  f1_score(y_test, dt_gini_pred, average='macro')


# Cross-validation for hyperparameter tuning
dt_gini_cv_scores = np.mean(cross_val_score(dt_gini_model, X, y, cv=5))


# Print results
gini_results = {
        "Decision Tree Classifier (gini) CV Score": dt_gini_cv_scores,
        "Decision Tree Classifier (gini) Accuracy": dt_gini_ac_score,
        "Decision Tree Classifier (gini) Recall": dt_gini_re_score,
        "Decision Tree Classifier (gini) Precision": dt_gini_pre_score,
        "Decision Tree Classifier (gini) F1 Score": dt_gini_f1_score,
        

    }
print (gini_results)

dt_gini_cof = confusion_matrix(y_test, dt_gini_pred)
dt_gini_cof

{'Decision Tree Classifier (gini) CV Score': 0.1168103531461687, 'Decision Tree Classifier (gini) Accuracy': (0.26660343910165396,), 'Decision Tree Classifier (gini) Recall': (0.24532717317209016,), 'Decision Tree Classifier (gini) Precision': (0.2485361067957766,), 'Decision Tree Classifier (gini) F1 Score': 0.24681981619502968, 'Decision Tree Classifier (gini) confusion matrix': array([[ 5038,  5731,  2020,  2913,  3362,  1056,  1569,  2497,  2309,
          400],
       [ 5904, 13032,  4301,  5621,  5507,  2013,  3425,  5645,  4285,
          273],
       [ 2255,  4180,  8054,  4608,  2531,  2476,  3943,  3867,  3824,
         2905],
       [ 2977,  5696,  4604,  9415,  3901,  2401,  3672,  4529,  3880,
          469],
       [ 3599,  5555,  2636,  3740,  5785,  1397,  2172,  2776,  2836,
          137],
       [ 1104,  1952,  2369,  2465,  1359,  2697,  1542,  2017,  2000,
         5852],
       [ 1879,  3583,  4062,  3858,  2465,  1798, 10497,  3690,  2852,
          244],
       

#### Decision Tree Classifier (entropy)

In [None]:
dt_entropy_model = DecisionTreeClassifier(criterion='entropy')
dt_entropy_model.fit(X_train, y_train)
# Making predictions
dt_en_pred = dt_entropy_model.predict(X_test)

<b> Decision Tree Classifier (entropy) :Evaluation</b></br>

In [None]:
dt_en_ac_score =  accuracy_score(y_test, dt_en_pred),
dt_en_score =  recall_score(y_test, dt_en_pred, average='macro'),
dt_en_pre_score =  precision_score(y_test, dt_en_pred, average='macro'),
dt_en_f1_score =  f1_score(y_test, dt_en_pred, average='macro')


# Cross-validation for hyperparameter tuning
dt_en_cv_scores = np.mean(cross_val_score(dt_entropy_model, X, y, cv=5))


# Print results
en_results = {
        "Decision Tree Classifier (entropy) CV Score": dt_en_cv_scores,
        "Decision Tree Classifier (entropy) Accuracy": dt_en_ac_score,
        "Decision Tree Classifier (entropy) Recall": dt_en_score,
        "Decision Tree Classifier (entropy) Precision": dt_en_pre_score,
        "Decision Tree Classifier (entropy) F1 Score": dt_en_f1_score,

    

    }
print (en_results)

dt_en_cof = confusion_matrix(y_test, dt_en_pred)
dt_en_cof

#### Random Forest

In [None]:
rf_model = RandomForestClassifier(n_estimators=100)
rf_model.fit(X_train, y_train)
# Making predictions
rf_pred = rf_model.predict(X_test)

In [None]:
<b> Random Forest:Evaluation</b></br>

In [None]:
rf_ac_score =  accuracy_score(y_test, rf_pred),
rf_score =  recall_score(y_test, rf_pred, average='macro'),
rf_pre_score =  precision_score(y_test, rf_pred, average='macro'),
rf_f1_score =  f1_score(y_test, rf_pred, average='macro')


# Cross-validation for hyperparameter tuning
rf_cv_scores = np.mean(cross_val_score(dt_en, X, y, cv=5))


# Print results
rf_results = {
        "Random Forest CV Score": rf_cv_scores,
        "Random Forest Accuracy": rf_ac_score,
        "Random Forest Recall": rf_score,
        "Random Forest Precision": rf_pre_score,
        "Random Forest F1 Score": rf_f1_score,


    }
print (rf_results)


rf_cof = confusion_matrix(y_test, rf_pred)
rf_cof