Importing Dependencies

In [8]:
import joblib
import warnings
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import(
    StratifiedKFold,
    cross_validate,
    GridSearchCV
)
from sklearn.metrics import confusion_matrix
# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket "ignore" also hides any solver convergence warnings
# raised by the models fitted below — consider narrowing the filter.
warnings.filterwarnings("ignore")

Load The Data

In [9]:
def _load_npz_array(path, key='arr_0'):
    """Read a single array out of an ``.npz`` archive and close the archive.

    ``np.load`` on an ``.npz`` file returns a lazy ``NpzFile`` that keeps the
    underlying file open until it is closed. Using it as a context manager
    releases the handle as soon as the requested array has been read.

    Parameters
    ----------
    path : str
        Location of the ``.npz`` archive.
    key : str, default 'arr_0'
        Name of the array inside the archive.

    Returns
    -------
    numpy.ndarray
        The array stored under ``key``.
    """
    with np.load(path) as archive:
        return archive[key]


# Train/test arrays saved under ../artifacts, each stored under key 'arr_0'.
X_train = _load_npz_array('../artifacts/X_train.npz')
Y_train = _load_npz_array('../artifacts/Y_train.npz')
X_test = _load_npz_array('../artifacts/X_test.npz')
Y_test = _load_npz_array('../artifacts/Y_test.npz')

Define Models and Hyperparameter Grids

In [10]:
# Candidate hyperparameter values for each estimator.

# Logistic regression: only the iteration budget is varied.
lr_param_grid = dict(max_iter=[1000, 5000, 10000])

# Decision tree: depth cap and split criterion.
# NOTE(review): scikit-learn appears to document 'entropy' and 'log_loss' as
# the same criterion — confirm, and consider dropping one to save fits.
dt_param_grid = dict(
    max_depth=[5, 10, 15, 25],
    criterion=['gini', 'entropy', 'log_loss'],
)

# Random forest: fixed ensemble size, two depth caps, same criteria as above.
rf_param_grid = dict(
    n_estimators=[100],
    max_depth=[10, 15],
    criterion=['gini', 'entropy', 'log_loss'],
)

# Lookup from model display name to its grid.
param_grids = dict([
    ('Logistic Regression', lr_param_grid),
    ('Decision Tree', dt_param_grid),
    ('Random Forest', rf_param_grid),
])


In [11]:
# Base estimators to tune; keys must match the keys of `param_grids`, which
# the grid-search loop uses to look up each model's candidate grid.
# The tree-based estimators use randomness while fitting, so they are seeded
# (same value as the CV splitter's random_state) to make the reported CV
# scores and best parameters reproducible across kernel restarts.
models = {
    'Logistic Regression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(random_state=42),
    'Random Forest': RandomForestClassifier(random_state=42)
}

In [12]:
# Stratified 6-fold splitter; shuffling is seeded so that the fold assignment
# is the same on every run.
cv = StratifiedKFold(n_splits=6, shuffle=True, random_state=42)

In [13]:
# Fitted GridSearchCV objects, keyed by model display name.
grid_search_results = {}

for model_name, model in models.items():
    print(f"Cross-validating {model_name}...")

    # Pick the candidate grid registered under the same display name.
    param_grid = param_grids[model_name]

    # scoring='f1' — presumably the labels are binary; TODO confirm.
    grid_search = GridSearchCV(model, param_grid, scoring='f1', cv=cv,
                               verbose=1, return_train_score=False)

    print(f"Fitting Grid search CV on {model_name}...")

    # fit() returns the search object itself, so it can be stored directly.
    grid_search_results[model_name] = grid_search.fit(X_train, Y_train)

    # One print call, one line per message; output matches three prints.
    print(
        f"GridSearchCV for {model_name} completed.",
        f"Best parameters for {model_name}: {grid_search.best_params_}",
        f"Best CV score for {model_name}: {grid_search.best_score_}\n",
        sep="\n",
    )

Cross-validating Logistic Regression...
Fitting Grid search CV on Logistic Regression...
Fitting 6 folds for each of 3 candidates, totalling 18 fits
GridSearchCV for Logistic Regression completed.
Best parameters for Logistic Regression: {'max_iter': 1000}
Best CV score for Logistic Regression: 0.7150065795139099

Cross-validating Decision Tree...
Fitting Grid search CV on Decision Tree...
Fitting 6 folds for each of 12 candidates, totalling 72 fits
GridSearchCV for Decision Tree completed.
Best parameters for Decision Tree: {'criterion': 'entropy', 'max_depth': 25}
Best CV score for Decision Tree: 0.8412504982825381

Cross-validating Random Forest...
Fitting Grid search CV on Random Forest...
Fitting 6 folds for each of 6 candidates, totalling 36 fits
GridSearchCV for Random Forest completed.
Best parameters for Random Forest: {'criterion': 'gini', 'max_depth': 15, 'n_estimators': 100}
Best CV score for Random Forest: 0.8906487749707716

