## Hyperparamters and fixing models performance

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import  confusion_matrix, accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

In [2]:
df = pd.read_csv("../data/feature_selected.csv")
df = df.drop('Unnamed: 0', axis=1)

### Split the dataset into training (80%) and testing (20%) sets.

In [3]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1]
y = df.iloc[:, -1].astype(str)
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

### Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

param_grid_log = {
  'C': [0.01, 0.1, 1, 10],
  'penalty': ['l1', 'l2'],
  'solver': ['liblinear'],
  'class_weight': [None, 'balanced'],
  'max_iter': [100, 500]
}

log = LogisticRegression(random_state=0,class_weight='balanced')

grid_log = GridSearchCV(log, param_grid_log, cv=5, scoring='accuracy')
grid_log.fit(X_train, y_train)

# get the best model and parameters
best_log_model = grid_log.best_estimator_
best_params_log = grid_log.best_params_

# print the best hyperparameters
print('Best Hyperparameters:')
print(best_params_log)

# Train the model on the full training set
best_log_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred_log = best_log_model.predict(X_test)
accuracy_log = accuracy_score(y_test, y_pred_log)

print(f'Accuracy on Test Set: {accuracy_log:.2f}')

Best Hyperparameters:
{'C': 0.1, 'class_weight': None, 'max_iter': 100, 'penalty': 'l2', 'solver': 'liblinear'}
Accuracy on Test Set: 0.82


### Decision Tree

In [17]:
from sklearn.tree import DecisionTreeClassifier
from scipy.stats import randint

param_dist_dst = {
  'max_depth': [3, 5, 10, None],
  'min_samples_split': randint(2, 20),
  'min_samples_leaf': randint(1, 10),
  'criterion': ['gini', 'entropy'],
  'max_features': [None, 'sqrt', 'log2']
}

dst = DecisionTreeClassifier()

random_search_dst = RandomizedSearchCV(dst, param_distributions=param_dist_dst, n_iter=50, cv=5, random_state=42)
random_search_dst.fit(X_train, y_train)

print("Best cross-validated score:", random_search_dst.best_score_)

# get the best model and parameters
best_dst_model = random_search_dst.best_estimator_
best_params_dst = random_search_dst.best_params_

# print the best hyperparameters
print('Best Hyperparameters:')
print(best_params_dst)

# Train the model on the full training set
best_dst_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred_dst = best_dst_model.predict(X_test)
accuracy_dst = accuracy_score(y_test, y_pred_dst)

print(f'Accuracy on Test Set: {accuracy_dst:.2f}')

Best cross-validated score: 0.8086580086580086
Best Hyperparameters:
{'criterion': 'gini', 'max_depth': 3, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 3}
Accuracy on Test Set: 0.80


### Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(random_state=0,class_weight='balanced')

# define hyperparameters for tuning
param_dist_rf = {
  'n_estimators': randint(50, 200),
  'max_depth': [None, 10, 20, 40],
  'min_samples_split': randint(2, 10),
  'min_samples_leaf': randint(1, 10),
  'max_features': ['auto', 'sqrt', 'log2'],
  'bootstrap': [True, False],
  'criterion': ['gini', 'entropy']
}

grid_search_rf = RandomizedSearchCV(rf_model, param_distributions=param_dist_rf, n_iter=50, cv=5, random_state=42)
grid_search_rf.fit(X_train, y_train)

# get the best model and parameters
best_rf_model = grid_search_rf.best_estimator_
best_params_rf = grid_search_rf.best_params_

# print the best hyperparameters
print('Best Hyperparameters:')
print(best_params_rf)

# Train the model on the full training set
best_rf_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred_rf = best_rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

print(f'Accuracy on Test Set: {accuracy_rf:.2f}')

55 fits failed out of a total of 250.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
55 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\yahia\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\yahia\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
  File "c:\Users\yahia\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\base.py", line 436, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\yahia\AppData\Local\Programs\Python\Python311\Lib

Best Hyperparameters:
{'bootstrap': True, 'criterion': 'entropy', 'max_depth': 40, 'max_features': 'log2', 'min_samples_leaf': 3, 'min_samples_split': 5, 'n_estimators': 104}
Accuracy on Test Set: 0.84


### Support Vector Machine (SVM)

In [19]:
from sklearn.svm import SVC

param_grid_svm = {
  'C': [0.1, 1, 10, 100],
  'kernel': ['linear', 'rbf', 'poly'],
  'gamma': ['scale', 'auto', 0.01, 0.1, 1],
  'degree': [2, 3, 4],  # only used if kernel='poly'
  'class_weight': [None, 'balanced']
}

svc = SVC()

grid_search_svm = GridSearchCV(svc, param_grid_svm, cv=5, verbose=1)
grid_search_svm.fit(X_train, y_train)

print("Best parameters:", grid_search_svm.best_params_)
print("Best cross-validation score:", grid_search_svm.best_score_)

# get the best model and parameters
best_svm_model = grid_search_svm.best_estimator_
best_params_svm = grid_search_svm.best_params_

# print the best hyperparameters
print('Best Hyperparameters:')
print(best_params_svm)

# Train the model on the full training set
best_svm_model.fit(X_train, y_train)

# Evaluate the model on the test set
y_pred_svm = best_svm_model.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)

print(f'Accuracy on Test Set: {accuracy_svm:.2f}')

Fitting 5 folds for each of 360 candidates, totalling 1800 fits
Best parameters: {'C': 0.1, 'class_weight': None, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
Best cross-validation score: 0.8267372977899292
Best Hyperparameters:
{'C': 0.1, 'class_weight': None, 'degree': 2, 'gamma': 0.1, 'kernel': 'rbf'}
Accuracy on Test Set: 0.84


In [22]:
# save best resulting models
import joblib

joblib.dump(best_svm_model, '../models/svm_best.pkl')
joblib.dump(best_dst_model, '../models/dst_best.pkl')
joblib.dump(best_log_model, '../models/log_best.pkl')
joblib.dump(best_rf_model, '../models/rf_best.pkl')

['../models/rf_best.pkl']