# Modeling

In this notebook, we will be testing and evaluating models.

***

# Initialization

Importing libraries and notebooks.

In [None]:
# Setting PYTHONHASHSEED
import os

pyhashseed1 = os.environ.get('PYTHONHASHSEED')
os.environ['PYTHONHASHSEED'] = '0'
pyhashseed2 = os.environ.get('PYTHONHASHSEED')

# NOTEBOOK EXCLUSIVE CODE 
if __name__ == "__main__":
    print('Make sure the following says \'None\': ', pyhashseed1)
    print('Make sure the following says \'0\': ', pyhashseed2)

Make sure the following says 'None':  None
Make sure the following says '0':  0


In [2]:
# Importing libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

from IPython.display import display
import copy

# Setting seed
np.random.seed(42)

In [3]:
from pathlib import Path

IMAGES_PATH = Path() / 'plots'

def save_fig(fig_name, tight_layout=True, fig_extension='png', resolution=300):
    '''Saves an image to the plots folder with the specified name.'''
    path = IMAGES_PATH / f'{fig_name}.{fig_extension}'
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format=fig_extension, dpi=resolution)

In [None]:
# Importing data: label encoded
tree_folds = list()
folds_path = Path('data/folds')

# Iterate over each fold directory
for fold in sorted(folds_path.iterdir()):  # Ensure directories are processed in sorted order
    if fold.is_dir():
        # Load data from each fold's files
        X_train = pd.read_csv(fold / 'X_train.csv')
        X_test = pd.read_csv(fold / 'X_test.csv')
        y_train = pd.read_csv(fold / 'y_train.csv')
        y_test = pd.read_csv(fold / 'y_test.csv')

        # Append the data as a list
        tree_folds.append([X_train, X_test, y_train, y_test])

In [10]:
# One-hot encoded + scaled
folds = list()
folds_path = Path('data/folds2')

# Iterate over each fold directory
for fold in sorted(folds_path.iterdir()):  # Ensure directories are processed in sorted order
    if fold.is_dir():
        # Load data from each fold's files
        X_train = pd.read_csv(fold / 'X_train.csv')
        X_test = pd.read_csv(fold / 'X_test.csv')
        y_train = pd.read_csv(fold / 'y_train.csv')
        y_test = pd.read_csv(fold / 'y_test.csv')

        # Append the data as a list
        folds.append([X_train, X_test, y_train, y_test])

***

# Feature reduction

Before we implement our models, we will drop the columns that are not the ones we selected in B_preprocessing.

In [27]:
def expanded_columns(columns, compare_columns):
    '''Find the column names that contain the names in columns.'''
    
    expanded_columns = []
    
    for col in columns:
        for column in compare_columns:
            if col in column:
                expanded_columns.append(column)
    
    return expanded_columns

In [None]:
def KFoldSimplifier(folds, drop_columns):
    '''Remove the specified columns in "drop_columns" for each of the folds in "folds".'''
    for fold in folds:
        X_train, X_test, y_train, y_test = fold
        
        # Drop the columns
        X_train.drop(columns=drop_columns, inplace=True)
        
        X_test_cols = list(X_test.columns)
        drop_test_columns = list(set(drop_columns).intersection(set(X_test_cols)))
        X_test.drop(columns=drop_test_columns, inplace=True)

In [28]:
# Keeping only features we found in B_preprocessing
folds2 = copy.deepcopy(folds)
tree_folds2 = copy.deepcopy(tree_folds)

drop_cols = ['gender', 'browsing_history']
folds_cols = folds[0][0].columns
tree_folds_cols = tree_folds[0][0].columns

drop_cols_folds = expanded_columns(drop_cols, folds_cols)
drop_cols_trees = expanded_columns(drop_cols, tree_folds_cols)

KFoldSimplifier(folds, drop_cols_folds)
KFoldSimplifier(tree_folds, drop_cols_trees)

In [55]:
# Check that the correct columns were dropped
display(folds[0][0].head())
display(tree_folds[0][0].head())

Unnamed: 0,age,time_of_day_en,device_type_Desktop,device_type_Mobile,device_type_Tablet,ad_position_Bottom,ad_position_Side,ad_position_Top
0,0.086957,1,1,0,0,0,0,1
1,0.5,1,1,0,0,0,0,1
2,0.5,2,0,1,0,0,1,0
3,0.347826,3,0,1,0,0,0,1
4,0.456522,0,0,1,0,1,0,0


Unnamed: 0,age,time_of_day_en,device_type_en,ad_position_en
0,22,1,0,2
1,41,1,0,2
2,41,2,1,1
3,34,3,1,2
4,39,0,1,0


***

# Shortlisting models

We will try out a couple different types of models to shortlist the best one.

In [57]:
# Importing metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, log_loss

In [58]:
# Initializing evaluation metrics

def evalmetrics():
    '''Return a dict of evaluation metric and a corresponding list.'''
    metrics = dict()
    metrics['accuracy_scores'] = []
    metrics['precision_scores'] = []
    metrics['recall_scores'] = []
    metrics['f1_scores'] = []
    metrics['roc_auc_scores'] = []
    metrics['log_loss'] = []
    
    return metrics

In [59]:
# Train and evaluate model

def KfoldTrainEvaluate(folds, input_model, metrics):
    '''Train and evaluate model on folds. Store metrics in 'metrics'.'''
    
    for fold in folds:
        X_train, X_test, y_train, y_test = fold
        y_train = y_train.iloc[:, 0]
        y_test = y_test.iloc[:, 0]
        
        model = input_model
        model.fit(X_train, y_train)
        
        y_pred = model.predict(X_test)
        y_prob = model.predict_proba(X_test)[:, 1]
        
        metrics['accuracy_scores'].append(accuracy_score(y_test, y_pred))
        metrics['precision_scores'].append(precision_score(y_test, y_pred))
        metrics['recall_scores'].append(recall_score(y_test, y_pred))
        metrics['f1_scores'].append(f1_score(y_test, y_pred))
        metrics['roc_auc_scores'].append(roc_auc_score(y_test, y_prob))
        metrics['log_loss'].append(log_loss(y_test, y_prob))

In [60]:
# Print metric results

def printmetrics(metrics, model_name):
    '''Print metrics in an organized output.'''
    
    print(f'Metrics for {model_name}')
    for metric, values in metrics.items():
        metric_array = np.array(values)
        metric_mean = np.mean(metric_array)
        
        print(f'Mean {metric}: {metric_mean:.2f}')

## Logistic regression

In [61]:
from sklearn.linear_model import LogisticRegression

# Initializing evaluation metrics
log_metrics = evalmetrics()

log_regression = LogisticRegression()

KfoldTrainEvaluate(folds, log_regression, log_metrics)

In [62]:
printmetrics(log_metrics, 'Logistic Regression')

Metrics for Logistic Regression
Mean accuracy_scores: 0.51
Mean precision_scores: 0.52
Mean recall_scores: 0.55
Mean f1_scores: 0.53
Mean roc_auc_scores: 0.52
Mean log_loss: 0.70


## Support vector machines (SVM)

In [63]:
from sklearn.svm import SVC

# Initializing evaluation metrics
svm_metrics = evalmetrics()

svm = SVC(kernel='rbf', probability=True)

KfoldTrainEvaluate(folds, svm, svm_metrics)

In [64]:
printmetrics(svm_metrics, 'Support Vector Machines')

Metrics for Support Vector Machines
Mean accuracy_scores: 0.49
Mean precision_scores: 0.50
Mean recall_scores: 0.56
Mean f1_scores: 0.53
Mean roc_auc_scores: 0.50
Mean log_loss: 0.70


## Decision trees

In [68]:
from sklearn.tree import DecisionTreeClassifier

# Initializing evaluation metrics
decision_metrics = evalmetrics()

decisiontree = DecisionTreeClassifier()

KfoldTrainEvaluate(tree_folds, decisiontree, decision_metrics)

In [69]:
printmetrics(decision_metrics, 'Decision Trees')

Metrics for Decision Trees
Mean accuracy_scores: 0.52
Mean precision_scores: 0.54
Mean recall_scores: 0.43
Mean f1_scores: 0.48
Mean roc_auc_scores: 0.53
Mean log_loss: 5.95


## K-NN

In [70]:
from sklearn.neighbors import KNeighborsClassifier

# Initializing evaluation metrics
knn_metrics = evalmetrics()

knn = KNeighborsClassifier(n_neighbors=5)

KfoldTrainEvaluate(folds, knn, knn_metrics)

In [71]:
printmetrics(knn_metrics, 'K-NN')

Metrics for K-NN
Mean accuracy_scores: 0.52
Mean precision_scores: 0.53
Mean recall_scores: 0.52
Mean f1_scores: 0.52
Mean roc_auc_scores: 0.52
Mean log_loss: 3.60


## XG Boost

In [77]:
from xgboost import XGBClassifier

# Initializing evaluation metrics
xgb_metrics = evalmetrics()

xgb = XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss')

KfoldTrainEvaluate(tree_folds, xgb, xgb_metrics)

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



In [78]:
printmetrics(xgb_metrics, 'XGB')

Metrics for XGB
Mean accuracy_scores: 0.52
Mean precision_scores: 0.54
Mean recall_scores: 0.48
Mean f1_scores: 0.51
Mean roc_auc_scores: 0.54
Mean log_loss: 0.80
