In [1]:
import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import make_scorer, precision_score, confusion_matrix
from sklearn.model_selection import cross_validate
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

In [2]:
# Silences the loky warning "Could not find the number of physical cores" by
# stating the CPU budget explicitly: all logical CPUs but one.
# os.cpu_count() may return None when the count is undetermined, and a
# single-CPU host would otherwise yield 0, so fall back to 1 and clamp at 1.
os.environ['LOKY_MAX_CPU_COUNT'] = str(max(1, (os.cpu_count() or 1) - 1))

In [3]:
# Macro-averaged (unweighted) one-vs-rest specificity for multiclass labels
def multiclassSpecificity(yTrue, yPredict):
    """Return the mean of the per-class specificities.

    Each class is treated one-vs-rest on the confusion matrix of
    ``yTrue`` against ``yPredict``:

    * false positives = column total minus the diagonal entry,
    * true negatives  = grand total minus row total minus column total
      plus the diagonal entry,
    * specificity     = TN / (TN + FP), or 0 when TN + FP is 0.

    The per-class values are averaged with equal weight (plain ``np.mean``);
    class support is NOT used as a weight.

    Parameters
    ----------
    yTrue : array-like
        Ground-truth labels, forwarded to ``confusion_matrix``.
    yPredict : array-like
        Predicted labels, forwarded to ``confusion_matrix``.

    Returns
    -------
    float
        Unweighted average specificity over the classes in the matrix.
    """
    matrix = confusion_matrix(yTrue, yPredict)

    hits = np.diag(matrix)
    rowTotals = matrix.sum(axis=1)
    colTotals = matrix.sum(axis=0)

    falsePositives = colTotals - hits
    trueNegatives = matrix.sum() - rowTotals - colTotals + hits
    negatives = trueNegatives + falsePositives

    # Classes with no negative samples keep the pre-filled 0 instead of dividing by zero.
    perClass = np.divide(
        trueNegatives,
        negatives,
        out=np.zeros(len(matrix), dtype=float),
        where=negatives > 0,
    )
    return np.mean(perClass)

In [9]:
def makePipeline(modelToUsed):
    """Build a two-step pipeline: feature selection, then the given classifier.

    Parameters
    ----------
    modelToUsed : estimator
        Object installed as the final ``'classifier'`` step.

    Returns
    -------
    Pipeline
        Steps are ``'feature_selection'`` (a ``SelectFromModel`` wrapping a
        100-tree ``RandomForestClassifier`` with ``random_state=42``,
        ``n_jobs=-1`` and ``threshold='median'``) followed by ``'classifier'``.
    """
    selectorForest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
    featureSelector = SelectFromModel(estimator=selectorForest, threshold='median')
    return Pipeline(steps=[
        ('feature_selection', featureSelector),
        ('classifier', modelToUsed),
    ])

#### Read data and define target

In [4]:
# Column that holds the label to predict.
targetCol = 'imdb_rating'

# The dataset file has no extension but is parsed as CSV; the path is
# relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../Dataset/dataFrameProcessed')

Split the data into features and target

In [5]:
# Number of cross-validation folds (used as `cv` in the evaluation cell).
fold = 5

# Target vector, and the feature matrix made of every remaining column.
y = df[targetCol]
X = df.drop(columns=targetCol)

Build the set of candidate models

In [6]:
# Display name -> unfitted classifier. Insertion order is the row order of the
# results table. Every model is seeded with random_state=42.
models = dict([
    ('Support Vector Machine', SVC(kernel='rbf', random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Hist Gradient Boosting', HistGradientBoostingClassifier(random_state=42)),
])

Build the scoring metrics

In [7]:
# Scorer name -> scorer. cross_validate reports each entry as 'test_<name>'.
# NOTE(review): precision is built with zero_division=1, whereas the string
# scorers 'recall_weighted' and 'f1_weighted' use their library defaults;
# confirm that this difference is intended.
scoringMetrix = dict(
    accuracy='accuracy',
    precision=make_scorer(precision_score, average='weighted', zero_division=1),
    recall='recall_weighted',
    f1='f1_weighted',
    specificity=make_scorer(multiclassSpecificity),
)

In [14]:
# Summary-table column label -> key under which cross_validate stores the
# per-fold test scores for that metric.
metricKeys = (
    ('Accuracy', 'test_accuracy'),
    ('Precision', 'test_precision'),
    ('Recall', 'test_recall'),
    ('F1 Score', 'test_f1'),
    ('Specificity', 'test_specificity'),
)

dataset_results = []

for name, model in models.items():
    # Feature selection lives inside the pipeline, so it is refit on each training fold.
    pipeCV = makePipeline(model)

    # error_score='raise' makes a failing fit abort the run rather than record a placeholder score.
    cvResult = cross_validate(
        pipeCV, X, y,
        cv=fold,
        scoring=scoringMetrix,
        n_jobs=-1,
        error_score='raise',
    )

    # One summary row per model: the mean of each metric over the folds.
    dataset_results.append({
        'Model': name,
        **{label: np.mean(cvResult[key]) for label, key in metricKeys},
    })

print(pd.DataFrame(dataset_results).set_index('Model'))

                        Accuracy  Precision  Recall  F1 Score  Specificity
Model                                                                     
Support Vector Machine    0.7270   0.727208  0.7270  0.717699     0.916118
Random Forest             0.7692   0.774183  0.7692  0.762852     0.926033
Hist Gradient Boosting    0.7746   0.780896  0.7746  0.766508     0.929418
