# Classical ML Models Benchmark

This notebook contains attempts to solve the problem of predicting ratings will classical ML models which support multinomial classification. The scores achieved by these models will serve as a benchmark for the deep neural network based approach

In [1]:
# General Imports
from os.path import join
import multiprocessing
import pickle
import numpy as np
import pandas as pd

# Classifier Imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, KFold


SEED = 0
CPU_COUNT = multiprocessing.cpu_count() - 2

In [2]:
# Wrapper class for a general SKLearn classifier
class Classifier():
    def __init__(self, classifier_name, classifier, init_params, param_grid, seed):
        self.classifier_name = classifier_name
        self.seed = seed
        self.param_grid = param_grid

        #Init classifier
        self.init_params = init_params
        self.init_params["random_state"] = seed
        self.classifier = classifier(**self.init_params) if init_params else classifier(random_state=seed)

        #Dict to explicitly store best stats
        self.best_stats = {"best_params": None, "best_score": None}
    
    def fit(self, X, y):
        print(f"Fitting {self.classifier_name} model...")
        self.classifier.fit(X, y)

    def predict(self, X):
        return self.classifier.predict(X)
    
    def evaluate(self, X_test, y_test):
        return self.classifier.score(X_test, y_test)
    
    def tune_hyperparameters(self, X, y):
        print(f"Tuning hyperparameters for {self.classifier_name} model...")
        cv = KFold(n_splits=5, random_state=self.seed, shuffle=True)
        gscv = GridSearchCV(self.classifier, self.param_grid, scoring="accuracy", cv=cv, n_jobs=-1)
        gscv.fit(X, y)
        self.classifier = gscv.best_estimator_
        self.best_stats["best_params"], self.best_stats["best_score"] = gscv.best_params_, gscv.best_score_

In [3]:
# Loading train, val, and test data (BERT text embeddings and corresponding labels)
data_dir = "data/"

X_train = pickle.load(open(join(data_dir, "downsampled_shuffled_train_embeddings.pkl"), "rb")).numpy()
y_train = pickle.load(open(join(data_dir, "downsampled_shuffled_train_labels.pkl"), "rb"))
X_val = pickle.load(open(join(data_dir, "downsampled_shuffled_val_embeddings.pkl"), "rb")).numpy()
y_val = pickle.load(open(join(data_dir, "downsampled_shuffled_val_labels.pkl"), "rb"))
X_test = pickle.load(open(join(data_dir, "downsampled_shuffled_test_embeddings.pkl"), "rb")).numpy()
y_test = pickle.load(open(join(data_dir, "downsampled_shuffled_test_labels.pkl"), "rb"))

# Combine train and validation set into one as we use K-Fold cross validation
X_train = np.concatenate([X_train, X_val])
y_train = np.concatenate([y_train, y_val])

print(f"X_train: {X_train.shape} | X_test: {X_test.shape} | \n" +
    f"y_train: {y_train.shape} | y_test: {y_test.shape} | ")

X_train: (9000, 768) | X_test: (1000, 768) | 
y_train: (9000,) | y_test: (1000,) | 


In [4]:
# Create list of classifiers
SEED = 0
param_grids = []

# Create parameter grids for hyperparameter tuning
rf_param_grid = {"max_features": ["sqrt", "log2"],
                    "max_depth" : [3, 6, 8],
                    "criterion" :["gini", "entropy"]     ,
                    "n_jobs": [-1]}

lsvc_param_grid = {"penalty": ["l2"],
                   "C": [0.0001, 0.01, 1.0, 10, 100]}

lreg_param_grid = {'penalty' : ['l1', 'l2'],
                     'C' : np.logspace(-4, 4, 20)}

clf_names = ["RandomForest", "LinearSVC", "LogisticRegression"]
clfs = [RandomForestClassifier, LinearSVC, LogisticRegression]
init_params = [{'n_jobs': CPU_COUNT}, {'multi_class': 'crammer_singer'}, {'multi_class': 'multinomial', 'solver': 'lbfgs'}]
param_grids.extend([rf_param_grid, lsvc_param_grid, lreg_param_grid])


classifiers = [Classifier(name, model, {}, param_grid, SEED) 
               for name, model, param_grid in zip(clf_names, clfs, param_grids)]

In [5]:
# Fit classifiers
[clf.fit(X_train, y_train) for clf in classifiers]

Fitting RandomForest model...
Fitting LinearSVC model...




Fitting LogisticRegression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


[None, None, None]

In [6]:
# Score classifier
[clf.evaluate(X_test, y_test) for clf in classifiers]

[0.19, 0.176, 0.182]

In [7]:
# Tune hyperparameters
[clf.tune_hyperparameters(X_train, y_train) for clf in classifiers]

Tuning hyperparameters for RandomForest model...
Tuning hyperparameters for LinearSVC model...
Tuning hyperparameters for LogisticRegression model...


        nan 0.19933333        nan 0.19811111        nan 0.19211111
        nan 0.19122222        nan 0.19344444        nan 0.19411111
        nan 0.19544444        nan 0.19511111        nan 0.195
        nan 0.19388889        nan 0.19466667        nan 0.19488889
        nan 0.195             nan 0.19411111        nan 0.195
        nan 0.19344444        nan 0.19455556]


[None, None, None]

In [8]:
# Score tuned clasifiers
[clf.evaluate(X_test, y_test) for clf in classifiers]

[0.193, 0.205, 0.208]

In [10]:
# Save models
models_dir = "models/"
for clf in classifiers:
    pickle.dump(clf, open(join(models_dir, f"{clf.classifier_name}.pkl"), "wb")) 

From the above results, we can see that the best performing classical ML model (Logistic Regression) was able to achieve a 20.8% accuracy, showing that the data is highly non-linear and not very seperable.  