### Import Data and Libraries


In [9]:
import pandas as pd
import numpy as np # numpy=1.23.5
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.preprocessing import StandardScaler
import warnings


In [10]:
# Load datasets
train_data_fasttext_300d = pd.read_csv("numerical_datasets/train_data_mod_fasttext_300d_numerical.csv")
train_data_glove_50d_0v = pd.read_csv("numerical_datasets/train_data_mod_glove_50d_0v_numerical.csv")
train_data_glove_50d_custom = pd.read_csv("numerical_datasets/train_data_mod_glove_50d_custom_numerical.csv")
train_data_word2vec_50d = pd.read_csv("numerical_datasets/train_data_mod_word2vec_50d_numerical.csv")

test_data_fasttext_300d = pd.read_csv("numerical_datasets/test_data_mod_fasttext_300d_numerical.csv")
test_data_glove_50d_0v = pd.read_csv("numerical_datasets/test_data_mod_glove_50d_0v_numerical.csv")
test_data_glove_50d_custom = pd.read_csv("numerical_datasets/test_data_mod_glove_50d_custom_numerical.csv")
test_data_word2vec_50d = pd.read_csv("numerical_datasets/test_data_mod_word2vec_50d_numerical.csv")


In [11]:
datasets = [("fasttext_300d", train_data_fasttext_300d, test_data_fasttext_300d),
            ("glove_50d_0v", train_data_glove_50d_0v, test_data_glove_50d_0v),
            ("glove_50d_custom", train_data_glove_50d_custom, test_data_glove_50d_custom),
            ("word2vec_50d", train_data_word2vec_50d, test_data_word2vec_50d)]


### Model Evaluation: Logistic Regression

Logistic regression is a linear model that uses a logistic function to transform the output into a probability value between 0 and 1, which can be interpreted as the likelihood of the positive class. It works by finding the optimal values of the model coefficients that maximize the likelihood of the data given the model, typically using maximum likelihood estimation.

For each dataset, we evaluate the logistic regression model using stratified k-fold cross-validation and compute evaluation metrics which include: F1-score, precision, recall, accuracy, and ROC AUC.

#### Scaling Datasets

In [12]:
# # Scale datasets
# scaler = StandardScaler()

# datasets_scaled = []

# for name, train_data, test_data in datasets:
#     X_train = train_data.drop("target", axis=1)
#     y_train = train_data["target"]
#     X_test = test_data
    
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)
#     y_train_scaled = y_train
    
#     train_data_scaled = pd.concat([pd.DataFrame(X_train_scaled), y_train_scaled.reset_index(drop=True)], axis=1)
#     test_data_scaled = pd.DataFrame(X_test_scaled)
    
#     datasets_scaled.append((name + "_scaled", train_data_scaled, test_data_scaled))
    
# for name, train_data, test_data in datasets_scaled:
#     print(train_data.head())


#### Defining the Logistic Regression Model

In [13]:
# Define the logistic regression model
seed = 69
max_iter = 1000
lr_model = LogisticRegression(max_iter=max_iter, random_state=seed)

# Define the number of folds for stratified k-fold cross validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Define the evaluation metrics
eval_metrics = [accuracy_score, f1_score, precision_score, recall_score, roc_auc_score]


#### Logistic Regression w/ Default Configurations

In [16]:
# Use a loop to iterate over each dataset, and for each dataset, train and evaluate the model using stratified k-fold cross-validation
average_scores = {name: {metric.__name__: 0 for metric in eval_metrics} for name, _, _ in datasets}
best_scores = {metric.__name__: -np.inf for metric in eval_metrics}
best_dataset = ''

for name, train_data, test_data in datasets: # test_data is redundant
    X = train_data.drop(["id", "target"], axis=1)
    y = train_data["target"]

    scores = {metric.__name__: [] for metric in eval_metrics}
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        lr_model.fit(X_train, y_train)
        y_pred = lr_model.predict(X_test)
        
        for metric in eval_metrics:
            score = metric(y_test, y_pred)
            scores[metric.__name__].append(score)
    
    print(f"Results for dataset: {name}")
    for metric in eval_metrics:
        average_scores[name][metric.__name__] = np.mean(scores[metric.__name__])
        print(f"Average {metric.__name__} Score: {average_scores[name][metric.__name__]}")
    print("------------------------")

for metric in eval_metrics:
    best_score = -np.inf
    for name, _, _ in datasets:
        if average_scores[name][metric.__name__] > best_score:
            best_score = average_scores[name][metric.__name__]
            best_dataset = name
    best_scores[metric.__name__] = best_score

print(f"The dataset with the highest average scores is {best_dataset}, with the following average scores:")
for metric in eval_metrics:
    print(f"Best {metric.__name__} Score: {best_scores[metric.__name__]}")


Results for dataset: fasttext_300d
Average accuracy_score Score: 0.7571287563535211
Average f1_score Score: 0.7067025766996066
Average precision_score Score: 0.7343286833841852
Average recall_score Score: 0.6811350001167215
Average roc_auc_score Score: 0.7477546371352487
------------------------
Results for dataset: glove_50d_0v
Average accuracy_score Score: 0.7588353093132632
Average f1_score Score: 0.7091203600469612
Average precision_score Score: 0.7360393621803702
Average recall_score Score: 0.6841921703200505
Average roc_auc_score Score: 0.7496285792074233
------------------------
Results for dataset: glove_50d_custom
Average accuracy_score Score: 0.7590983802457802
Average f1_score Score: 0.7094373807659478
Average precision_score Score: 0.7363752451976321
Average recall_score Score: 0.6844984476037071
Average roc_auc_score Score: 0.749896660073218
------------------------
Results for dataset: word2vec_50d
Average accuracy_score Score: 0.7575226293633407
Average f1_score Score: 0

Using the default configurations of the logistic regression model, the best performing dataset is 'glove_50d_custom'

#### Tuning Hyperparameters using Bayesian Optimisation

Bayesian Optimisation is used to tune the hyperparameters for our Logistic Regression model. Unlike Grid Search or Random Search, Bayesian Optimisation can be more efficient as it uses information from previous iterations to guide the search towards promising regions in the hyperparameter space. This can result in better performance with fewer evaluations.

The hyperparameters to tune are the regularisation strength (C), the penalty functions (l1 or l2), and the solver method (lbfgs, liblinear, or saga)

In [None]:
# Define the hyperparameter grid to search over
hyperparameter_grid = {
    "C": Real(0.001, 100.0, prior='log-uniform'),
    "penalty": Categorical(['l1', 'l2']),
    "solver": Categorical(['lbfgs', 'liblinear', 'saga']),
}

average_scores = {name: {metric.__name__: 0 for metric in eval_metrics} for name, _, _ in datasets}
best_scores = {metric.__name__: -np.inf for metric in eval_metrics}
best_dataset = ''

# This warning occurs when the objective function has been evaluated at a certain point during the Bayesian optimization process and can be safely ignored.
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

# Loop through each dataset, and for each dataset, train, tune hyperparameters, and evaluate the model using stratified k-fold cross-validation
for name, train_data, test_data in datasets: # test_data is redundant
    X_train = train_data.drop(["id", "target"], axis=1)
    y_train = train_data["target"]
    
    lr_model = LogisticRegression(max_iter=max_iter, random_state=seed)
    
    # Use Bayesian Optimization to find the best hyperparameters
    opt = BayesSearchCV(
        estimator=lr_model,
        search_spaces=hyperparameter_grid,
        cv=skf,
        n_iter=50,
        scoring="f1",
        random_state=seed
    )
    
    try:
        opt.fit(X_train, y_train)
    except ValueError as e:
        if "Solver lbfgs supports only 'l2' or 'none' penalties, got l1 penalty" in str(e):
            print("Ignoring lbfgs solver with l1 penalty.")
        else:
            raise e
        
    best_params = opt.best_params_
    print(f"Best hyperparameters for dataset {name}: {best_params}")
    
    # Train the model using the best hyperparameters
    lr_model = LogisticRegression(**best_params, max_iter=max_iter, random_state=seed)
    
    # Evaluate the model using stratified k-fold cross-validation
    scores = {metric.__name__: [] for metric in eval_metrics}
    
    for train_index, test_index in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]
        
        lr_model.fit(X_train_fold, y_train_fold)
        y_pred = lr_model.predict(X_val_fold)
        
        for metric in eval_metrics:
            score = metric(y_val_fold, y_pred)
            scores[metric.__name__].append(score)
    
    print(f"Results for dataset: {name}")
    for metric in eval_metrics:
        average_scores[name][metric.__name__] = np.mean(scores[metric.__name__])
        print(f"Average {metric.__name__} Score: {average_scores[name][metric.__name__]}")
    print("------------------------")

for metric in eval_metrics:
    best_score = -np.inf
    for name, _, _ in datasets:
        if average_scores[name][metric.__name__] > best_score:
            best_score = average_scores[name][metric.__name__]
            best_dataset = name
    best_scores[metric.__name__] = best_score

print(f"The dataset with the highest average scores is {best_dataset}, with the following average scores:")
for metric in eval_metrics:
    print(f"Best {metric.__name__} Score: {best_scores[metric.__name__]}")
