### Import Data and Libraries


In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.preprocessing import StandardScaler
import warnings
from hyperopt import hp
import os


In [3]:
# Load datasets
train_data_fasttext_300d = pd.read_csv("../numerical_datasets/train_data_mod_fasttext_300d_numerical.csv")
train_data_glove_50d_0v = pd.read_csv("../numerical_datasets/train_data_mod_glove_50d_0v_numerical.csv")
train_data_glove_50d_custom = pd.read_csv("../numerical_datasets/train_data_mod_glove_50d_custom_numerical.csv")
train_data_word2vec_50d = pd.read_csv("../numerical_datasets/train_data_mod_word2vec_50d_numerical.csv")

test_data_fasttext_300d = pd.read_csv("../numerical_datasets/test_data_mod_fasttext_300d_numerical.csv")
test_data_glove_50d_0v = pd.read_csv("../numerical_datasets/test_data_mod_glove_50d_0v_numerical.csv")
test_data_glove_50d_custom = pd.read_csv("../numerical_datasets/test_data_mod_glove_50d_custom_numerical.csv")
test_data_word2vec_50d = pd.read_csv("../numerical_datasets/test_data_mod_word2vec_50d_numerical.csv")


In [4]:
datasets = [("fasttext_300d", train_data_fasttext_300d, test_data_fasttext_300d),
            ("glove_50d_0v", train_data_glove_50d_0v, test_data_glove_50d_0v),
            ("glove_50d_custom", train_data_glove_50d_custom, test_data_glove_50d_custom),
            ("word2vec_50d", train_data_word2vec_50d, test_data_word2vec_50d)]


### Model Evaluation: Logistic Regression

Logistic regression is a linear model that uses a logistic function to transform the output into a probability value between 0 and 1, which can be interpreted as the likelihood of the positive class. It works by finding the optimal values of the model coefficients that maximize the likelihood of the data given the model, typically using maximum likelihood estimation.

For each dataset, we evaluate the logistic regression model using stratified k-fold cross-validation and compute evaluation metrics which include: F1-score, precision, recall, accuracy, and ROC AUC.

#### Defining the Logistic Regression Model

In [5]:
# Define the logistic regression model
seed = 69
max_iter = 1000
lr_model = LogisticRegression(max_iter=max_iter, random_state=seed)

# Define the number of folds for stratified k-fold cross validation
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

# Define the evaluation metrics
eval_metrics = [accuracy_score, f1_score]


#### Logistic Regression w/ Default Configurations

In [6]:
# Use a loop to iterate over each dataset, and for each dataset, train and evaluate the model using stratified k-fold cross-validation
average_scores = {name: {metric.__name__: 0 for metric in eval_metrics} for name, _, _ in datasets}
best_scores = {metric.__name__: -np.inf for metric in eval_metrics}
best_dataset = ''

for name, train_data, test_data in datasets: # test_data is redundant
    X = train_data.drop(["id", "target"], axis=1)
    y = train_data["target"]

    scores = {metric.__name__: [] for metric in eval_metrics}
    
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        lr_model.fit(X_train, y_train)
        y_pred = lr_model.predict(X_test)
        
        for metric in eval_metrics:
            score = metric(y_test, y_pred)
            scores[metric.__name__].append(score)
    
    print(f"Results for dataset: {name}")
    for metric in eval_metrics:
        average_scores[name][metric.__name__] = np.mean(scores[metric.__name__])
        print(f"Average {metric.__name__} Score: {average_scores[name][metric.__name__]}")
    print("------------------------")

for metric in eval_metrics:
    best_score = -np.inf
    for name, _, _ in datasets:
        if average_scores[name][metric.__name__] > best_score:
            best_score = average_scores[name][metric.__name__]
            best_dataset = name
    best_scores[metric.__name__] = best_score

print(f"The dataset with the highest average scores is {best_dataset}, with the following average scores:")
for metric in eval_metrics:
    print(f"Average {metric.__name__} Score: {best_scores[metric.__name__]}")


Results for dataset: fasttext_300d
Average accuracy_score Score: 0.7571287563535211
Average f1_score Score: 0.7067025766996066
------------------------
Results for dataset: glove_50d_0v
Average accuracy_score Score: 0.7588353093132632
Average f1_score Score: 0.7091203600469612
------------------------
Results for dataset: glove_50d_custom
Average accuracy_score Score: 0.7590983802457802
Average f1_score Score: 0.7094373807659478
------------------------
Results for dataset: word2vec_50d
Average accuracy_score Score: 0.7575226293633407
Average f1_score Score: 0.7075464109754979
------------------------
The dataset with the highest average scores is glove_50d_custom, with the following average scores:
Average accuracy_score Score: 0.7590983802457802
Average f1_score Score: 0.7094373807659478


Using the default configurations of the logistic regression model, the best performing dataset is 'glove_50d_custom'

#### Tuning Hyperparameters using Bayesian Optimisation

Bayesian Optimisation is used to tune the hyperparameters for our Logistic Regression model. Unlike Grid Search or Random Search, Bayesian Optimisation can be more efficient as it uses information from previous iterations to guide the search towards promising regions in the hyperparameter space. This can result in better performance with fewer evaluations.

The hyperparameters to tune are the regularisation strength (C), the penalty functions (l1 or l2), and the solver method (lbfgs, liblinear, or saga)

#### Optimisation on Only Glove 50d Custom

In [7]:
# Final dataset
name, train_data, test_data = "glove_50d_custom", train_data_glove_50d_custom, test_data_glove_50d_custom

# Define the hyperparameter grid to search over
hyperparameter_grid = {
    "C": Real(0.001, 100.0, prior='log-uniform'),
    "penalty": Categorical(['l2']),
    "solver": Categorical(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
}

# This warning occurs when the objective function has been evaluated at a certain point during the Bayesian optimization process and can be safely ignored.
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

X_train = train_data.drop(["id", "target"], axis=1)
y_train = train_data["target"]

lr_model = LogisticRegression(max_iter=50000, random_state=seed) # max_iter raised to 50000 in order for model to converge

# Use Bayesian Optimization to find the best hyperparameters
opt = BayesSearchCV(
    estimator=lr_model,
    search_spaces=hyperparameter_grid,
    cv=skf,
    n_iter=50,
    scoring="f1",
    random_state=seed
)

opt.fit(X_train, y_train)
 
best_params = opt.best_params_
print(f"Best hyperparameters for dataset {name}: {best_params}")

# Train the model using the best hyperparameters
lr_model = LogisticRegression(**best_params, max_iter=50000, random_state=seed)

# Evaluate the model using stratified k-fold cross-validation
scores = {metric.__name__: [] for metric in eval_metrics}

for train_index, test_index in skf.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
    y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]
    
    lr_model.fit(X_train_fold, y_train_fold)
    y_pred = lr_model.predict(X_val_fold)
    
    for metric in eval_metrics:
        score = metric(y_val_fold, y_pred)
        scores[metric.__name__].append(score)

print(f"Results for dataset: {name}")
for metric in eval_metrics:
    average_scores[name][metric.__name__] = np.mean(scores[metric.__name__])
    print(f"Average {metric.__name__} Score: {average_scores[name][metric.__name__]}")
print("------------------------")


Best hyperparameters for dataset glove_50d_custom: OrderedDict([('C', 98.99856540492476), ('penalty', 'l2'), ('solver', 'liblinear')])
Results for dataset: glove_50d_custom
Average accuracy_score Score: 0.7650100129162738
Average f1_score Score: 0.7170666745676758
------------------------


Average accuracy_score Score: 0.7590983802457802 -> 0.7650100129162738

#### Optimisation on Every Dataset

In [8]:
# Define the hyperparameter grid to search over
hyperparameter_grid = {
    "C": Real(0.001, 100.0, prior='log-uniform'),
    "penalty": Categorical(['l2']),
    "solver": Categorical(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
}

# max_iter raised to 50000 in order for model to converge
max_iter = 50000

average_scores = {name: {metric.__name__: 0 for metric in eval_metrics} for name, _, _ in datasets}
best_scores = {metric.__name__: -np.inf for metric in eval_metrics}
best_dataset = ''

# This warning occurs when the objective function has been evaluated at a certain point during the Bayesian optimization process and can be safely ignored.
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

# Loop through each dataset, and for each dataset, train, tune hyperparameters, and evaluate the model using stratified k-fold cross-validation
for name, train_data, test_data in datasets: # test_data is redundant
    X_train = train_data.drop(["id", "target"], axis=1)
    y_train = train_data["target"]
    
    lr_model = LogisticRegression(max_iter=max_iter, random_state=seed)
    
    # Use Bayesian Optimization to find the best hyperparameters
    opt = BayesSearchCV(
        estimator=lr_model,
        search_spaces=hyperparameter_grid,
        cv=skf,
        n_iter=50,
        scoring="f1",
        random_state=seed
    )
    
    opt.fit(X_train, y_train)
        
    best_params = opt.best_params_
    print(f"Best hyperparameters for dataset {name}: {best_params}")
    
    # Train the model using the best hyperparameters
    lr_model = LogisticRegression(**best_params, max_iter=max_iter, random_state=seed)
    
    # Evaluate the model using stratified k-fold cross-validation
    scores = {metric.__name__: [] for metric in eval_metrics}
    
    for train_index, test_index in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]
        
        lr_model.fit(X_train_fold, y_train_fold)
        y_pred = lr_model.predict(X_val_fold)
        
        for metric in eval_metrics:
            score = metric(y_val_fold, y_pred)
            scores[metric.__name__].append(score)
    
    print(f"Results for dataset: {name}")
    for metric in eval_metrics:
        average_scores[name][metric.__name__] = np.mean(scores[metric.__name__])
        print(f"Average {metric.__name__} Score: {average_scores[name][metric.__name__]}")
    print("------------------------")

for metric in eval_metrics:
    best_score = -np.inf
    for name, _, _ in datasets:
        if average_scores[name][metric.__name__] > best_score:
            best_score = average_scores[name][metric.__name__]
            best_dataset = name
    best_scores[metric.__name__] = best_score

print(f"The dataset with the highest average scores is {best_dataset}, with the following average scores:")
for metric in eval_metrics:
    print(f"Best {metric.__name__} Score: {best_scores[metric.__name__]}")


Best hyperparameters for dataset fasttext_300d: OrderedDict([('C', 48.852137128109696), ('penalty', 'l2'), ('solver', 'liblinear')])
Results for dataset: fasttext_300d
Average accuracy_score Score: 0.7656666117343959
Average f1_score Score: 0.7178081032544782
------------------------
Best hyperparameters for dataset glove_50d_0v: OrderedDict([('C', 78.51393437358462), ('penalty', 'l2'), ('solver', 'lbfgs')])
Results for dataset: glove_50d_0v
Average accuracy_score Score: 0.7648783480284348
Average f1_score Score: 0.7171108912535352
------------------------
Best hyperparameters for dataset glove_50d_custom: OrderedDict([('C', 98.99856540492476), ('penalty', 'l2'), ('solver', 'liblinear')])
Results for dataset: glove_50d_custom
Average accuracy_score Score: 0.7650100129162738
Average f1_score Score: 0.7170666745676758
------------------------
Best hyperparameters for dataset word2vec_50d: OrderedDict([('C', 100.0), ('penalty', 'l2'), ('solver', 'newton-cg')])
Results for dataset: word2ve

#### Scaled Datasets

In order to further improve the performance of the logistic regression model, we used the Standard Scaler to normalize our feature variables.

Scaling the datasets is an important step in logistic regression because it helps to ensure convergence of the optimization process by bringing features to the same scale, reduce the influence of outliers by making their impact comparable to other data points and improve the interpretability of coefficients by making them comparable and interpretable, thus potentially improving the model's performance and interpretability.

In [7]:
datasets_scaled = []

for name, train_data, test_data in datasets:
    scaler = StandardScaler()
    
    # fit the scaler on the train data
    scaler.fit(train_data.drop(columns=['id', 'target']))
    
    # transform the train data
    train_data_scaled = train_data.copy()
    train_data_scaled[train_data.columns.difference(['id', 'target'])] = scaler.transform(train_data.drop(columns=['id', 'target']))
    
    # transform the test data
    test_data_scaled = test_data.copy()
    test_data_scaled[test_data.columns.difference(['id'])] = scaler.transform(test_data.drop(columns=['id']))
    
    # append '_scaled' to the name
    name_scaled = name + '_scaled'
    
    # store the scaled train and test data into a new tuple and append it to the list
    datasets_scaled.append((name_scaled, train_data_scaled, test_data_scaled))
    

# for name, train_data, test_data in datasets_scaled:
#     print("dataset: "+ name)
#     print(train_data.head())
#     print(test_data.head())

#### Optimisation on Scaled Datasets

In [10]:
# Define the hyperparameter grid to search over
hyperparameter_grid = {
    "C": Real(0.001, 100.0, prior='log-uniform'),
    "penalty": Categorical(['l2']),
    "solver": Categorical(['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']),
}

best_params_dict = {}

# max_iter raised to 50000 in order for model to converge
max_iter = 50000

average_scores = {name: {metric.__name__: 0 for metric in eval_metrics} for name, _, _ in datasets_scaled}
best_scores = {metric.__name__: -np.inf for metric in eval_metrics}
best_dataset = ''

# This warning occurs when the objective function has been evaluated at a certain point during the Bayesian optimization process and can be safely ignored.
warnings.filterwarnings('ignore', message='The objective has been evaluated at this point before.')

# Loop through each scaled dataset, and for each dataset, train, tune hyperparameters, and evaluate the model using stratified k-fold cross-validation
for name, train_data, test_data in datasets_scaled: # test_data is redundant
    X_train = train_data.drop(["id", "target"], axis=1)
    y_train = train_data["target"]
    
    lr_model = LogisticRegression(max_iter=max_iter, random_state=seed)
    
    # Use Bayesian Optimization to find the best hyperparameters
    opt = BayesSearchCV(
        estimator=lr_model,
        search_spaces=hyperparameter_grid,
        cv=skf,
        n_iter=50,
        scoring="f1",
        random_state=seed
    )
    
    opt.fit(X_train, y_train)
        
    best_params = opt.best_params_
    print(f"Best hyperparameters for dataset {name}: {best_params}")
    
    # Store the best parameters for this dataset in the dictionary
    best_params_dict[name] = best_params
    
    # Train the model using the best hyperparameters
    lr_model = LogisticRegression(**best_params, max_iter=max_iter, random_state=seed)
    
    # Evaluate the model using stratified k-fold cross-validation
    scores = {metric.__name__: [] for metric in eval_metrics}
    
    for train_index, test_index in skf.split(X_train, y_train):
        X_train_fold, X_val_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_val_fold = y_train.iloc[train_index], y_train.iloc[test_index]
        
        lr_model.fit(X_train_fold, y_train_fold)
        y_pred = lr_model.predict(X_val_fold)
        
        for metric in eval_metrics:
            score = metric(y_val_fold, y_pred)
            scores[metric.__name__].append(score)
    
    print(f"Results for dataset: {name}")
    for metric in eval_metrics:
        average_scores[name][metric.__name__] = np.mean(scores[metric.__name__])
        print(f"Average {metric.__name__} Score: {average_scores[name][metric.__name__]}")
    print("------------------------")

for metric in eval_metrics:
    best_score = -np.inf
    for name, _, _ in datasets_scaled:
        if average_scores[name][metric.__name__] > best_score:
            best_score = average_scores[name][metric.__name__]
            best_dataset = name
    best_scores[metric.__name__] = best_score

print(f"The dataset with the highest average scores is {best_dataset}, with the following average scores:")
for metric in eval_metrics:
    print(f"Best {metric.__name__} Score: {best_scores[metric.__name__]}")


Best hyperparameters for dataset fasttext_300d_scaled: OrderedDict([('C', 0.002988802667390649), ('penalty', 'l2'), ('solver', 'liblinear')])
Results for dataset: fasttext_300d_scaled
Average accuracy_score Score: 0.7664553931266787
Average f1_score Score: 0.7210398101940598
------------------------
Best hyperparameters for dataset glove_50d_0v_scaled: OrderedDict([('C', 0.0034381476447487057), ('penalty', 'l2'), ('solver', 'liblinear')])
Results for dataset: glove_50d_0v_scaled
Average accuracy_score Score: 0.7648781754663275
Average f1_score Score: 0.718438160277155
------------------------
Best hyperparameters for dataset glove_50d_custom_scaled: OrderedDict([('C', 0.0034381476447487057), ('penalty', 'l2'), ('solver', 'liblinear')])
Results for dataset: glove_50d_custom_scaled
Average accuracy_score Score: 0.7643531552549907
Average f1_score Score: 0.7176344861675744
------------------------
Best hyperparameters for dataset word2vec_50d_scaled: OrderedDict([('C', 13.301525779043082)

In [11]:
print(best_params_dict)

{'fasttext_300d_scaled': OrderedDict([('C', 0.002988802667390649), ('penalty', 'l2'), ('solver', 'liblinear')]), 'glove_50d_0v_scaled': OrderedDict([('C', 0.0034381476447487057), ('penalty', 'l2'), ('solver', 'liblinear')]), 'glove_50d_custom_scaled': OrderedDict([('C', 0.0034381476447487057), ('penalty', 'l2'), ('solver', 'liblinear')]), 'word2vec_50d_scaled': OrderedDict([('C', 13.301525779043082), ('penalty', 'l2'), ('solver', 'saga')])}


#### Run Optimised Model on Each Test Set

In [21]:
test_data_pred_lr = []

for name, train_data, test_data in datasets_scaled:
    X_train = train_data.drop(["id", "target"], axis=1)
    y_train = train_data["target"]
    
    X_test = test_data.drop("id", axis=1)
    
    lr_model = LogisticRegression(**best_params_dict[best_dataset], max_iter=max_iter, random_state=seed)
    lr_model.fit(X_train, y_train)

    # Predict the target values of the test set
    y_pred = lr_model.predict(X_test)
    
    # Create a copy of the test data and add the predicted values to it
    test_data_pred = test_data.copy()
    test_data_pred['target'] = y_pred
    
    # Store test data with predicted values to list
    test_data_pred_lr.append(('test_data_pred_lr_'+name, test_data_pred))


In [24]:
# Export predicted test data to CSV file
output_dir = 'logistic_regression_predictions'

for name, test_data in test_data_pred_lr:
    filename = os.path.join(output_dir, f'{name}.csv')
    test_data.to_csv(filename, index=False)

In [6]:
test_data_fasttext_300d_pred = pd.read_csv("logistic_regression_predictions/test_data_pred_lr_fasttext_300d_scaled.csv")
test_data_glove_50d_0v_pred = pd.read_csv("logistic_regression_predictions/test_data_pred_lr_glove_50d_0v_scaled.csv")
test_data_glove_50d_custom_pred = pd.read_csv("logistic_regression_predictions/test_data_pred_lr_glove_50d_custom_scaled.csv")
test_data_word2vec_50d_pred = pd.read_csv("logistic_regression_predictions/test_data_pred_lr_word2vec_50d_scaled.csv")

In [7]:
datasets_submission = [("fasttext_300d", test_data_fasttext_300d_pred),
            ("glove_50d_0v", test_data_glove_50d_0v_pred),
            ("glove_50d_custom", test_data_glove_50d_custom_pred),
            ("word2vec_50d", test_data_word2vec_50d_pred)]

In [9]:
for name, test_data in datasets_submission:
    # submission format
    submission = test_data[['id', 'target']]
    
    # Export the merged submission DataFrame to a CSV file
    submission.to_csv(f'logistic_regression_predictions/kaggle_submission/submission_{name}.csv', index=False)


In [10]:
for name, test_data in datasets_submission:
    print(test_data.shape[0])

3263
3263
3263
3263


## Results 
 
Logistic Regression Test Set Prediction Scores (from Kaggle submission): 
1. submission_word2vec_50d.csv: 0.7346 
2. submission_glove_50d_custom.csv: 0.73398 
3. submission_glove_50d_0v.csv: 0.73398 
4. submission_fasttext_300d.csv: 0.73858 
 
Overall, the best performing dataset is the one that uses fasttext_300d. This aligns with the accuracy scores from the previous cross-validation.