In [8]:
# Import useful libraries
import numpy as np
import matplotlib.pyplot as plt
import time
from helpers import *
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [4]:
# Define the path to the dataset
data_path = "data/dataset_to_release_2"

# Load data from the specified dataset path
x_train, x_test, y_train, train_ids, test_ids = load_csv_data(data_path)

# Data Cleaning

We clean our data with the same manner as in our implementation

In [11]:
# Create a copy of each array to avoid modifying the original data :
x_train_cleaned = x_train.copy()

# Calculate the fraction of NaN values for each column
nan_fraction_train = np.isnan(x_train_cleaned).mean(axis=0)

# Use the mask to filter out columns with more than 80% NaN values
mask_null = nan_fraction_train < 0.8
x_train_cleaned = x_train_cleaned[:, mask_null]

# Identify non-constant columns, columns that have a standard deviation not equal to zero :
mask_variance = (np.nanstd(x_train_cleaned, axis=0) != 0.0)

# Retain only the non-constant columns in the x_train_copied dataset :
x_train_cleaned = x_train_cleaned[:, mask_variance]

# For each element in x arrays, if the element is NaN, replace it with the median :
x_train_cleaned = np.where(np.isnan(x_train_cleaned), np.nanmedian(x_train_cleaned, axis=0), x_train_cleaned)

# Standardize the data
scaler = StandardScaler()
x_train_cleaned = scaler.fit_transform(x_train_cleaned)

# Split your data into training and testing sets
x_train_set, x_test_set, y_train_set, y_test_set = train_test_split(x_train_cleaned, y_train, test_size=0.2, random_state=42)

# Comparison and interpretation with external libraries

Let's recall that our Logistic Regression model gave this results (used for the submission) :

- **Gamma: 0.25, Max Iters: 1000**
- Accuracy: 87.49%
- F1 Score: 40.13%
- Time Taken: 87.77 seconds

We can to compare our Logistic Regression model with the sklearn Logitic Regression model. Since our data set is too heavy, we have to use the `solver = 'newton-cg'`.

In [17]:
# Train the model
start_time = time.time()
lr = LogisticRegression(solver='newton-cg')
lr.fit(x_train_set, y_train_set)
end_time = time.time()

elapsed_time = end_time - start_time

# Predicting on the test data
pred_test = lr.predict(x_test_set)

# Compute accuracy and F1 score
accuracy = accuracy_score(y_test_set, pred_test)*100
f1 = f1_score(y_test_set, pred_test)*100

print(f"Accuracy: {accuracy:.2f}%")
print(f"F1 score: {f1:.2f}%")
print(f"Time Taken: {elapsed_time:.2f} seconds")

Accuracy: 91.36%
F1 score: 17.94%
Time Taken: 141.52 seconds


It seems that our model is better than the one used in `sklearn` in term of results and performances. 

Claim : This is probably due to the unbalanced nature of the data, and the fixed decision boundary of 0.5.
Therefore we can try some data processing in order to prove our claim. 
Among them :

- Data Oversampling :

In [19]:
ros = RandomOverSampler(random_state=0)
x_train_resampled, y_train_resampled = ros.fit_resample(x_train_set, y_train_set.reshape(-1, 1))

# Train the model
start_time = time.time()
lr = LogisticRegression(solver='newton-cg')
lr.fit(x_train_resampled, y_train_resampled)
end_time = time.time()

elapsed_time = end_time - start_time

# Predicting on the test data
pred_test = lr.predict(x_test_set)

# Compute accuracy and F1 score
accuracy = accuracy_score(y_test_set, pred_test)*100
f1 = f1_score(y_test_set, pred_test)*100

print(f"Accuracy: {accuracy:.2f}%")
print(f"F1 score: {f1:.2f}%")
print(f"Time Taken: {elapsed_time:.2f} seconds")

Accuracy: 75.13%
F1 score: 35.95%
Time Taken: 552.40 seconds


According to these results, our claim in proved.

But even with data resampling, the sklearn Logitic Regression model still has lower results than our Logitic Regression implementation, and takes much more time.

- Applying Class weight :

In [20]:
# Train the model
start_time = time.time()
lr = LogisticRegression(solver='newton-cg', class_weight={-1: 0.3, 1: 1.0})
lr.fit(x_train_set, y_train_set)
end_time = time.time()

elapsed_time = end_time - start_time

# Predicting on the test data
pred_test = lr.predict(x_test_set)

# Compute accuracy and F1 score
accuracy = accuracy_score(y_test_set, pred_test)*100
f1 = f1_score(y_test_set, pred_test)*100

print(f"Accuracy: {accuracy:.2f}%")
print(f"F1 score: {f1:.2f}%")
print(f"Time Taken: {elapsed_time:.2f} seconds")

Accuracy: 87.88%
F1 score: 40.01%
Time Taken: 85.47 seconds


When applying Class weights, the sklearn Logitic Regression model is **extremely similar** to ours not only in term of results,  but also performances :

- Accuracy: 87.49%
- F1 Score: 40.13%
- Time Taken: 87.77 seconds

Therefore, we can conclude that our Logistic Regression Model is quite good.