## Model 1: Logistic Regression

In [1]:
import pandas as pd
import numpy as np
import time

import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, RocCurveDisplay
from sklearn.metrics import balanced_accuracy_score

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from sklearn.model_selection import StratifiedKFold


save_figures = False
figure_path = "../figures/models/"
# df_init = pd.read_parquet("../data/init.parquet")
# df_before = pd.read_parquet("../data/before_split.parquet")
# df_train = pd.read_parquet("../data/prep_train.parquet")
# df_val = pd.read_parquet("../data/prep_val.parquet")
# df_test = pd.read_parquet("../data/prep_test.parquet")
# df_tt_train = pd.read_parquet("../data/prep_tt_train.parquet")
# df_tt_test = pd.read_parquet("../data/prep_tt_test.parquet")
df_all = pd.read_parquet("../data/prep_all.parquet")

In [2]:
df_all.head()

Unnamed: 0,Year_Y2015,Year_Y2016,Country_England,Country_Wales,Supermarket_Asda,Supermarket_Tesco Extra,Supermarket_Tesco Metro,Supermarket_Waitrose,Time_Evening,Time_Morning,...,YearCountryAge_Y2016EnglandAge_g2,YearCountryAge_Y2016EnglandAge_g3,YearCountryAge_Y2016WalesAge_g1,YearCountryAge_Y2016WalesAge_g2,YearCountryAge_Y2016WalesAge_g3,YearCountryAge_nan,ObsSize,FemaleN,MaleN,y
0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.471136,-1.156159,0.871226,1
1,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,-0.471136,-1.156159,0.871226,1
2,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,-0.471136,0.583858,-0.993069,1
3,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.811174,2.323876,-0.993069,1
4,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,-0.471136,0.583858,-0.993069,1


In [3]:
X_all = df_all.iloc[:,0:-1].to_numpy()
y_all = df_all["y"].to_numpy()

### Logistic Regression No Penalty

In [4]:
random_state_LR = 132
LR_no = LogisticRegression(penalty = 'none', 
                           solver = 'saga', 
                           random_state = random_state_LR,
                           fit_intercept = False, 
                           max_iter = 10000, 
                           class_weight = 'balanced')
LR_no 

LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=10000,
                   penalty='none', random_state=132, solver='saga')

In [5]:
skf = StratifiedKFold(n_splits = 5, 
                      shuffle = True,
                      random_state = random_state_LR)

random_states_split = [123, 456, 789, 
                       741, 852, 963, 
                       159, 753, 951, 357]

In [6]:
%%time
BalAccuracy_no = []

for each_rs in random_states_split:
    start_time = time.time()
    
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs, 
                         stratify = y_all)
    
    BalAccuracy_no_cv = []
    
    for train_index, val_index in skf.split(X_other_temp, y_other_temp):
        cv_start_time = time.time()

        X_train_no, X_val_no = X_other_temp[train_index], X_other_temp[val_index]
        y_train_no, y_val_no = y_other_temp[train_index], y_other_temp[val_index]

        LR_no.fit(X_train_no, y_train_no)

        y_pred_no_temp = LR_no.predict(X_val_no)
        BalAccuracy_no_cv.append(balanced_accuracy_score(y_val_no, y_pred_no_temp))

        cv_end_time = time.time()
        print(f"\tCV: {cv_end_time - cv_start_time:.3f} seconds", flush = True)
    
    BalAccuracy_no.append(np.mean(BalAccuracy_no_cv))
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

	CV: 2.665 seconds
	CV: 2.498 seconds
	CV: 2.587 seconds
	CV: 2.382 seconds
	CV: 2.468 seconds
Iteration 123: 12.616 seconds
	CV: 2.606 seconds
	CV: 2.554 seconds
	CV: 2.449 seconds
	CV: 2.570 seconds
	CV: 2.379 seconds
Iteration 456: 12.565 seconds
	CV: 2.422 seconds
	CV: 2.593 seconds
	CV: 2.439 seconds
	CV: 2.440 seconds
	CV: 2.547 seconds
Iteration 789: 12.447 seconds
	CV: 2.440 seconds
	CV: 2.425 seconds
	CV: 2.521 seconds
	CV: 2.379 seconds
	CV: 2.433 seconds
Iteration 741: 12.202 seconds
	CV: 2.502 seconds
	CV: 2.372 seconds
	CV: 2.460 seconds
	CV: 2.541 seconds
	CV: 2.399 seconds
Iteration 852: 12.280 seconds
	CV: 2.392 seconds
	CV: 2.547 seconds
	CV: 2.440 seconds
	CV: 2.442 seconds
	CV: 2.517 seconds
Iteration 963: 12.348 seconds
	CV: 2.515 seconds
	CV: 2.471 seconds
	CV: 2.589 seconds
	CV: 2.551 seconds
	CV: 2.486 seconds
Iteration 159: 12.619 seconds
	CV: 2.605 seconds
	CV: 2.452 seconds
	CV: 2.462 seconds
	CV: 2.545 seconds
	CV: 2.393 seconds
Iteration 753: 12.466 seconds


In [7]:
print(f"{np.mean(BalAccuracy_no):.4f} ({np.std(BalAccuracy_no):.4f})")

0.6811 (0.0041)


### Logistic Regression L1 Penalty

In [8]:
random_state_LR = 132
LR_l1 = LogisticRegression(penalty = 'l1', 
                           solver = 'saga', 
                           random_state = random_state_LR,
                           fit_intercept = False, 
                           max_iter = 10000, 
                           class_weight = 'balanced')
LR_l1

LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=10000,
                   penalty='l1', random_state=132, solver='saga')

In [9]:
c_penalty = np.logspace(-3, 2, 21)
params_l1 = {'C':c_penalty}
random_states_split = [123, 456, 789, 
                       741, 852, 963, 
                       159, 753, 951, 357]

gridCV_l1 = GridSearchCV(estimator = LR_l1, 
                         param_grid = params_l1, 
                         scoring = 'balanced_accuracy', 
                         n_jobs = -2,
                         refit = True, 
                         cv = 5)


In [10]:
%%time

BalAccuracy_l1 = []

for each_rs in random_states_split:
    start_time = time.time()
    
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs, 
                         stratify = y_all)
    
    gridCV_l1.fit(X_other_temp, y_other_temp)
    
    BalAccuracy_l1.append(gridCV_l1.best_score_)
    
    print(gridCV_l1.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

{'C': 0.03162277660168379}
Iteration 123: 26.326 seconds
{'C': 0.5623413251903491}
Iteration 456: 26.266 seconds
{'C': 0.5623413251903491}
Iteration 789: 26.989 seconds
{'C': 0.1778279410038923}
Iteration 741: 26.615 seconds
{'C': 0.1}
Iteration 852: 26.783 seconds
{'C': 1.0}
Iteration 963: 27.883 seconds
{'C': 0.31622776601683794}
Iteration 159: 28.505 seconds
{'C': 0.1778279410038923}
Iteration 753: 27.126 seconds
{'C': 0.5623413251903491}
Iteration 951: 28.188 seconds
{'C': 0.31622776601683794}
Iteration 357: 27.754 seconds
Wall time: 4min 32s


In [11]:
print(f"{np.mean(BalAccuracy_l1):.4f} ({np.std(BalAccuracy_l1):.4f})")


0.6909 (0.0043)


### Logistic Regression L2 Penalty

In [12]:
random_state_LR = 132
LR_l2 = LogisticRegression(penalty = 'l2', 
                           solver = 'saga', 
                           random_state = random_state_LR,
                           fit_intercept = False, 
                           max_iter = 10000, 
                           class_weight = 'balanced')
LR_l2

LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=10000,
                   random_state=132, solver='saga')

In [13]:
c_penalty = np.logspace(-3, 2, 21)
params_l2 = {'C':c_penalty}
random_states_split = [123, 456, 789, 
                       741, 852, 963, 
                       159, 753, 951, 357]

gridCV_l2 = GridSearchCV(estimator = LR_l2, 
                         param_grid = params_l2, 
                         scoring = 'balanced_accuracy', 
                         n_jobs = -2,
                         refit = True, 
                         cv = 5)

In [14]:
%%time

BalAccuracy_l2 = []

for each_rs in random_states_split:
    start_time = time.time()
    
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs, 
                         stratify = y_all)
    
    gridCV_l2.fit(X_other_temp, y_other_temp)
    
    BalAccuracy_l2.append(gridCV_l2.best_score_)
    
    print(gridCV_l2.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

{'C': 0.005623413251903491}
Iteration 123: 6.904 seconds
{'C': 0.1}
Iteration 456: 7.019 seconds
{'C': 0.1}
Iteration 789: 7.032 seconds
{'C': 0.01778279410038923}
Iteration 741: 6.752 seconds
{'C': 0.03162277660168379}
Iteration 852: 6.493 seconds
{'C': 0.31622776601683794}
Iteration 963: 6.394 seconds
{'C': 0.03162277660168379}
Iteration 159: 6.946 seconds
{'C': 0.01}
Iteration 753: 6.971 seconds
{'C': 0.5623413251903491}
Iteration 951: 7.208 seconds
{'C': 0.1778279410038923}
Iteration 357: 7.082 seconds
Wall time: 1min 8s


In [15]:
print(f"{np.mean(BalAccuracy_l2):.4f} ({np.std(BalAccuracy_l2):.4f})")


0.6891 (0.0052)


### Logistic Regression Elastic Net

In [16]:
random_state_LR = 132
LR_el = LogisticRegression(penalty = 'elasticnet', 
                           solver = 'saga', 
                           random_state = random_state_LR,
                           fit_intercept = False, 
                           max_iter = 10000, 
                           class_weight = 'balanced')
LR_el

LogisticRegression(class_weight='balanced', fit_intercept=False, max_iter=10000,
                   penalty='elasticnet', random_state=132, solver='saga')

In [17]:
c_penalty = np.logspace(-3, 2, 21)
l1_ratio = np.linspace(0.1, 0.9, 9)
params_el = {'C':c_penalty, 
             'l1_ratio':l1_ratio}
random_states_split = [123, 456, 789, 
                       741, 852, 963, 
                       159, 753, 951, 357]

gridCV_el = GridSearchCV(estimator = LR_el, 
                         param_grid = params_el, 
                         scoring = 'balanced_accuracy', 
                         n_jobs = -2,
                         refit = True, 
                         cv = 5)

In [18]:
%%time

BalAccuracy_el = []

for each_rs in random_states_split:
    start_time = time.time()
    
    X_other_temp, X_test_temp, y_other_temp, y_test_temp = \
        train_test_split(X_all, y_all, 
                         test_size = 0.1, 
                         random_state = each_rs, 
                         stratify = y_all)
    
    gridCV_el.fit(X_other_temp, y_other_temp)
    
    BalAccuracy_el.append(gridCV_el.best_score_)
    
    print(gridCV_el.best_params_, flush = True)
    
    end_time = time.time()
    print(f"Iteration {each_rs}: {end_time - start_time:.3f} seconds", flush = True)

{'C': 0.03162277660168379, 'l1_ratio': 0.8}
Iteration 123: 154.971 seconds
{'C': 0.1778279410038923, 'l1_ratio': 0.30000000000000004}
Iteration 456: 152.725 seconds
{'C': 0.1778279410038923, 'l1_ratio': 0.1}
Iteration 789: 153.414 seconds
{'C': 0.01, 'l1_ratio': 0.30000000000000004}
Iteration 741: 147.254 seconds
{'C': 0.1, 'l1_ratio': 0.9}
Iteration 852: 148.345 seconds
{'C': 0.5623413251903491, 'l1_ratio': 0.7000000000000001}
Iteration 963: 148.515 seconds
{'C': 0.05623413251903491, 'l1_ratio': 0.2}
Iteration 159: 156.025 seconds
{'C': 0.1778279410038923, 'l1_ratio': 0.9}
Iteration 753: 153.238 seconds
{'C': 0.31622776601683794, 'l1_ratio': 0.9}
Iteration 951: 154.325 seconds
{'C': 0.1, 'l1_ratio': 0.6}
Iteration 357: 154.398 seconds
Wall time: 25min 23s


In [19]:
BalAccuracy_el

[0.6916709853968059,
 0.6878088528225927,
 0.6863517390925538,
 0.6844716110336937,
 0.6913817733213203,
 0.7009445595816558,
 0.6945675822636019,
 0.6947443836138149,
 0.6911192311780187,
 0.6956824226415487]

In [20]:
print(f"{np.mean(BalAccuracy_el):.4f} ({np.std(BalAccuracy_el):.4f})")


0.6919 (0.0046)


In [21]:
print(f"{np.mean(BalAccuracy_no):.4f} ({np.std(BalAccuracy_no):.4f})")
print(f"{np.mean(BalAccuracy_l1):.4f} ({np.std(BalAccuracy_l1):.4f})")
print(f"{np.mean(BalAccuracy_l2):.4f} ({np.std(BalAccuracy_l2):.4f})")
print(f"{np.mean(BalAccuracy_el):.4f} ({np.std(BalAccuracy_el):.4f})")

0.6811 (0.0041)
0.6909 (0.0043)
0.6891 (0.0052)
0.6919 (0.0046)


In [22]:
import json

logistic_scores = {"no": BalAccuracy_no,
                   "l1": BalAccuracy_l1,
                   "l2": BalAccuracy_l2,
                   "elastic": BalAccuracy_el}

with open("../results/logistic_scores.json", "w") as outfile:
    json.dump(logistic_scores, outfile)
    
# with open("logistic.json", "r") as readfile:
#     dict_data = json.load(readfile)