# 4. ElasticNet

**Import packages**

In [1]:
import os
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler

import matplotlib.pyplot as plt
%matplotlib inline

**Set directories**

In [2]:
print(os.getcwd())
dirRawData = "../RawData/"
dirPData = "../PData/"
dirPOutput = "../POutput/"

C:\Users\munch\Documents\Cass MSc\Term 3\Machine Learning\Coursework\PCode


**Load data**

In [3]:
f_name = dirPData + '02_df.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
    
df_all_onehot = dict_['df_all_onehot']

del f_name, dict_


f_name = dirPData + '02_vars.pickle'
with open(f_name, "rb") as f:
    dict_ = pickle.load(f)
    
vars_ind_num = dict_['vars_ind_num']
var_dep = dict_['var_dep']
vars_ind_categorical = dict_['vars_ind_categorical']
vars_ind_onehot = dict_['vars_ind_onehot']

del f_name, dict_

**Split data into training, validation and test folds**

In [4]:
idx_train  = np.where(df_all_onehot['fold'].isin(np.arange(0,8)))[0] #[0,1,2,3,4,5,6,7]
idx_val    = np.where(df_all_onehot['fold'].isin([8,9]))[0] #[8,9]
idx_design = np.where(df_all_onehot['fold'].isin(np.arange(0,10)))[0] #[0,1,2,3,4,5,6,7,8,9]
idx_test = np.where(df_all_onehot['fold'].isin([10]))[0]

In [5]:
vars_ind = vars_ind_num + vars_ind_onehot

In [6]:
#prepare x and y data
x = df_all_onehot[vars_ind].values
y = df_all_onehot[var_dep].values

y_train = y[idx_train]
y_val = y[idx_val]
y_design = y[idx_design]

In [7]:
#Standardize the data
from sklearn.preprocessing import StandardScaler

standardScaler_ = StandardScaler()
standardScaler_.fit(x[idx_train])

X_train = standardScaler_.transform(x[idx_train])
X_val   = standardScaler_.transform(x[idx_val])
X_test = standardScaler_.transform(x[idx_test])
X_design = standardScaler_.transform(x[idx_design])

**Elastic Net using sklearn SGDClassifier**

We use SGDClassifier as it allows us to change l1_ratio and alpha. 
l1_ratio = 0 corresponds to L2 penalty, and l1_ratio = 1 corresponds to L1 penalty. 

We begin with the default l1_ratio of 0.15

In [8]:
from sklearn.linear_model import SGDClassifier
EN = SGDClassifier(loss='log',penalty='elasticnet') #use elastic net with log-loss function
EN.fit(X_train,np.ravel(y_train)) #fit model

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='log', max_iter=1000,
              n_iter_no_change=5, n_jobs=None, penalty='elasticnet',
              power_t=0.5, random_state=None, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

In [9]:
# prediction (probabilities)
EN__pred_train_prob = EN.predict_proba(X_train)
EN__pred_val_prob   = EN.predict_proba(X_val)
EN__pred_test_prob = EN.predict_proba(X_test)

#prediction (0 or 1)
EN__pred_train = EN.predict(X_train)
EN__pred_val   = EN.predict(X_val)
EN__pred_test = EN.predict(X_test)

In [10]:
#print accuracy of the training and validation
train_score = EN.score(X_train,y_train)
val_score = EN.score(X_val, y_val)
print("train score is {}".format(train_score))
print("val score is {}".format(val_score))

train score is 0.8999895930898116
val score is 0.8865278368040799


In [11]:
#print roc_auc_score
from sklearn.metrics import roc_auc_score
auc_train = roc_auc_score(y_train,EN__pred_train)
auc_val = roc_auc_score(y_val,EN__pred_val)
print("train auc: {}".format(auc_train))
print("val auc: {}".format(auc_val))

train auc: 0.6409818703121701
val auc: 0.5969511475796622


In [17]:
from sklearn.model_selection import cross_val_score
result_roc_auc = cross_val_score(EN, X_design, np.ravel(y_design), cv=10, scoring='roc_auc')
result_accuracy = cross_val_score(EN, X_design, np.ravel(y_design), cv=10, scoring='accuracy')
print("average score using 10 fold cross validation on design data: {}".format(result_accuracy.mean()))
print("average roc_auc score using 10 fold cross validation on design data: {}".format(result_roc_auc.mean()))

average score using 10 fold cross validation on design data: 0.9051999594523268
average roc_auc score using 10 fold cross validation on design data: 0.7905287291911214


**Grid Search for Elastic Net**

In [26]:
import time
from sklearn.model_selection import GridSearchCV

start_time = time.time()
alpha = [0.001,0.01,0.1]
l1_ratio = [0.15,0.25,0.35]
param_grid = dict(alpha = alpha,l1_ratio = l1_ratio)
grid = GridSearchCV(estimator=EN, param_grid=param_grid, cv = 10, n_jobs=-1, scoring='roc_auc')

In [27]:
grid_result = grid.fit(X_design,np.ravel(y_design))
# Summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print("Execution time: " + str((time.time() - start_time)) + ' ms')

Best: 0.836468 using {'alpha': 0.01, 'l1_ratio': 0.25}
Execution time: 214.9397165775299 ms


Unfortunately the best accuracy score was not higher than the optimized logistic regression (l1) which was 0.85, thus we do not expect its kaggle score to be better. Its kaggle score was only 0.85632

In [22]:
#use optimized  EN to predict test data
ENopt_= grid.predict(X_test)
ENopt_prob = grid.predict_proba(X_test)

**Save prediction**

In [25]:
import csv
f_name = dirPOutput + 'optimizeelasticnet.csv'

df_test = pd.read_csv(dirRawData + 'test.csv')

with open(f_name, 'w',newline='') as csvfile:
    writer=csv.writer(csvfile,delimiter=',')
    writer.writerow(["id", "target"])
    writer.writerows(zip(df_test[df_test.columns[0]], ENopt_prob[:,1]))

In [32]:
ENopt_prob_design = grid.predict_proba(X_design)
dict_ = {'ENopt_prob': ENopt_prob_design,
        'ENopt_prob_test':ENopt_prob}

f_name = dirPData + 'EN.pickle'
with open(f_name, "wb") as f:
    pickle.dump(dict_, f)
del f_name