In [1]:
import numpy as np
import pandas as pd
%matplotlib inline
from matplotlib import pyplot as plt

from sklearn import linear_model
from sklearn import metrics
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix
import pylab 
import scipy.stats as stats
from sklearn.metrics import auc
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [2]:
#USEFUL FUNCTIONS FOR PLOTTING

def plot_cdf(data, votes, bin_edges, ax, xlabel=None, color=None):

    '''
    This function is useful to understand whether in inout feature "data" there is information which is correlated to the
    satisfaction of the users.
    Generally speaking, the meaning of the CDFs is that, if there is a gap between the distributions of the data
    conditioned to the satisfaction class of the corresponding users, it means that the information in the data is
    correlated to users satisfaction and thus can be learnt by a supervised classifier.
    Generally speaking, looking at data distributions is the first step to decide whether some data may be useful or
    not for ML problems.
    :param data: data to be plot (one-dimensional array)
    :param votes: satisfaction labels, int; if already binary, set threshold to None
    :param bin_edges: array of type  np.linspace(min(data), max(data), num_bins+1)
    :param ax: axis of type plt.subplots(figsize=(a,b));
    :param xlabel: label to gice to x axis
    :param color: axis color
    :return:
    '''

    if xlabel is None:
        xlabel = 'your data'
    if color is None:
        color = 'black'

    yt = votes.copy()

    neg, _ = np.histogram(data[yt == +1], bins=bin_edges)  # count number of evidences per bin
    pos, _ = np.histogram(data[yt == 0], bins=bin_edges)

    sumpos =  sum(pos)
    sumneg =  sum(neg)
    pos = pos.astype(float) / sumpos  # normalize to total number of evidences
    neg = neg.astype(float) / sumneg

    xrange = bin_edges[1:] - bin_edges[:1]
    
    title = 'CDF'
    ax.plot(xrange, np.cumsum(pos))
    ax.plot(xrange, np.cumsum(neg))
    ax.xaxis.label.set_color(color)
    ax.yaxis.label.set_color(color)
    ax.tick_params(axis='x', colors=color)
    ax.tick_params(axis='y', colors=color)
    ax.xaxis.grid(True)
    ax.yaxis.grid(True)
    ax.set_xlabel(xlabel)
    ax.set_title(title, color=color)
    ax.legend(['High QoE', 'Low QoE'])
    return

# USEFUL FUNCTIONS FOR PREDICTION

def hyperparameter_tuning(train_sample, train_target, names, classifiers, parameters_grid,
                          n_splits_in=None, ref_metric=None):
    '''
    This function applies a cross validation strategy to select, for each of the classifiers provided in input, 
    the best hyper-parameters (hp) values out of a pool of candidate values (Grid Search Procedure). 
    The function saves on a file the best hp values, for the input Training Fold. Finally, it returns the prediction 
    performance on the input Validation Fold.
    (ref: https://machinelearningmastery.com/hyperparameter-optimization-with-random-search-and-grid-search/)

    :param train_sample: training samples set
    :param train_target: training users satisfaction labels
    :param test_sample: test samples set
    :param test_target: test users satisfaction labels
    :param names: involved classifiers names
    :param classifiers: involved classifiers scikitlearn functions
    :param n_splits_in: number of k fold splits for validation (our results were derived with 10 folds, which is the default value)
    :param ref_metric: optimization metric (sklearn.metrics); default roc_auc
    :return: prediction performance on the test set (AUC)
    '''

    if ref_metric is None:
        ref_metric = 'roc_auc'
    if n_splits_in is None:
        n_splits_in = 2

    best_hp = pd.DataFrame(index = names, columns = ['BestHP_Values'])
    print('Choose Best hyper-parameters through Cross Validation')
    text_file = open('Best_hyper-parameters (HP Tuning).txt', "w") # If this filename already exists in folder, 
    # results will be appended to older file. Delete older version to fill a new txt file.
    text_file.write("############\n")
    for name, clf in zip(names, classifiers):
        text_file.write("{}:\n".format(name))
        print("############")
        print(' Classifier {} - Processing'.format(name))
        grid = parameters_grid[names.index(name)] #take hyper-parameters candidate values grid 
        estimator = model_selection.GridSearchCV(clf, grid, scoring=ref_metric,refit=True,
                                                 cv=n_splits_in).fit(train_sample, train_target) #Grid Search 
        bp = estimator.best_params_
        print(' Best Parameters Values: {}'.format(bp))
        print(list(bp.values()))
        best_hp.at[name,'BestHP_Values'] = list(bp.values())
        text_file.write("{}:\n".format(estimator.best_params_))
        text_file.write("############\n")
        print("############")
    text_file.write("******************\n")
    text_file.close()

    return best_hp
    

def direct_prediction(train_sample, train_target, test_sample, test_target, names, classifiers):
    '''
    This function takes in input a group of classifiers with already fixed HP values, train them on the input data 
    train_sample --> train_target and finally performs prediction on the input test_sample-->test_target. 
    
    Note that each classifier outputs the probability that a given test user belongs to the
    class of Dissatisfied Users. By thresholding such probability, one can effectively assign to the test user 
    either the Satisfied ('0') or the Dissatisfied ('1') label. Computing the FPR and TPR of the classifier for 
    different threshold values, it is possible to draw a ROC Curve.
    Finally, the performance in terms of Area Under the ROC Curve are returned as output.
    (ref: https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-classification-in-python/)
    
    
    :param train_sample: training samples set
    :param train_target: training users satisfaction labels
    :param test_sample: test samples set
    :param test_target: test users satisfaction labels
    :param names: names of the considered classifiers 
    :param classifiers: the scikit methods corresponding to the considered classifiers
    :return: prediction performance (AUC) on the test set
    '''
    
    perf = pd.DataFrame(index=names,columns=['AUC'])
    prediction_proba = np.empty((len(names), len(test_sample)))

    plt.figure(figsize=(20, 5))
    color = ['b', 'r', 'g', 'c', 'k', 'm'] #choose a color for each classifier
    color = color[:len(names)]
    plt.plot([0, 1], [0, 1], linestyle='--', lw=2, color='r', label='Chance', alpha=.3) # ROC Curve of a dummy 
    # Classifier
    for name, clf in zip(names, classifiers):
        print(' Classifier {} - Fit & Predict'.format(name))
        estimator = clf.fit(train_sample, train_target) # fit the classifier on training set

        prediction_proba[names.index(name),:] = estimator.predict_proba(test_sample)[:, 1] # generate, for each test
        # user, the probability that the user is not satisfied

        fpr, tpr, decision_thresholds = metrics.roc_curve(test_target, prediction_proba[names.index(name),:]
                                                          , pos_label=1)

        perf.at[name,'AUC'] = metrics.auc(fpr, tpr)
        plt.plot(fpr, tpr, color=color[names.index(name)], label=r'ROC %s (AUC = %0.3f)' % (name,perf.loc[name,
                                                                                                          'AUC']), lw=2, alpha=.8)
        perf.at[name,'AUC'] = metrics.auc(fpr, tpr)
    
    plt.plot(0, 1, '*', color='k', label=r'Optimum: FPR = 0, TPR = 1', lw=2, alpha=.8, markersize=15)
    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xticks(color='black')
    plt.yticks(color='black')
    plt.grid(1)
    plt.xlabel('False Positive Rate', color='black', fontsize=14)
    plt.ylabel('True Positive Rate', color='black', fontsize=14)
    plt.legend(loc="lower right")
    plt.show()
    #plt.savefig('ROC.png', bbox_inches='tight') #uncomment to save the plot
    return perf


        

# Import Training Data

In [3]:
'''
if 'dataset' in globals():
    del dataset
if 'ground_truth' in globals():
    del ground_truth
'''

path = '../input/basicdataset-training-mrn' # PUT YOUR FILE PATH
basic_train = pd.read_csv(path + '/BasicDataset_Training_MRN.csv')
basic_test = pd.read_csv('../input/basicdataset-test-mrn/BasicDataset_Test_MRN.csv')

dataset = basic_train.drop(['User_Satisfaction','Unnamed: 0', 'Unnamed: 0.1'], axis=1)
testset = basic_test.drop(['User_Satisfaction','Unnamed: 0', 'Unnamed: 0.1'], axis=1)
ground_truth = basic_train.loc[:, 'User_Satisfaction'].copy()
test_truth = basic_test.loc[:, 'User_Satisfaction'].copy()

print('Train Data:', dataset.shape)
print('Train Target:', ground_truth.shape)

print('Test Data:', testset.shape)
print('Train Target:', test_truth.shape)

In [4]:
list(dataset.columns)
dataset.head()

In [5]:
ground_truth.value_counts()
ground_truth.head()

# Data Visualization

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt

df = basic_train.drop(['Unnamed: 0', 'Unnamed: 0.1'], axis=1)
corr = df.corr()
fig, ax = plt.subplots()
fig.set_size_inches(8, 6)
sns.heatmap(corr, annot=True, fmt='.2f', 
            cmap=plt.get_cmap('coolwarm'), cbar=False, ax=ax)
ax.set_yticklabels(ax.get_yticklabels(), rotation="horizontal")
plt.savefig('result.png', bbox_inches='tight', pad_inches=20.1)

In [7]:
# 1 Cumulative_YoutubeSess_LTE_DL_Volume

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
x = dataset.iloc[:,1].copy()
print("number of zeros: {}. %: {}".format(len(x[x==0]),len(x[x==0])/len(x) ))
x[x==0] = 1.000001
tmp_0 = ax.hist(np.log(x), bins=100)
#tmp_0 = ax.hist(x, bins=100)
ax.set_ylabel('Bincount')
ax.xaxis.label.set_color('red')
ax.yaxis.label.set_color('red')
ax.tick_params(axis='x', colors='red')
ax.tick_params(axis='y', colors='red')

In [8]:
# 1 Cumulative_YoutubeSess_LTE_DL_Volume

measurements = np.random.normal(loc = 20, scale = 5, size=100)   
stats.probplot(np.log(x), dist="norm", plot=pylab)
pylab.show()

In [9]:
# 3 

fig, ax = plt.subplots(1, 1, figsize=(20, 5))
x = dataset.iloc[:,5].copy()
print("number of zeros: {}. %: {}".format(len(x[x==0]),len(x[x==0])/len(x) ))
x[x==0] = 1.000001
tmp_0 = ax.hist(x, bins=100)
#tmp_0 = ax.hist(x, bins=100)
ax.set_ylabel('Bincount')
ax.xaxis.label.set_color('red')
ax.yaxis.label.set_color('red')
ax.tick_params(axis='x', colors='red')
ax.tick_params(axis='y', colors='red')

In [11]:
# Python3 code to show Box-cox Transformation
# of non-normal data
 
# import modules
import numpy as np
from scipy import stats
 
# plotting modules
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import boxcox
 
# generate non-normal data (exponential)
original_data= dataset.iloc[:,2].copy()
original_data[original_data == 0] = 1.000001
 
# transform training data & save lambda value
fitted_data, fitted_lambda = stats.boxcox(original_data)
 
# creating axes to draw plots
fig, ax = plt.subplots(1, 2)
 
# plotting the original data(non-normal) and
# fitted data (normal)
sns.distplot(original_data, hist = False, kde = True,
            kde_kws = {'shade': True, 'linewidth': 2},
            label = "Non-Normal", color ="green", ax = ax[0])
 
sns.distplot(fitted_data, hist = False, kde = True,
            kde_kws = {'shade': True, 'linewidth': 2},
            label = "Normal", color ="green", ax = ax[1])
 
# adding legends to the subplots
plt.legend(loc = "upper right")
 
# rescaling the subplots
fig.set_figheight(5)
fig.set_figwidth(10)
 
print(f"Lambda value used for Transformation: {fitted_lambda}")

In [12]:
# let's try to combine most of the features

c = dataset.copy()
c = c.drop(['Cumulative_YoutubeSess_LTE_DL_Time','Cumulative_YoutubeSess_LTE_DL_Volume','Cumulative_Lim_Service_Time_LTE',
            'Cumulative_Lim_Service_Time_UMTS','Cumulative_YoutubeSess_UMTS_DL_Time','Cumulative_Full_Service_Time_UMTS',
            'Cumulative_No_Service_Time_UMTS','Cumulative_Full_Service_Time_LTE','Cumulative_No_Service_Time_LTE'], axis=1)
t = testset.copy()
t = t.drop(['Cumulative_YoutubeSess_LTE_DL_Time','Cumulative_YoutubeSess_LTE_DL_Volume','Cumulative_Lim_Service_Time_LTE',
            'Cumulative_Lim_Service_Time_UMTS','Cumulative_YoutubeSess_UMTS_DL_Time','Cumulative_Full_Service_Time_UMTS',
            'Cumulative_No_Service_Time_UMTS','Cumulative_Full_Service_Time_LTE','Cumulative_No_Service_Time_LTE'], axis=1)

c1 = c.copy()
c2 = c.copy()
c3 = c.copy()
c4 = dataset.copy()

t1 = t.copy()
t2 = t.copy()
t3 = t.copy()
t4 = testset.copy()


x0 = dataset['Cumulative_YoutubeSess_LTE_DL_Time'].copy()
x0[x0==0] = 1.000001
x0a = (np.divide(dataset['Cumulative_YoutubeSess_LTE_DL_Volume'],x0)).copy()
x = dataset['Cumulative_Full_Service_Time_LTE'] + dataset['Cumulative_Full_Service_Time_UMTS']
x1 = dataset['Cumulative_Lim_Service_Time_LTE'] + dataset['Cumulative_Lim_Service_Time_UMTS']
x2 = x + x1
x3 = dataset['Cumulative_No_Service_Time_LTE'] + dataset['Cumulative_No_Service_Time_UMTS']
x4 = dataset['Cumulative_YoutubeSess_LTE_DL_Time'] + dataset['Cumulative_YoutubeSess_UMTS_DL_Time']
x5 = dataset['Cumulative_YoutubeSess_LTE_DL_Volume'] + dataset['Cumulative_YoutubeSess_UMTS_DL_Volume']
x4[x4 == 0] = 1.00001
x6 = x5 / x4
log_x = ['Cumulative_Full_Service_Time_UMTS','Cumulative_No_Service_Time_UMTS','Cumulative_Full_Service_Time_LTE']


t_x0 = testset['Cumulative_YoutubeSess_LTE_DL_Time'].copy()
t_x0[t_x0==0] = 1.000001
t_x0a = (np.divide(testset['Cumulative_YoutubeSess_LTE_DL_Volume'],t_x0)).copy()
t_x = testset['Cumulative_Full_Service_Time_LTE'] + testset['Cumulative_Full_Service_Time_UMTS']
t_x1 = testset['Cumulative_Lim_Service_Time_LTE'] + testset['Cumulative_Lim_Service_Time_UMTS']
t_x2 = t_x + t_x1
t_x3 = testset['Cumulative_No_Service_Time_LTE'] + testset['Cumulative_No_Service_Time_UMTS']
t_x4 = testset['Cumulative_YoutubeSess_LTE_DL_Time'] + testset['Cumulative_YoutubeSess_UMTS_DL_Time']
t_x5 = testset['Cumulative_YoutubeSess_LTE_DL_Volume'] + testset['Cumulative_YoutubeSess_UMTS_DL_Volume']
t_x4[t_x4 == 0] = 1.00001
t_x6 = t_x5 / t_x4


c['Rate_LTE_DL'] = x0a
c['Some_Service'] = x2
c['No_Service'] = x3
t['Rate_LTE_DL'] = t_x0a
t['Some_Service'] = t_x2
t['No_Service'] = t_x3

c_sd = ((c - np.mean(c))/np.std(c)).copy()   
t_sd = ((t - np.mean(c))/np.std(c)).copy()


c1['Rate_LTE_DL'] = x0a
c1['Cumulative_Full_Service_Time'] = x
c1['Cumulative_Lim_Service_Time'] = x1
c1['Cumulative_No_Service_Time'] = x3
t1['Rate_LTE_DL'] = t_x0a
t1['Cumulative_Full_Service_Time'] = t_x
t1['Cumulative_Lim_Service_Time'] = t_x1
t1['Cumulative_No_Service_Time'] = t_x3

c1_sd = ((c1 - np.mean(c1))/np.std(c1)).copy()   
t1_sd = ((t1 - np.mean(c1))/np.std(c1)).copy()


c2['Cumulative_Full_Service_Time'] = x
c2['Cumulative_Lim_Service_Time'] = x1
c2['Cumulative_No_Service_Time'] = x3
c2['Total_Rate'] = x6
c2 = c2.drop(['Cumulative_YoutubeSess_UMTS_DL_Volume'], axis=1)
t2['Cumulative_Full_Service_Time'] = t_x
t2['Cumulative_Lim_Service_Time'] = t_x1
t2['Cumulative_No_Service_Time'] = t_x3
t2['Total_Rate'] = t_x6
t2 = t2.drop(['Cumulative_YoutubeSess_UMTS_DL_Volume'], axis=1)

c2_sd = ((c2 - np.mean(c2))/np.std(c2)).copy()   
t2_sd = ((t2 - np.mean(c2))/np.std(c2)).copy()


c3['Some_Service'] = x2
c3['No_Service'] = x3
c3['Total_Rate'] = x6
c3 = c3.drop(['Cumulative_YoutubeSess_UMTS_DL_Volume'], axis=1)
t3['Some_Service'] = t_x2
t3['No_Service'] = t_x3
t3['Total_Rate'] = t_x6
t3 = t3.drop(['Cumulative_YoutubeSess_UMTS_DL_Volume'], axis=1)

c3_sd = ((c3 - np.mean(c3))/np.std(c3)).copy()   
t3_sd = ((t3 - np.mean(c3))/np.std(c3)).copy()


c4['Rate_LTE_DL'] = x0a
c4 = c4.drop(['Cumulative_YoutubeSess_LTE_DL_Time','Cumulative_YoutubeSess_LTE_DL_Volume',
              'Cumulative_YoutubeSess_UMTS_DL_Time','Cumulative_Full_Service_Time_UMTS','Cumulative_No_Service_Time_UMTS',
              'Cumulative_Full_Service_Time_LTE'], axis=1)
t4['Rate_LTE_DL'] = t_x0a
t4 = t4.drop(['Cumulative_YoutubeSess_LTE_DL_Time','Cumulative_YoutubeSess_LTE_DL_Volume',
              'Cumulative_YoutubeSess_UMTS_DL_Time','Cumulative_Full_Service_Time_UMTS','Cumulative_No_Service_Time_UMTS',
              'Cumulative_Full_Service_Time_LTE'], axis=1)
for s in log_x:
    words = ["log",s]
    words = "_".join(words)
    temp = dataset[s].copy()
    temp2 = testset[s].copy()
    c4[words] = np.log(temp + 1)
    t4[words] = np.log(temp2 + 1)
    
    
c4_sd = ((c4 - np.mean(c4))/np.std(c4)).copy()   
t4_sd = ((t4 - np.mean(c4))/np.std(c4)).copy()



print("c\n\n",c.head())
print("t\n\n",t.head())

print("c1\n\n",c1.head())
print("t1\n\n",t1.head())

print("c2\n\n",c2.head())
print("t2\n\n",t2.head())

print("c3\n\n",c3.head())
print("t3\n\n",t3.head())

print("c4\n\n",c4.head())
print("t4\n\n",t4.head())

print("c4_sd\n\n",c4_sd.head())
print("t4_sd\n\n",t4_sd.head())

In [13]:
df = c4.copy()
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [14]:
# Standardization
df = c4_sd.copy()
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [16]:
# PCA
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = c4_sd.copy()

pca = PCA()
pca.fit(X)

std_slc = StandardScaler()
X_std = std_slc.fit_transform(X)
X_std_pca = pca.fit_transform(X_std) # scores

print("Explained variance ratio: {}".format(pca.explained_variance_ratio_))
s = 0
vec = []

for s1 in pca.explained_variance_ratio_:
    s = s + s1
    vec.append(s)
    
print("Cumulative explained variance ratio: {}".format(vec))
    
# we noticec that the first 8 explain a lot of the variability, so we deleted just two components 
X_std_pca = X_std_pca[:,0:8]
X_std_pca = pd.DataFrame(X_std_pca, columns=['PC1', 'PC2','PC3','PC4','PC5','PC6','PC7','PC8'])
#X_std_pca['Satisfaction'] = ground_truth
print(X_std_pca.head())

loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
loading_matrix = pd.DataFrame(loadings, columns=['PC1', 'PC2','PC3','PC4','PC5','PC6','PC7','PC8','PC9','PC10'])

test_std_pca = np.dot(t4_sd,loading_matrix)
test_std_pca = pd.DataFrame(test_std_pca[:,0:8],columns=X_std_pca.columns)

In [17]:
# Is the dataset balanced?
num_1 = sum(ground_truth[ground_truth == 1])
prop_1 = num_1/len(ground_truth)

num_2 = len(ground_truth) - num_1
prop_2 = num_2/len(ground_truth)
print("number of 1s is: {}, while the number of 0s is: {}".format(num_1,num_2) )
print("proportion of 1s is: {}, while proportion of 0s is: {}".format(prop_1,prop_2) )

#it's slightly unbalanced. We can performe some oversampling techniques 

# Random oversampling to balance the class distribution
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import RandomOverSampler

# summarize class distribution
print(Counter(ground_truth))
# define oversampling strategy
oversample = RandomOverSampler(sampling_strategy='minority')
# fit and apply the transform
c5, ground_truth_5 = oversample.fit_resample(c4, ground_truth)
# summarize class distribution
print(Counter(ground_truth_5))

c5_sd = ((c5 - np.mean(c5))/np.std(c5)).copy()   
t5_sd = ((t4 - np.mean(c5))/np.std(c5)).copy()

In [18]:
# check on the correctness of the scores for the testset
# print(loading_matrix.iloc[:,0])
# print(t_sd.iloc[0,:])
# print(np.dot(loading_matrix.iloc[:,0],t_sd.iloc[0,:])) # first score of the first component
# print(test_std_pca.iloc[0,0])

In [19]:
# to write the new data in a csv file
# X.to_csv (r'dataaa.csv', index = False, header=True)

In [20]:
# Function to calculate True Positive Rate and False Positive Rate

def calc_TP_FP_rate(y_true, y_pred):
    
    # Convert predictions to series with index matching y_true
    y_pred = pd.Series(y_pred, index=y_true.index)
    
    # Instantiate counters
    TP = 0
    FP = 0
    TN = 0
    FN = 0

    # Determine whether each prediction is TP, FP, TN, or FN
    for i in y_true.index: 
        if y_true[i]==y_pred[i]==1:
           TP += 1
        if y_pred[i]==1 and y_true[i]!=y_pred[i]:
           FP += 1
        if y_true[i]==y_pred[i]==0:
           TN += 1
        if y_pred[i]==0 and y_true[i]!=y_pred[i]:
           FN += 1
    
    # Calculate true positive rate and false positive rate
    tpr = TP / (TP + FN)
    fpr = FP / (FP + TN)

    return tpr, fpr

# Test function

In [21]:
# lists for saving models and results
models = []
preds = []
auc_l = []
dataset_l = []
solver_l = []
penalty_l = []
c_l = []

In [22]:
# 0 - CrossValidation with logisticRegression on the original data

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3,random_state=0)
# define search space
space = dict()
space['solver'] = ['liblinear']#,'newton-cg', 'lbfgs']
space['penalty'] = ['l2','l1']#, 'none', 'l2', 'elasticnet']
space['C'] = [  1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000]
#space['class_weight'] = ['balanced','auto']

search = GridSearchCV(model, space, scoring='roc_auc',refit=True, n_jobs=-1,cv=cv)
# execute search
result = search.fit(dataset,ground_truth )
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# results:
# Best Score: 0.6181891168533392
# Best Hyperparameters: {'C': 1000, 'penalty': 'l1', 'solver': 'liblinear'}

In [23]:
# 0b - LogisticRegression on the original data 

# let's use the parameters choosen by the previous cross-validation
model = LogisticRegression(solver='liblinear',penalty='l1',C = 1000 ,random_state=0)
model.fit(dataset, ground_truth)

print("intercept {}, coef {}".format(model.intercept_,model.coef_))

pred = model.predict(testset)
predd = model.score(testset, test_truth)
print("accuracy on prediction: {}".format(predd))
print("Confusion matrix:\n {}".format(confusion_matrix(test_truth, model.predict(testset))))

y_test_probs = model.predict_proba(testset)[:,1]
# Containers for true positive / false positive rates
lr_tp_rates = []
lr_fp_rates = []

# Define probability thresholds to use, between 0 and 1
probability_thresholds = np.linspace(0,1,num=100)

# Find true positive / false positive rate for each threshold
for p in probability_thresholds:
    
    y_test_preds = []
    
    for prob in y_test_probs:
        if prob > p:
            y_test_preds.append(1)
        else:
            y_test_preds.append(0)
            
    tp_rate, fp_rate = calc_TP_FP_rate(test_truth, y_test_preds)
        
    lr_tp_rates.append(tp_rate)
    lr_fp_rates.append(fp_rate)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(lr_fp_rates, lr_tp_rates, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend();

print(f'Logistic Regression (No reg.) AUC {auc(lr_fp_rates, lr_tp_rates)}')
models.append(model)
preds.append(predd)
auc_l.append(auc(lr_fp_rates, lr_tp_rates))
dataset_l.append("dataset")
solver_l.append("liblinear")
penalty_l.append("l1")
c_l.append(1000)

# results:
# intercept [-0.69278593], coef [[ 6.24708093e-05 -1.61327154e-07  1.27366935e-04 -7.22990659e-07
#   -8.04987193e-02 -6.70831241e-03 -9.31903442e-06  8.14338374e-05
#    3.29723019e-05 -1.93030926e-06  5.35052302e-05  9.52176101e-06]]
# accuracy on prediction: 0.6797385620915033
# Confusion matrix:
#  [[3113  108]
#  [1411  111]]
# Logistic Regression (No reg.) AUC 0.6256044535266877

In [24]:
# 0c - CrossValidation with logisticRegression on c_sd

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
# define search space
space = dict()
space['solver'] = ['liblinear','saga']#,'newton-cg', 'lbfgs']
space['penalty'] = ['l2','l1']#, 'none', 'l2', 'elasticnet']
space['C'] = [  1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1000]
#space['class_weight'] = ['balanced','auto']

search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)
# execute search
result = search.fit(c_sd,ground_truth )
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# results:
# Best Score: 0.5837018758075153
# Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}

In [25]:
# 0d - LogisticRegression on c_sd

# let's use the parameters choosen by the previous cross-validation
model = LogisticRegression(solver='liblinear',penalty='l2',C = 0.001, random_state=0)
model.fit(c_sd, ground_truth)

print("intercept {}, coef {}".format(model.intercept_,model.coef_))

pred = model.predict(t_sd)
predd = model.score(t_sd, test_truth)
print("accuracy on prediction: {}".format(predd))
print("Confusion matrix:\n {}".format(confusion_matrix(test_truth, model.predict(t_sd))))

y_test_probs = model.predict_proba(t_sd)[:,1]
# Containers for true positive / false positive rates
lr_tp_rates = []
lr_fp_rates = []

# Define probability thresholds to use, between 0 and 1
probability_thresholds = np.linspace(0,1,num=100)

# Find true positive / false positive rate for each threshold
for p in probability_thresholds:
    
    y_test_preds = []
    
    for prob in y_test_probs:
        if prob > p:
            y_test_preds.append(1)
        else:
            y_test_preds.append(0)
            
    tp_rate, fp_rate = calc_TP_FP_rate(test_truth, y_test_preds)
        
    lr_tp_rates.append(tp_rate)
    lr_fp_rates.append(fp_rate)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(lr_fp_rates, lr_tp_rates, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend();

print(f'Logistic Regression (No reg.) AUC {auc(lr_fp_rates, lr_tp_rates)}')
models.append(model)
preds.append(predd)
auc_l.append(auc(lr_fp_rates, lr_tp_rates))
dataset_l.append("c_sd")
solver_l.append("liblinear")
penalty_l.append("l2")
c_l.append(0.001)

# results:
# intercept [-0.5885437], coef [[-0.01479649 -0.09051643 -0.09862094 -0.06723762 -0.17436779  0.0351862 ]]
# accuracy on prediction: 0.6810035842293907
# Confusion matrix:
#  [[3212    9]
#  [1504   18]]
# Logistic Regression (No reg.) AUC 0.597905050667413

In [26]:
# 1 - CrossValidation with logisticRegression on c1_sd

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
# define search space
space = dict()
space['solver'] = ['liblinear','saga']#,'newton-cg', 'lbfgs']
space['penalty'] = ['l2','l1']#, 'none', 'l2', 'elasticnet']
space['C'] = [  1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1000]
#space['class_weight'] = ['balanced','auto']

search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)
# execute search
result = search.fit(c1_sd,ground_truth )
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# results:
# Best Score: 0.6055966218461358
# Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'saga'}

In [27]:
# 1a - LogisticRegression on c1_sd

# let's use the parameters choosen by the previous cross-validation
model = LogisticRegression(solver='saga', penalty='l2',C = 0.001 ,random_state=0)
model.fit(c1_sd, ground_truth)

print("intercept {}, coef {}".format(model.intercept_,model.coef_))

pred = model.predict(t1_sd)
predd = model.score(t1_sd, test_truth)
print("accuracy on prediction: {}".format(predd))
print("Confusion matrix:\n {}".format(confusion_matrix(test_truth, model.predict(t1_sd))))

y_test_probs = model.predict_proba(t1_sd)[:,1]
# Containers for true positive / false positive rates
lr_tp_rates = []
lr_fp_rates = []

# Define probability thresholds to use, between 0 and 1
probability_thresholds = np.linspace(0,1,num=100)

# Find true positive / false positive rate for each threshold
for p in probability_thresholds:
    
    y_test_preds = []
    
    for prob in y_test_probs:
        if prob > p:
            y_test_preds.append(1)
        else:
            y_test_preds.append(0)
            
    tp_rate, fp_rate = calc_TP_FP_rate(test_truth, y_test_preds)
        
    lr_tp_rates.append(tp_rate)
    lr_fp_rates.append(fp_rate)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(lr_fp_rates, lr_tp_rates, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend();

print('Logistic Regression (No reg.) AUC {}'.format(auc(lr_fp_rates, lr_tp_rates)))
models.append(model)
preds.append(predd)
auc_l.append(auc(lr_fp_rates, lr_tp_rates))
dataset_l.append("c1_sd")
solver_l.append("saga")
penalty_l.append("l2")
c_l.append(0.001)

# results: 
# intercept [-0.73311552], coef [[-0.0148945  -0.09430394 -0.10359717 -0.07015207 -0.18855392  0.16246657
#    0.03656921]]
# accuracy on prediction: 0.6791060510225596
# Confusion matrix:
#  [[3198   23]
#  [1499   23]]
# logistic Regression (No reg.) AUC 0.6187158353463086

In [28]:
# 2 - CrossValidation with logisticRegression on c2_sd

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
# define search space
space = dict()
space['solver'] = ['liblinear','saga']#,'newton-cg', 'lbfgs']
space['penalty'] = ['l2','l1']#, 'none', 'l2', 'elasticnet']
space['C'] = [  1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1000]
#space['class_weight'] = ['balanced','auto']

search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)
# execute search
result = search.fit(c2_sd,ground_truth )
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# results:
# Best Score: 0.6065395800274344
# Best Hyperparameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}

In [29]:
# 2a - LogisticRegression on c2_sd

# let's use the parameters choosen by the previous cross-validation
model = LogisticRegression(solver='saga', penalty='l2',C = 0.01, random_state=0)
model.fit(c2_sd, ground_truth)

print("intercept {}, coef {}".format(model.intercept_,model.coef_))

pred = model.predict(t2_sd)
predd = model.score(t2_sd, test_truth)
print("accuracy on prediction: {}".format(predd))
print("Confusion matrix:\n {}".format(confusion_matrix(test_truth, model.predict(t2_sd))))

y_test_probs = model.predict_proba(t2_sd)[:,1]
# Containers for true positive / false positive rates
lr_tp_rates = []
lr_fp_rates = []

# Define probability thresholds to use, between 0 and 1
probability_thresholds = np.linspace(0,1,num=100)

# Find true positive / false positive rate for each threshold
for p in probability_thresholds:
    
    y_test_preds = []
    
    for prob in y_test_probs:
        if prob > p:
            y_test_preds.append(1)
        else:
            y_test_preds.append(0)
            
    tp_rate, fp_rate = calc_TP_FP_rate(test_truth, y_test_preds)
        
    lr_tp_rates.append(tp_rate)
    lr_fp_rates.append(fp_rate)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(lr_fp_rates, lr_tp_rates, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend();

print(f'Logistic Regression (No reg.) AUC {auc(lr_fp_rates, lr_tp_rates)}')
models.append(model)
preds.append(predd)
auc_l.append(auc(lr_fp_rates, lr_tp_rates))
dataset_l.append("c2_sd")
solver_l.append("saga")
penalty_l.append("l2")
c_l.append(0.01)

# results:
# intercept [-0.7407721], coef [[-0.12778345 -0.13892725 -0.23089109  0.199131    0.04443668 -0.10084862]]
# accuracy on prediction: 0.6793168880455408
# Confusion matrix:
#  [[3160   61]
#  [1460   62]]
# Logistic Regression (No reg.) AUC 0.61672852800344

In [30]:
# 3 - CrossValidation with logisticRegression on c3_sd

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
# define search space
space = dict()
space['solver'] = ['liblinear','saga']#,'newton-cg', 'lbfgs']
space['penalty'] = ['l2','l1']#, 'none', 'l2', 'elasticnet']
space['C'] = [  1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1000]
#space['class_weight'] = ['balanced','auto']

search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)
# execute search
result = search.fit(c3_sd,ground_truth )
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# results:
# Best Score: 0.5850336573664553
# Best Hyperparameters: {'C': 0.01, 'penalty': 'l2', 'solver': 'saga'}


In [31]:
# 3a - LogisticRegression on c3_sd

# let's use the parameters choosen by the previous cross-validation
model = LogisticRegression(solver='liblinear', penalty='l2',C = 0.01, random_state=0)
model.fit(c3_sd, ground_truth)

print("intercept {}, coef {}".format(model.intercept_,model.coef_))

pred = model.predict(t3_sd)
predd = model.score(t3_sd, test_truth)
print("accuracy on prediction: {}".format(predd))
print("Confusion matrix:\n {}".format(confusion_matrix(test_truth, model.predict(t3_sd))))

y_test_probs = model.predict_proba(t3_sd)[:,1]
# Containers for true positive / false positive rates
lr_tp_rates = []
lr_fp_rates = []

# Define probability thresholds to use, between 0 and 1
probability_thresholds = np.linspace(0,1,num=100)

# Find true positive / false positive rate for each threshold
for p in probability_thresholds:
    
    y_test_preds = []
    
    for prob in y_test_probs:
        if prob > p:
            y_test_preds.append(1)
        else:
            y_test_preds.append(0)
            
    tp_rate, fp_rate = calc_TP_FP_rate(test_truth, y_test_preds)
        
    lr_tp_rates.append(tp_rate)
    lr_fp_rates.append(fp_rate)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(lr_fp_rates, lr_tp_rates, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend();

print(f'Logistic Regression (No reg.) AUC {auc(lr_fp_rates, lr_tp_rates)}')
models.append(model)
preds.append(predd)
auc_l.append(auc(lr_fp_rates, lr_tp_rates))
dataset_l.append("c3_sd")
solver_l.append("liblinear")
penalty_l.append("l2")
c_l.append(0.01)

# results:
# intercept [-0.71593362], coef [[-0.12489584 -0.13473968 -0.21839289  0.04386749 -0.09953964]]
# accuracy on prediction: 0.6805819101834282
# Confusion matrix:
#  [[3206   15]
#  [1500   22]]
# Logistic Regression (No reg.) AUC 0.5976721017338172

In [32]:
# 4 - CrossValidation with logisticRegression on c4_sd

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
# define search space
space = dict()
space['solver'] = ['liblinear','saga']#,'newton-cg', 'lbfgs']
space['penalty'] = ['l2','l1']#, 'none', 'l2', 'elasticnet']
space['C'] = [  1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1000]
#space['class_weight'] = ['balanced','auto']

search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)
# execute search
result = search.fit(c4_sd,ground_truth )
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# results:
# Best Score: 0.6289946077164603
# Best Hyperparameters: {'C': 0.001, 'penalty': 'l2', 'solver': 'liblinear'}

In [33]:
# 4a - LogisticRegression on c4_sd

# let's use the parameters choosen by the previous cross-validation
model = LogisticRegression(solver='liblinear', penalty='l2',C = 0.001, random_state=0)
model.fit(c4_sd, ground_truth)

print("intercept {}, coef {}".format(model.intercept_,model.coef_))

pred = model.predict(t4_sd)
predd = model.score(t4_sd, test_truth)
print("accuracy on prediction: {}".format(predd))
print("Confusion matrix:\n {}".format(confusion_matrix(test_truth, model.predict(t4_sd))))

y_test_probs = model.predict_proba(t4_sd)[:,1]
# Containers for true positive / false positive rates
lr_tp_rates = []
lr_fp_rates = []

# Define probability thresholds to use, between 0 and 1
probability_thresholds = np.linspace(0,1,num=100)

# Find true positive / false positive rate for each threshold
for p in probability_thresholds:
    
    y_test_preds = []
    
    for prob in y_test_probs:
        if prob > p:
            y_test_preds.append(1)
        else:
            y_test_preds.append(0)
            
    tp_rate, fp_rate = calc_TP_FP_rate(test_truth, y_test_preds)
        
    lr_tp_rates.append(tp_rate)
    lr_fp_rates.append(fp_rate)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(lr_fp_rates, lr_tp_rates, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend();

print(f'Logistic Regression (No reg.) AUC {auc(lr_fp_rates, lr_tp_rates)}')
models.append(model)
preds.append(predd)
auc_l.append(auc(lr_fp_rates, lr_tp_rates))
dataset_l.append("c4_sd")
solver_l.append("liblinear")
penalty_l.append("l2")
c_l.append(0.001)

# results:
# intercept [-0.59304071], coef [[-0.01496925 -0.09180171 -0.09934159  0.14442221  0.068932    0.02632641
#   -0.0684977  -0.25355605  0.03641513 -0.10105887]]
# accuracy on prediction: 0.6883828800337339
# Confusion matrix:
#  [[3111  110]
#  [1368  154]]
# Logistic Regression (No reg.) AUC 0.6361301960157165


In [34]:
# 5 - CrossValidation with logisticRegression on X_std_pca

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
# define search space
space = dict()
space['solver'] = ['liblinear','saga']#,'newton-cg', 'lbfgs']
space['penalty'] = ['l2','l1']#, 'none', 'l2', 'elasticnet']
space['C'] = [  1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1000]
#space['class_weight'] = ['balanced','auto']

search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)
# execute search
result = search.fit(X_std_pca,ground_truth )
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# results:
# Best Score: 0.6124305265311364
# Best Hyperparameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}

In [35]:
# 5a - LogisticRegression on X_std_pca

# let's use the parameters choosen by the previous cross-validation
model = LogisticRegression(solver='saga', penalty='l1',C = 0.1, random_state=0)
model.fit(X_std_pca, ground_truth)

print("intercept {}, coef {}".format(model.intercept_,model.coef_))

pred = model.predict(test_std_pca)
predd = model.score(test_std_pca, test_truth)
print("accuracy on prediction: {}".format(predd))
# acc = np.count_nonzero(pred == test_truth)/len(test_truth)
# print("accuracy on prediction: {}".format(acc))
print("accuracy on prediction: {}".format(model.score(test_std_pca, test_truth)))
print("Confusion matrix:\n {}".format(confusion_matrix(test_truth, model.predict(test_std_pca))))

y_test_probs = model.predict_proba(test_std_pca)[:,1]
# Containers for true positive / false positive rates
lr_tp_rates = []
lr_fp_rates = []

# Define probability thresholds to use, between 0 and 1
probability_thresholds = np.linspace(0,1,num=100)

# Find true positive / false positive rate for each threshold
for p in probability_thresholds:
    
    y_test_preds = []
    
    for prob in y_test_probs:
        if prob > p:
            y_test_preds.append(1)
        else:
            y_test_preds.append(0)
            
    tp_rate, fp_rate = calc_TP_FP_rate(test_truth, y_test_preds)
        
    lr_tp_rates.append(tp_rate)
    lr_fp_rates.append(fp_rate)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(lr_fp_rates, lr_tp_rates, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend();

print(f'Logistic Regression (No reg.) AUC {auc(lr_fp_rates, lr_tp_rates)}')
models.append(model)
preds.append(predd)
auc_l.append(auc(lr_fp_rates, lr_tp_rates))
dataset_l.append("X_std_pca")
solver_l.append("saga")
penalty_l.append("l1")
c_l.append(0.1)


# results:
# intercept [-0.74053783], coef [[ 0.         -0.01744303 -0.07252896 -0.223862    0.25787507 -0.06524403
#    0.15524025  0.0866031 ]]
# accuracy on prediction: 0.6866961838498841
# Confusion matrix:
#  [[3134   87]
#  [1399  123]]
# Logistic Regression (No reg.) AUC 0.6074691138679681

In [36]:
# 6 - CrossValidation with logisticRegression on c5_sd

# define model
model = LogisticRegression()
# define evaluation
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=0)
# define search space
space = dict()
space['solver'] = ['liblinear','saga']#,'newton-cg', 'lbfgs']
space['penalty'] = ['l2','l1']#, 'none', 'l2', 'elasticnet']
space['C'] = [  1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100,1000]
#space['class_weight'] = ['balanced','auto']

search = GridSearchCV(model, space, scoring='roc_auc', n_jobs=-1, cv=cv)
# execute search
result = search.fit(c5_sd,ground_truth_5 )
# summarize result
print('Best Score: %s' % result.best_score_)
print('Best Hyperparameters: %s' % result.best_params_)

# results:
# Best Score: 0.6308096004523736
# Best Hyperparameters: {'C': 0.1, 'penalty': 'l1', 'solver': 'saga'}

In [37]:
# 6a - LogisticRegression on c5_sd

# let's use the parameters choosen by the previous cross-validation
model = LogisticRegression(solver='saga', penalty='l1',C = 0.1, random_state=0)
model.fit(c5_sd, ground_truth_5)

print("intercept {}, coef {}".format(model.intercept_,model.coef_))

pred = model.predict(t5_sd)
predd = model.score(t5_sd, test_truth)
print("accuracy on prediction: {}".format(predd))
print("Confusion matrix:\n {}".format(confusion_matrix(test_truth, model.predict(t5_sd))))

y_test_probs = model.predict_proba(t5_sd)[:,1]
# Containers for true positive / false positive rates
lr_tp_rates = []
lr_fp_rates = []

# Define probability thresholds to use, between 0 and 1
probability_thresholds = np.linspace(0,1,num=100)

# Find true positive / false positive rate for each threshold
for p in probability_thresholds:
    
    y_test_preds = []
    
    for prob in y_test_probs:
        if prob > p:
            y_test_preds.append(1)
        else:
            y_test_preds.append(0)
            
    tp_rate, fp_rate = calc_TP_FP_rate(test_truth, y_test_preds)
        
    lr_tp_rates.append(tp_rate)
    lr_fp_rates.append(fp_rate)

fig, ax = plt.subplots(figsize=(6,6))
ax.plot(lr_fp_rates, lr_tp_rates, label='Logistic Regression')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.legend();

print(f'Logistic Regression (No reg.) AUC {auc(lr_fp_rates, lr_tp_rates)}')
models.append(model)
preds.append(predd)
auc_l.append(auc(lr_fp_rates, lr_tp_rates))
dataset_l.append("c5_sd")
solver_l.append("saga")
penalty_l.append("l1")
c_l.append(0.1)

# results:
# intercept [0.0097279], coef [[-0.02142671 -0.13389792 -0.13818234  0.19113956  0.09035076  0.03315708
#   -0.09103414 -0.36718629  0.04736628 -0.14834248]]
# accuracy on prediction: 0.6088973223698081
# Confusion matrix:
#  [[2044 1177]
#  [ 678  844]]
# Logistic Regression (No reg.) AUC 0.6368032389285002

In [38]:
print(preds)
print(auc_l)
print(dataset_l)
print(solver_l)
print(penalty_l)
print(c_l)
grid_data = pd.DataFrame({'dataset_name':dataset_l,'Predictions':preds,
                          'Auc':auc_l,'Solver':solver_l,'Penalty':penalty_l,'C':c_l})
grid_data
# [0.6824794433902593, 0.6814252582753532, 0.6791060510225596, 0.6791060510225596, 0.6810035842293907, 0.6879612059877714, 0.687539531941809]
# [0.6193000435300371, 0.5955459837523218, 0.5330532506575402, 0.5328790489156043, 0.5401594170320347, 0.6337366559221861, 0.6053417719866464]
# ['dataset', 'c_sd', 'c1_sd', 'c2_sd', 'c3_sd', 'c4_sd', 'X_std_pca']
# ['liblinear', 'liblinear', 'saga', 'saga', 'liblinear', 'liblinear', 'liblinear']
# ['l1', 'l2', 'l2', 'l2', 'l2', 'l1', 'l1']
# [0.001, 0.0001, 1e-05, 1e-05, 1e-05, 0.01, 0.01]