resources:
https://stackabuse.com/one-hot-encoding-in-python-with-pandas-and-scikit-learn/

In [22]:
%config InlineBackend.figure_format = 'retina'
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Format Data

In [23]:
header = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status', 
         'occupation', 'relationship', 'race', 'sex', 'capital_gain', 'capital_loss',
         'hours_per_week', 'native_country', 'y_both']
adult = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', names=header)
adult = adult.reset_index(drop=True)
adult.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,y_both
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


### One-Hot-encode data

In [24]:
# set up new dataframes of each variable, 
# and one-hot-encode the necessary variables using dummy variables
age_oh = adult['age']
workclass_oh = pd.get_dummies(adult.workclass, prefix='workclass')
fnlwgt_oh = adult['fnlwgt']
education_oh = pd.get_dummies(adult.education, prefix='education')
education_num_oh = adult['education_num']
marital_status_oh = pd.get_dummies(adult.marital_status, prefix='marital_status')
occupation_oh = pd.get_dummies(adult.occupation, prefix='occupation')
relationship_oh = pd.get_dummies(adult.relationship, prefix='relationship')
race_oh = pd.get_dummies(adult.race, prefix='race')
sex_oh = pd.get_dummies(adult.sex, prefix='sex')
capital_gain_oh = adult['capital_gain']
capital_loss_oh = adult['capital_loss']
hours_per_week_oh = adult['hours_per_week']
native_country_oh = pd.get_dummies(adult.native_country, prefix='native_country')
y_oh = pd.get_dummies(adult.y_both, prefix='y')

y_oh.head()

Unnamed: 0,y_ <=50K,y_ >50K
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [28]:
var_list = [age_oh, workclass_oh, fnlwgt_oh, education_oh, education_num_oh, marital_status_oh, 
         occupation_oh, relationship_oh, race_oh, sex_oh, capital_gain_oh, capital_loss_oh,
         hours_per_week_oh, native_country_oh, y_oh['y_ >50K']]
adult_oh = pd.concat(var_list, axis=1).reindex(age_oh.index)
adult_oh = adult_oh.rename(columns={'y_ >50K': 'Y'})
print(adult_oh.shape)
adult_oh.head()

(32561, 109)


Unnamed: 0,age,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,Y
0,39,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
1,50,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,38,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,53,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,28,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [44]:
adult_oh.to_csv('adult_oh.csv', index=False)

### Create 5 Trials - Training and Testing Sets

In [31]:
from sklearn.model_selection import train_test_split

XY_train1, XY_test1 = train_test_split(adult_oh, train_size=5000, shuffle=True)
XY_train2, XY_test2 = train_test_split(adult_oh, train_size=5000, shuffle=True)
XY_train3, XY_test3 = train_test_split(adult_oh, train_size=5000, shuffle=True)
XY_train4, XY_test4 = train_test_split(adult_oh, train_size=5000, shuffle=True)
XY_train5, XY_test5 = train_test_split(adult_oh, train_size=5000, shuffle=True)

In [32]:
print(XY_train1.shape)
print(XY_test1.shape)
XY_train1.head()

(5000, 109)
(27561, 109)


Unnamed: 0,age,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,Y
16708,49,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4332,36,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
14689,72,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
11860,38,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
12983,27,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


## Run GridSearchCV on classifier

In [33]:
%%time
#import warnings
# there are a lot of convergence warnings for some params, however be careful with this!!
# sometimes you need to see those wanrings, and now we've screwed tha tup for the whole notebook from here on!!
#warnings.filterwarnings('ignore')

from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression 
import numpy as np
from sklearn.model_selection import GridSearchCV

# Create a pipeline - RF is a stand in, we will populate the classifier part below
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', LogisticRegression())])

# Create search space of candidate learning algorithms and their hyperparameters
# note lbfgs can't do l1, and if you pass penalty='none' it expects no C value
search_space = [{'classifier': [LogisticRegression(max_iter=5000)],
                 'classifier__solver': ['saga'],
                 'classifier__penalty': ['l1', 'l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=5000)],
                 'classifier__solver': ['lbfgs'],
                 'classifier__penalty': ['l2'],
                 'classifier__C': np.logspace(-4, 4, 9)},
                {'classifier': [LogisticRegression(max_iter=5000)],
                 'classifier__solver': ['lbfgs','saga'],
                 'classifier__penalty': ['none']}
                ]

trialnum = 0
accuracy_sum = 0 # sum of top accuracy to later calculate the average of all 5 trials
roc_sum = 0 # sum of top roc score to later calculate the average of all 5 trials
f1_sum = 0 # sum of top accuracy to later calculate the average of all 5 trials
accuracy_scores = []
roc_scores = []
f1_scores = []
all_accuracy_models = []
all_roc_models = []
all_f1_models = []

# for every trial
for trial in [XY_train1, XY_train2, XY_train3, XY_train4, XY_train5]:
 
    trialnum = trialnum + 1
    X_l = trial.drop(['Y'],1)
    y_l = trial['Y']
    
    # Create grid search 
    clf = GridSearchCV(pipe, search_space, cv=StratifiedKFold(n_splits=5), 
                       scoring=['accuracy', 'roc_auc_ovr', 'f1_micro'], refit=False,
                       verbose=0)

    # Fit grid search
    best_model = clf.fit(X_l, y_l)
    
    print("------------------------------------------------------------------------------------")
    print("RESULTS FOR TRIAL:")
    print(trialnum)
    print("------------------------------------------------------------------------------------")
    
    # the detailed results of the whole model selection search...
#     print(best_model.cv_results_)

    print("---------------BEST MODEL FOR ACCURACY: ----------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ] )
    print("---WITH ACCURACY: ---")
    current_accuracy = best_model.cv_results_['mean_test_accuracy'][ np.argmax(best_model.cv_results_['mean_test_accuracy']) ]
    print(current_accuracy)
    accuracy_sum = accuracy_sum + current_accuracy
    accuracy_scores.append(current_accuracy)
    all_accuracy_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_accuracy']) ] )
    
    print("---------------BEST MODEL FOR ROC: ---------------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr']) ] )
    print("---WITH ROC: ---")
    current_roc = best_model.cv_results_['mean_test_roc_auc_ovr'][ np.argmax(best_model.cv_results_['mean_test_roc_auc_ovr']) ]
    print(current_roc)
    roc_sum = roc_sum + current_roc
    roc_scores.append(current_roc)
    all_roc_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_roc_auc_ovr']) ] )

    
    print("---------------BEST MODEL FOR F1: ----------------")
    print( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro']) ] )
    print("---WITH F1: ---")    
    current_f1 = best_model.cv_results_['mean_test_f1_micro'][ np.argmax(best_model.cv_results_['mean_test_f1_micro']) ]
    print(current_f1)
    f1_sum = f1_sum + current_f1
    f1_scores.append(current_f1)
    all_f1_models.append( best_model.cv_results_['params'][ np.argmin(best_model.cv_results_['rank_test_f1_micro']) ] )

    
    # below (optional): check that the above outputs actually show best scores

#     print results just to check alignment with the above
#     results = pd.DataFrame( best_model.cv_results_['params'] ) # parameter settings for best model
#     grab the accuracy score resulting from those parameters
#     results['score_acc'] = best_model.cv_results_['mean_test_accuracy']
#     results['score_roc'] = best_model.cv_results_['mean_test_roc_auc_ovr']
#     results['score_f1'] = best_model.cv_results_['mean_test_f1_micro']
#     get rid of classifier__XX in columns
#     cols = results.columns.to_series().str.split('__').apply(lambda x: x[-1])
#     results.columns = cols
#     print(results)


------------------------------------------------------------------------------------
RESULTS FOR TRIAL:
1
------------------------------------------------------------------------------------
---------------BEST MODEL FOR ACCURACY: ----------
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
---WITH ACCURACY: ---
0.8476000000000001
---------------BEST MODEL FOR ROC: ---------------
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
---WITH ROC: ---
0.898799431513402
---------------BEST MODEL FOR F1: ----------------
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
---WITH F1: ---
0.8476000000000001
------------------------------------------------------------------------------------
RESULTS FOR TRIAL:
2
----------------------------------------------

### Get Train metrics

In [34]:
print("===== ACCURACY SCORES: =====")
print(accuracy_scores)
print("===== ROC SCORES: =====")
print(roc_scores)
print("===== F1 SCORES: =====")
print(f1_scores)

===== ACCURACY SCORES: =====
[0.8476000000000001, 0.8421999999999998, 0.8504000000000002, 0.8488, 0.8479999999999999]
===== ROC SCORES: =====
[0.898799431513402, 0.9009229617213899, 0.9022767884142755, 0.9043436556485002, 0.9049710231102231]
===== F1 SCORES: =====
[0.8476000000000001, 0.8421999999999998, 0.8504000000000002, 0.8488, 0.8479999999999999]


In [35]:
print("================ BEST ACCURACY MODEL IN TRAINING: ==================")
print(all_accuracy_models[ np.argmax(accuracy_scores) ])
print("WITH ACCURACY:")
print(max(accuracy_scores))
print("================ BEST ROC MODEL IN TRAINING: ==================")
print(all_roc_models[ np.argmax(roc_scores) ])
print("WITH ROC SCORE:")
print(max(roc_scores))
print("================ BEST F1 MODEL IN TRAINING: ==================")
print(all_f1_models[ np.argmax(f1_scores) ])
print("WITH F1 SCORE:")
print(max(f1_scores))

{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
WITH ACCURACY:
0.8504000000000002
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
WITH ROC SCORE:
0.9049710231102231
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
WITH F1 SCORE:
0.8504000000000002


In [36]:
print("================ AVERAGE ACCURACY ON TRAIN SET: ==================")
print(accuracy_sum / 5)
print("================ AVERAGE ROC SCORE ON TRAIN SET: ==================")
print(roc_sum / 5)
print("================ AVERAGE F1 SCORE ON TRAIN SET: ==================")
print(f1_sum / 5)


0.8474
0.9022627720815581
0.8474


### Get Test Metrics

In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# test performance of model rated as best for accuracy and f1 score on training set
performance_AccModel = pd.DataFrame(index=['acc', 'roc', 'f1'], columns=['Trial 1', 'Trial 2', 'Trial 3', 'Trial 4', 'Trial 5'])
performance_AccModel

# test performance on trial 1 test set
clf = LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=5000).fit(XY_train1.drop(['Y'],1), XY_train1['Y'])
pred = clf.predict(XY_test1.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 1'] = accuracy_score(XY_test1['Y'], pred)
performance_AccModel.loc['roc', 'Trial 1'] = roc_auc_score(XY_test1['Y'], pred)
performance_AccModel.loc['f1', 'Trial 1'] = f1_score(XY_test1['Y'], pred)

# test performance on trial 2 test set
clf = LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=5000).fit(XY_train2.drop(['Y'],1), XY_train2['Y'])
pred = clf.predict(XY_test2.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 2'] = accuracy_score(XY_test2['Y'], pred)
performance_AccModel.loc['roc', 'Trial 2'] = roc_auc_score(XY_test2['Y'], pred)
performance_AccModel.loc['f1', 'Trial 2'] = f1_score(XY_test2['Y'], pred)

# test performance on trial 3 test set
clf = LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=5000).fit(XY_train3.drop(['Y'],1), XY_train3['Y'])
pred = clf.predict(XY_test3.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 3'] = accuracy_score(XY_test3['Y'], pred)
performance_AccModel.loc['roc', 'Trial 3'] = roc_auc_score(XY_test3['Y'], pred)
performance_AccModel.loc['f1', 'Trial 3'] = f1_score(XY_test3['Y'], pred)

# test performance on trial 4 test set
clf = LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=5000).fit(XY_train4.drop(['Y'],1), XY_train4['Y'])
pred = clf.predict(XY_test4.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 4'] = accuracy_score(XY_test4['Y'], pred)
performance_AccModel.loc['roc', 'Trial 4'] = roc_auc_score(XY_test4['Y'], pred)
performance_AccModel.loc['f1', 'Trial 4'] = f1_score(XY_test4['Y'], pred)

# test performance on trial 5 test set
clf = LogisticRegression(penalty='l1', C=0.1, solver='saga', max_iter=5000).fit(XY_train5.drop(['Y'],1), XY_train5['Y'])
pred = clf.predict(XY_test5.drop(['Y'],1))
performance_AccModel.loc['acc', 'Trial 5'] = accuracy_score(XY_test5['Y'], pred)
performance_AccModel.loc['roc', 'Trial 5'] = roc_auc_score(XY_test5['Y'], pred)
performance_AccModel.loc['f1', 'Trial 5'] = f1_score(XY_test5['Y'], pred)

performance_AccModel



Unnamed: 0,Trial 1,Trial 2,Trial 3,Trial 4,Trial 5
acc,0.794529,0.793295,0.793222,0.793186,0.794492
roc,0.600042,0.596359,0.601015,0.599663,0.60656
f1,0.344788,0.335627,0.348909,0.344978,0.364167


## Get final Logistic Regression results

In [41]:
print("================ BEST ACCURACY MODEL IN KNN: ==================")
print(all_accuracy_models[ np.argmax(np.array(performance_AccModel.iloc[0])) ])
print("WITH ACCURACY:")
print((np.array(performance_AccModel.iloc[1])).mean())
print("================ BEST ROC MODEL IN KNN: ==================")
print(all_roc_models[ np.argmax(np.array(performance_AccModel.iloc[1])) ])
print("WITH ROC SCORE:")
print((performance_AccModel.iloc[1]).mean())
print("================ BEST F1 MODEL IN KNN: ==================")
print(all_f1_models[ np.argmax(np.array(performance_AccModel.iloc[2])) ])
print("WITH F1 SCORE:")
print((np.array(performance_AccModel.iloc[2])).mean())

{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
WITH ACCURACY:
0.6007278877326276
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
WITH ROC SCORE:
0.6007278877326276
{'classifier': LogisticRegression(max_iter=5000), 'classifier__C': 0.1, 'classifier__penalty': 'l1', 'classifier__solver': 'saga'}
WITH F1 SCORE:
0.3476937327845045
