In [1]:
import pandas as pd
import numpy as np
import warnings
# Silence every warning category so library warnings do not clutter cell outputs.
warnings.simplefilter('ignore')
# Show floats in pandas output with three decimal places.
pd.set_option('display.float_format', '{:.3f}'.format)

## Read the data and divide it into train and test datasets

In [4]:
# Column names passed to read_csv for both files; the last column is the
# income label and keeps the name '<=50K' that later cells refer to.
column_names = ['age', 'workclass', 'fnlwgt', 'education', 'education-num',
                'marital-status', 'occupation', 'relationship', 'race', 'sex',
                'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
                '<=50K']

df_1 = pd.read_csv("adult.data", names=column_names, skipinitialspace=True)
# skiprows=1 discards the first line of adult.test before parsing.
df_2 = pd.read_csv("adult.test", names=column_names, skipinitialspace=True, skiprows=1)

# DataFrame.append was removed in pandas 2.0; pd.concat is the supported way
# to stack the two frames. ignore_index=True gives every row a unique label,
# so later index-based operations cannot hit the duplicate labels that
# stacking two independently numbered frames would otherwise produce.
df_combined = pd.concat([df_1, df_2], ignore_index=True)

# Shuffle all rows reproducibly before splitting.
df_combined = df_combined.sample(frac=1, random_state=0)

# First 75% of the shuffled rows form the training set, the rest the test set.
split_at = int(df_combined.shape[0] * .75)
df_train = df_combined.iloc[:split_at].copy()
df_test = df_combined.iloc[split_at:].copy()
df_train.shape, df_test.shape

((36631, 15), (12211, 15))

## Pre-process data using an automated data transformation pipeline

In [5]:
from custom_transformers import NominalCategoryMerger, OrdinalCategoryMerger
from custom_transformers import UnknownValuesDropper, CharacterStripper 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

num_features = ['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
cat_features = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'sex', 
                  'native-country']
occ_features = ['education-num']
label = '<=50K'

# Keep only the columns the pipeline uses, with the label as the last column.
df_train = df_train[num_features + cat_features + [label]]
df_test = df_test[num_features + cat_features + [label]]

ncc_threshold = None  # NOTE(review): not referenced anywhere in the visible cells
ncc_alpha = .1

# Create and parametrize data transformers
dropper = UnknownValuesDropper(features=['native-country'], unknown_value='?')
stripper = CharacterStripper([label], '.')
nominal_merger = NominalCategoryMerger(cat_features, label, alpha=ncc_alpha)
ordinal_merger = OrdinalCategoryMerger(occ_features, label, alpha=ncc_alpha)
scaler = StandardScaler()
encoder = OneHotEncoder(handle_unknown='ignore')

num_pipeline = Pipeline([('StandardScaler', scaler)])
cat_pipeline = Pipeline([('OneHotEncoder', encoder)])
# sparse_threshold=0 forces a dense array. With the default (0.3) the output
# switches to a sparse matrix whenever the overall density drops below the
# threshold, which would break the positional slicing and .flatten() below.
column_transformer = ColumnTransformer([('Numeric Pipeline', num_pipeline, num_features), 
                                        ('Categorical Pipeline', cat_pipeline, cat_features),
                                        ('Label', encoder, [label]) ],
                                       sparse_threshold=0)

# Define the data transformation pipeline
X_pipeline = Pipeline([('UnknownValuesDropper', dropper), 
                       ('CharacterStripper', stripper), 
                       ('NominalCategoryMerger', nominal_merger), 
                       ('OrdinalCategoryMerger', ordinal_merger), 
                       ('ColumnTransformer', column_transformer)])

# Fit and transform the data
dataset_train = X_pipeline.fit_transform(df_train)
dataset_test = X_pipeline.transform(df_test)

# The one-hot encoded label occupies the trailing columns of the output.
# Read their count from the fitted encoder instead of hard-coding it, and fail
# loudly if the label does not have exactly two categories: in that case the
# first label column would no longer be a valid binary target.
label_categories = (X_pipeline.named_steps['ColumnTransformer']
                    .named_transformers_['Label'].categories_[0])
assert len(label_categories) == 2, (
    f"expected 2 label categories, found {list(label_categories)}")
n_label_columns = len(label_categories)

# Features are everything before the label columns; the target is the first
# label column (1 where the row belongs to the first label category).
X_train = dataset_train[:, :-n_label_columns]
y_train = dataset_train[:, -n_label_columns].flatten()
X_test = dataset_test[:, :-n_label_columns]
y_test = dataset_test[:, -n_label_columns].flatten()

X_train.shape, X_test.shape, y_train.shape, y_test.shape

Dropped 648 rows with unknown values.


  return self.partial_fit(X, y)
  return self.fit(X, **fit_params).transform(X)


Dropped 209 rows with unknown values.


  Xt = transform.transform(Xt)


((35983, 44), (12002, 44), (35983,), (12002,))

## Preliminary testing of models

In [8]:
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import cross_val_score

# Logistic regression baseline: fit on the full training set, then report the
# mean score over 5 cross-validation folds.
log_reg = LogisticRegressionCV(cv=5, random_state=0, max_iter=500)
log_reg.fit(X_train, y_train)
score = cross_val_score(log_reg, X_train, y_train, cv=5, n_jobs=-1)
score.mean()

0.8518189003195694

In [9]:
from sklearn.ensemble import RandomForestClassifier

# Random forest baseline with 500 trees; report the mean 5-fold CV score.
random_forest = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=-1)
random_forest.fit(X_train, y_train)
score = cross_val_score(random_forest, X_train, y_train, cv=5, n_jobs=-1)
score.mean()

0.8551816789465538

In [10]:
from sklearn.svm import LinearSVC

# Linear SVM baseline (C=.01); report the mean 5-fold CV score.
linear_svc = LinearSVC(random_state=0, tol=1e-5, max_iter=10000, C=.01)
linear_svc.fit(X_train, y_train)
score = cross_val_score(linear_svc, X_train, y_train, cv=5, n_jobs=-1)
score.mean()

0.8517910491808781

In [11]:
from sklearn.svm import SVC

# Polynomial-kernel SVM baseline (C=1000); report the mean 5-fold CV score.
poly_svc = SVC(kernel='poly', gamma='auto', C=1000, random_state=0)
poly_svc.fit(X_train, y_train)
score = cross_val_score(poly_svc, X_train, y_train, cv=5, n_jobs=-1)
score.mean()

0.8526805325680072

In [12]:
# RBF-kernel SVM baseline (C=30); report the mean 5-fold CV score.
rbf_svc = SVC(kernel='rbf', gamma='auto', C=30, random_state=0)
rbf_svc.fit(X_train, y_train)
score = cross_val_score(rbf_svc, X_train, y_train, cv=5, n_jobs=-1)
score.mean()

0.8573770806357407

In [13]:
from sklearn.neighbors import KNeighborsClassifier

# Distance-weighted k-nearest-neighbours baseline (k=100); report the mean
# 5-fold CV score.
knn = KNeighborsClassifier(n_neighbors=100, weights='distance')
knn.fit(X_train, y_train)
score = cross_val_score(knn, X_train, y_train, cv=5, n_jobs=-1)
score.mean()

0.845788146470913

In [14]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost baseline with 100 estimators; report the mean 5-fold CV score.
# random_state is fixed, as for the other seeded baselines, so that a re-run
# reproduces the same score.
adaboost = AdaBoostClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
score = cross_val_score(adaboost, X_train, y_train, cv=5, n_jobs=-1)
np.average(score)

0.8655198758163811

In [15]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting baseline (learning rate 0.1, 100 trees); report the mean
# 5-fold CV score. random_state is fixed, as for the other seeded baselines,
# so that a re-run reproduces the same score.
gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=100,
                                 random_state=0).fit(X_train, y_train)
score = cross_val_score(gbc, X_train, y_train, cv=5, n_jobs=-1)
np.average(score)

0.8667149031179537

## Tuning the selected gradient boosted tree classifier
The experiments shown above (plus some more that are not presented here) lead to the conclusion that Gradient Boosting Classifier has the best performance for this dataset. In this section, I tune the model following these steps:

1. GBC performs better the lower its learning rate is, but it requires more estimators (and therefore more training time) at lower learning rates. I will start by setting the learning rate to 0.1 and lower it in the final step. 
2. Find the optimal number of trees for this learning rate.
3. Tune tree specific parameters.
4. Lower the learning rate and increase the number of trees.

In [22]:
def gbc_early_stopping_tuning(estimator, initial_n_estimators, X=None, y=None,
                              patience=3, growth=1.2, cv=5):
    """Search for the number of estimators with the best cross-validated score.

    Starting from ``initial_n_estimators``, the ensemble size is grown
    geometrically and scored with cross-validation after every step. The search
    stops once ``patience`` consecutive sizes have failed to beat the best
    score seen so far. Each evaluated size and its score are printed.

    Parameters
    ----------
    estimator : estimator with an ``n_estimators`` attribute
        Its ``n_estimators`` attribute is overwritten during the search. The
        estimator itself is not fitted by this function, because
        ``cross_val_score`` fits clones of it.
    initial_n_estimators : int
        Ensemble size evaluated first.
    X, y : array-like, optional
        Training data. Default to the notebook-level ``X_train`` / ``y_train``.
    patience : int, default 3
        Number of consecutive non-improving sizes tolerated before stopping.
    growth : float, default 1.2
        Multiplicative growth factor applied to ``n_estimators`` each step.
    cv : int, default 5
        Number of cross-validation folds.

    Returns
    -------
    (best_n_estimators, max_val_score) : tuple
        The best-scoring ensemble size and its mean cross-validation score.
    """
    # Fall back to the notebook's training data when none is passed explicitly.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    max_val_score = 0
    best_n_estimators = initial_n_estimators
    score_going_down = 0
    estimator.n_estimators = initial_n_estimators
    while score_going_down < patience:
        # No separate estimator.fit() here: cross_val_score clones and fits the
        # estimator for every fold, so an extra fit would only cost time.
        val_score = np.average(cross_val_score(estimator, X, y, cv=cv, n_jobs=-1))
        print(estimator.n_estimators, val_score)
        if val_score > max_val_score:
            max_val_score = val_score
            best_n_estimators = estimator.n_estimators
            score_going_down = 0
        else:
            score_going_down += 1
        # int(n * growth) equals n for small n (e.g. int(4 * 1.2) == 4), which
        # would re-evaluate the same size; always advance by at least one.
        estimator.n_estimators = max(estimator.n_estimators + 1,
                                     int(estimator.n_estimators * growth))

    return best_n_estimators, max_val_score

In [17]:
# Search for the number of trees at the starting learning rate of 0.1,
# beginning the search from 20 trees.
initial_n_estimators = 20
estimator = GradientBoostingClassifier(learning_rate=0.1)
gbc_early_stopping_tuning(estimator=estimator, initial_n_estimators=initial_n_estimators)

20 0.8515130783828402
24 0.8547645798937442
28 0.8561263124443677
33 0.8578215906510419
39 0.8589332846332252
46 0.860239504116435
55 0.861767972811409
66 0.86407467092986
79 0.8653530971935425
94 0.865936766368572
112 0.8671039772831662
134 0.868354548550671
160 0.8696606599212565
192 0.8705221299846629
230 0.8718004713010256
276 0.8726063626532252
331 0.8724674815104171
397 0.8726898465658731
476 0.8726898427019488
571 0.8727176783913808
685 0.8721896111046659
822 0.871550432728293
986 0.8714670492079316
1183 0.8701052510135149
1419 0.8694382794133222


(571, 0.8727176783913808)

In [19]:
from sklearn.model_selection import GridSearchCV

# Grid of tree-specific parameters to evaluate exhaustively.
param_test = {
    'max_depth': range(2, 8),
    'max_features': [4, 5, 7, 10, 15, None],
    'subsample': [0.6, 0.7, 0.8, 0.9, 1],
}

# Base model with the learning rate and number of trees held fixed.
base_gbc = GradientBoostingClassifier(learning_rate=0.1, n_estimators=500, random_state=0)
gsearch = GridSearchCV(estimator=base_gbc, param_grid=param_test, n_jobs=-1, cv=5)
gsearch.fit(X_train, y_train)
gsearch.best_params_, gsearch.best_score_

({'max_depth': 3, 'max_features': None, 'subsample': 1}, 0.8726065086290749)

In [21]:
# Repeat the search for the number of trees with max_depth set explicitly
# to 3, beginning the search from 60 trees.
initial_n_estimators = 60
estimator = GradientBoostingClassifier(learning_rate=0.1, max_depth=3)
gbc_early_stopping_tuning(estimator=estimator, initial_n_estimators=initial_n_estimators)

60 0.8625739800073007
72 0.864658305347812
86 0.8656032284371511
103 0.8667426924746302
123 0.8674651964511281
147 0.8691326775807882
176 0.8700496723055272
211 0.8713002667383367
253 0.8721617792770123
303 0.8723840671033404
363 0.8726064360195018
435 0.8727176088911746
522 0.872356370414321
626 0.872745425273861
751 0.8720506681819892
901 0.8715226626740702
1081 0.8705221763163454
1297 0.8697439507557198
1556 0.8691603665365937


(626, 0.872745425273861)

In [25]:
# Lower the learning rate to 0.01 and search for the number of trees again,
# beginning the search from 1000 trees.
initial_n_estimators = 1000
estimator = GradientBoostingClassifier(learning_rate=0.01, max_depth=3)
gbc_early_stopping_tuning(estimator=estimator, initial_n_estimators=initial_n_estimators)

For now, for lack of time, I am skipping the final tuning step - lowering the learning rate and adjusting the number of estimators accordingly. Also missing here is an evaluation of the selected model against the test set. The selected model performed drastically worse on the test set than on the validation set. I believe that the reason for this is a bug in the custom transformers - in some experiments I have obtained test set accuracy of over 87.5%. I will update the repo later to address these issues.  