In [1]:
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [2]:
# Read the census records from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer="census.csv")

In [3]:
# Separate the predictor columns from the 'income' target column
features_raw = data.drop(columns=['income'])
income_raw = data['income']

In [4]:
# Log-transform the columns listed in `skewed` (log(x + 1), so zeros stay finite).
skewed = ['capital-gain', 'capital-loss']

# Work on an explicit copy. pd.DataFrame(data=frame) does not copy by default,
# so depending on the pandas version the assignment below could also modify
# `features_raw`, and re-running this cell would then apply the log twice.
features_log_transformed = features_raw.copy()
features_log_transformed[skewed] = features_raw[skewed].apply(np.log1p)

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numerical columns with a MinMaxScaler (default range is (0, 1)).
# NOTE(review): the scaler is fitted on the full dataset, before the
# train/test split performed later in the notebook -- confirm this is intended.
scaler = MinMaxScaler() # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Explicit copy so the scaled values are written to a new frame and
# `features_log_transformed` keeps its unscaled values (pd.DataFrame(data=frame)
# does not copy by default).
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

In [6]:
# One-hot encode the non-numeric columns of the scaled feature frame
features_final = pd.get_dummies(features_log_minmax_transform)

# Encode the 'income_raw' data to numerical values.
# Integers (not the strings '0'/'1') so the target really is numeric;
# any label not listed in the mapping becomes NaN.
income = income_raw.map({
    '<=50K': 0,
    '>50K': 1
})

# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

In [7]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    features_final, income, random_state=0, test_size=0.2
)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

In [9]:
# Baseline gradient-boosting model: only the seed and verbosity are set,
# every other hyperparameter keeps the library default.
# NOTE(review): accuracy is measured on X_test, which the later cells also use
# to compare hyperparameters -- consider a separate validation split.
clf = GradientBoostingClassifier(
    verbose=1,
    random_state=0,
)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           1.0505            5.48s
         2           0.9965            5.25s
         3           0.9538            5.14s
         4           0.9180            5.10s
         5           0.8885            5.10s
         6           0.8625            5.21s
         7           0.8403            5.10s
         8           0.8214            5.06s
         9           0.8053            4.98s
        10           0.7907            4.89s
        20           0.7051            4.25s
        30           0.6638            3.70s
        40           0.6403            3.15s
        50           0.6244            2.62s
        60           0.6146            2.09s
        70           0.6064            1.56s
        80           0.5985            1.04s
        90           0.5936            0.52s
       100           0.5888            0.00s


0.8630182421227197

In [10]:
# Experiment: 300 boosting stages, other hyperparameters left at their defaults
clf = GradientBoostingClassifier(
    n_estimators=300,
    verbose=1,
    random_state=0,
)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           1.0505           16.52s
         2           0.9965           16.06s
         3           0.9538           16.37s
         4           0.9180           16.34s
         5           0.8885           16.11s
         6           0.8625           15.98s
         7           0.8403           15.86s
         8           0.8214           15.81s
         9           0.8053           15.78s
        10           0.7907           15.64s
        20           0.7051           14.85s
        30           0.6638           14.20s
        40           0.6403           13.59s
        50           0.6244           13.03s
        60           0.6146           12.48s
        70           0.6064           11.94s
        80           0.5985           11.37s
        90           0.5936           10.84s
       100           0.5888           10.28s
       200           0.5627            5.04s
       300           0.5480            0.00s


0.8698728579325594

In [11]:
# Experiment: 300 boosting stages with a learning rate of 0.5
clf = GradientBoostingClassifier(
    learning_rate=0.5,
    n_estimators=300,
    verbose=1,
    random_state=0,
)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8480           16.12s
         2           0.7636           16.00s
         3           0.7218           16.30s
         4           0.6868           15.80s
         5           0.6682           15.57s
         6           0.6547           15.28s
         7           0.6430           15.19s
         8           0.6354           15.04s
         9           0.6301           14.98s
        10           0.6199           14.81s
        20           0.5919           14.26s
        30           0.5719           13.71s
        40           0.5616           13.06s
        50           0.5539           12.58s
        60           0.5478           11.99s
        70           0.5439           11.51s
        80           0.5395           10.99s
        90           0.5350           10.49s
       100           0.5309            9.97s
       200           0.5052            4.95s
       300           0.4875            0.00s


0.8688778330569376

In [12]:
# Experiment: 300 boosting stages with a learning rate of 0.3
clf = GradientBoostingClassifier(
    learning_rate=0.3,
    n_estimators=300,
    verbose=1,
    random_state=0,
)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9320           15.43s
         2           0.8442           14.97s
         3           0.7910           14.93s
         4           0.7557           14.97s
         5           0.7304           14.92s
         6           0.7091           14.82s
         7           0.6939           14.63s
         8           0.6780           14.61s
         9           0.6673           14.63s
        10           0.6593           14.56s
        20           0.6119           14.09s
        30           0.5930           13.50s
        40           0.5828           12.88s
        50           0.5722           12.45s
        60           0.5645           11.95s
        70           0.5578           11.43s
        80           0.5532           10.92s
        90           0.5491           10.41s
       100           0.5458            9.92s
       200           0.5229            4.91s
       300           0.5072            0.00s


0.8710889994472084

In [13]:
# Experiment: 600 boosting stages with a learning rate of 0.3
clf = GradientBoostingClassifier(
    learning_rate=0.3,
    n_estimators=600,
    verbose=1,
    random_state=0,
)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9320           32.81s
         2           0.8442           32.07s
         3           0.7910           32.57s
         4           0.7557           32.92s
         5           0.7304           32.16s
         6           0.7091           31.56s
         7           0.6939           31.24s
         8           0.6780           30.72s
         9           0.6673           30.68s
        10           0.6593           30.37s
        20           0.6119           29.38s
        30           0.5930           28.88s
        40           0.5828           28.33s
        50           0.5722           27.75s
        60           0.5645           27.12s
        70           0.5578           26.51s
        80           0.5532           26.04s
        90           0.5491           25.46s
       100           0.5458           24.90s
       200           0.5229           19.81s
       300           0.5072           14.85s
       40

0.871420674405749

In [14]:
# Experiment: 1000 boosting stages with a learning rate of 0.3
clf = GradientBoostingClassifier(
    learning_rate=0.3,
    n_estimators=1000,
    verbose=1,
    random_state=0,
)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9320           53.19s
         2           0.8442           51.40s
         3           0.7910           50.62s
         4           0.7557           50.57s
         5           0.7304           50.56s
         6           0.7091           50.00s
         7           0.6939           49.95s
         8           0.6780           49.60s
         9           0.6673           49.33s
        10           0.6593           49.31s
        20           0.6119           48.46s
        30           0.5930           48.48s
        40           0.5828           47.73s
        50           0.5722           47.17s
        60           0.5645           46.58s
        70           0.5578           46.05s
        80           0.5532           45.69s
        90           0.5491           45.08s
       100           0.5458           44.77s
       200           0.5229           39.55s
       300           0.5072           34.53s
       40

0.8684355997788834

In [15]:
# Experiment: 750 boosting stages with a learning rate of 0.3
clf = GradientBoostingClassifier(
    learning_rate=0.3,
    n_estimators=750,
    verbose=1,
    random_state=0,
)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9320           43.11s
         2           0.8442           40.20s
         3           0.7910           38.51s
         4           0.7557           38.48s
         5           0.7304           38.22s
         6           0.7091           38.10s
         7           0.6939           37.60s
         8           0.6780           37.20s
         9           0.6673           37.37s
        10           0.6593           37.05s
        20           0.6119           36.33s
        30           0.5930           36.03s
        40           0.5828           35.42s
        50           0.5722           34.90s
        60           0.5645           34.50s
        70           0.5578           33.83s
        80           0.5532           33.27s
        90           0.5491           32.83s
       100           0.5458           32.34s
       200           0.5229           27.26s
       300           0.5072           22.31s
       40

0.8679933665008291

In [16]:
# Experiment: 600 stages, learning rate 0.3, trees limited to depth 6
clf = GradientBoostingClassifier(max_depth=6, learning_rate=0.3, n_estimators=600,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8963           58.19s
         2           0.7955           58.07s
         3           0.7311           58.46s
         4           0.6909           58.31s
         5           0.6624           58.30s
         6           0.6419           58.05s
         7           0.6267           58.27s
         8           0.6096           58.38s
         9           0.6006           58.73s
        10           0.5914           58.60s
        20           0.5447           57.56s
        30           0.5236           56.35s
        40           0.5101           55.13s
        50           0.5008           53.74s
        60           0.4917           52.88s
        70           0.4832           51.70s
        80           0.4718           50.56s
        90           0.4649           49.46s
       100           0.4577           48.48s
       200           0.3832           39.51s
       300 6362344998730141212596291711107270659070669

0.8591487009397457

In [17]:
# Experiment: 600 stages, learning rate 0.3, trees limited to depth 4
clf = GradientBoostingClassifier(max_depth=4, learning_rate=0.3, n_estimators=600,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9188           44.24s
         2           0.8248           41.66s
         3           0.7661           40.83s
         4           0.7265           40.38s
         5           0.6985           40.17s
         6           0.6784           40.05s
         7           0.6628           39.89s
         8           0.6493           40.23s
         9           0.6397           40.43s
        10           0.6320           40.27s
        20           0.5878           39.18s
        30           0.5654           38.10s
        40           0.5531           37.34s
        50           0.5448           36.57s
        60           0.5394           35.97s
        70           0.5330           35.23s
        80           0.5286           34.51s
        90           0.5226           33.84s
       100           0.5190           33.17s
       200           0.4868           26.39s
       300           0.4627           19.75s
       40

0.8655610834715313

In [18]:
# Experiment: 600 stages, learning rate 0.3, at least 5 samples per leaf
clf = GradientBoostingClassifier(min_samples_leaf=5, learning_rate=0.3, n_estimators=600,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9320           31.65s
         2           0.8443           31.37s
         3           0.7910           30.84s
         4           0.7557           30.80s
         5           0.7304           30.77s
         6           0.7091           30.61s
         7           0.6939           30.57s
         8           0.6781           30.37s
         9           0.6674           30.32s
        10           0.6594           30.10s
        20           0.6122           29.10s
        30           0.5929           28.94s
        40           0.5813           28.57s
        50           0.5730           28.02s
        60           0.5651           27.32s
        70           0.5615           26.69s
        80           0.5559           26.19s
        90           0.5517           25.68s
       100           0.5491           25.14s
       200           0.5246           19.86s
       300           0.5098           14.83s
       40

0.8685461580983969

In [19]:
# Experiment: 600 stages, learning rate 0.3, at least 3 samples per leaf
clf = GradientBoostingClassifier(min_samples_leaf=3, learning_rate=0.3, n_estimators=600,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9320           32.28s
         2           0.8442           32.89s
         3           0.7910           31.58s
         4           0.7557           31.20s
         5           0.7304           30.96s
         6           0.7091           30.58s
         7           0.6939           30.50s
         8           0.6781           30.18s
         9           0.6674           30.20s
        10           0.6593           30.03s
        20           0.6128           29.27s
        30           0.5945           28.78s
        40           0.5841           28.11s
        50           0.5740           27.52s
        60           0.5656           26.99s
        70           0.5607           26.45s
        80           0.5543           25.89s
        90           0.5495           25.38s
       100           0.5474           24.82s
       200           0.5239           19.68s
       300           0.5089           14.73s
       40

0.8685461580983969

In [20]:
# Experiment: 600 stages, learning rate 0.3, using the 'exponential' loss
clf = GradientBoostingClassifier(loss='exponential', learning_rate=0.3, n_estimators=600,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.7708           32.81s
         2           0.7107           32.95s
         3           0.6701           31.76s
         4           0.6444           31.35s
         5           0.6177           30.95s
         6           0.6016           30.49s
         7           0.5901           30.47s
         8           0.5798           30.05s
         9           0.5708           30.04s
        10           0.5621           29.80s
        20           0.5251           28.86s
        30           0.5102           28.47s
        40           0.5030           27.85s
        50           0.4940           27.28s
        60           0.4884           26.98s
        70           0.4824           26.46s
        80           0.4781           25.88s
        90           0.4745           25.30s
       100           0.4712           24.77s
       200           0.4490           19.75s
       300           0.4340           14.85s
       40

0.867330016583748

In [21]:
# Experiment: 200 stages, learning rate 0.3, trees limited to depth 6
clf = GradientBoostingClassifier(max_depth=6, learning_rate=0.3, n_estimators=200,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8963           21.58s
         2           0.7955           20.72s
         3           0.7311           20.12s
         4           0.6909           19.72s
         5           0.6624           19.52s
         6           0.6419           19.20s
         7           0.6267           19.11s
         8           0.6096           18.99s
         9           0.6006           18.84s
        10           0.5914           18.70s
        20           0.5447           17.89s
        30           0.5236           16.84s
        40           0.5101           15.77s
        50           0.5008           14.75s
        60           0.4917           13.78s
        70           0.4832           12.74s
        80           0.4718           11.76s
        90           0.4649           10.76s
       100           0.4577            9.75s
       200           0.3832            0.00s


0.8676616915422886

In [22]:
# Experiment: 200 stages, learning rate 0.3, depth 6, at least 3 samples per leaf
clf = GradientBoostingClassifier(max_depth=6, min_samples_leaf=3,
                                 learning_rate=0.3, n_estimators=200,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8963           23.08s
         2           0.7956           23.39s
         3           0.7313           22.27s
         4           0.6912           21.26s
         5           0.6618           20.81s
         6           0.6397           20.44s
         7           0.6218           20.17s
         8           0.6096           19.98s
         9           0.6000           19.83s
        10           0.5929           19.53s
        20           0.5456           18.34s
        30           0.5256           17.26s
        40           0.5134           16.04s
        50           0.5021           14.96s
        60           0.4936           13.91s
        70           0.4845           12.92s
        80           0.4770           11.95s
        90           0.4685           10.93s
       100           0.4635            9.93s
       200           0.4216            0.00s


0.8644555002763958

In [23]:
# Experiment: 200 stages, learning rate 0.3, depth 6, at least 10 samples per leaf
clf = GradientBoostingClassifier(max_depth=6, min_samples_leaf=10,
                                 learning_rate=0.3, n_estimators=200,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8968           22.71s
         2           0.7962           23.08s
         3           0.7325           22.35s
         4           0.6924           21.52s
         5           0.6648           21.05s
         6           0.6428           20.55s
         7           0.6280           20.20s
         8           0.6168           19.94s
         9           0.6069           19.67s
        10           0.5990           19.53s
        20           0.5508           18.41s
        30           0.5319           17.23s
        40           0.5214           16.04s
        50           0.5124           14.95s
        60           0.5038           13.89s
        70           0.4967           12.88s
        80           0.4886           11.90s
        90           0.4811           10.92s
       100           0.4746            9.92s
       200           1.0718            0.00s


0.8660033167495854

In [24]:
# Experiment: 200 stages, learning rate 0.3, depth 6, at least 20 samples per leaf
clf = GradientBoostingClassifier(max_depth=6, min_samples_leaf=20,
                                 learning_rate=0.3, n_estimators=200,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8979           22.81s
         2           0.7976           23.35s
         3           0.7346           22.15s
         4           0.6947           21.09s
         5           0.6651           20.65s
         6           0.6449           20.21s
         7           0.6299           19.99s
         8           0.6180           19.80s
         9           0.6091           19.55s
        10           0.5991           19.25s
        20           0.5571           18.06s
        30           0.5379           17.06s
        40           0.5276           15.87s
        50           0.5197           14.80s
        60           0.5103           13.78s
        70           0.5032           12.78s
        80           0.4970           11.78s
        90           0.4920           10.78s
       100           0.4866            9.78s
       200           0.4395            0.00s


0.8682144831398563

In [25]:
# Experiment: 300 stages, learning rate 0.3, depth 6, at least 20 samples per leaf
clf = GradientBoostingClassifier(max_depth=6, min_samples_leaf=20,
                                 learning_rate=0.3, n_estimators=300,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8979           34.13s
         2           0.7976           33.95s
         3           0.7346           34.16s
         4           0.6947           33.38s
         5           0.6651           33.29s
         6           0.6449           32.39s
         7           0.6299           31.86s
         8           0.6180           31.33s
         9           0.6091           30.98s
        10           0.5991           30.53s
        20           0.5571           29.17s
        30           0.5379           27.61s
        40           0.5276           26.25s
        50           0.5197           25.00s
        60           0.5103           23.91s
        70           0.5032           22.85s
        80           0.4970           21.90s
        90           0.4920           20.82s
       100           0.4866           19.79s
       200           0.4395            9.81s
       300           0.4044            0.00s


0.86489773355445

In [26]:
# Experiment: 250 stages, learning rate 0.3, depth 6, at least 20 samples per leaf
clf = GradientBoostingClassifier(max_depth=6, min_samples_leaf=20,
                                 learning_rate=0.3, n_estimators=250,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8979           28.18s
         2           0.7976           28.04s
         3           0.7346           27.75s
         4           0.6947           26.61s
         5           0.6651           26.06s
         6           0.6449           25.50s
         7           0.6299           25.22s
         8           0.6180           24.96s
         9           0.6091           24.66s
        10           0.5991           24.41s
        20           0.5571           22.90s
        30           0.5379           22.00s
        40           0.5276           20.84s
        50           0.5197           19.70s
        60           0.5103           18.68s
        70           0.5032           17.66s
        80           0.4970           16.66s
        90           0.4920           15.65s
       100           0.4866           14.65s
       200           0.4395            4.89s


0.8665561083471531

In [27]:
# Experiment: 200 stages, learning rate 0.3, depth 6, at least 30 samples per leaf
clf = GradientBoostingClassifier(max_depth=6, min_samples_leaf=30,
                                 learning_rate=0.3, n_estimators=200,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8984           22.56s
         2           0.7983           22.59s
         3           0.7355           22.14s
         4           0.6953           21.25s
         5           0.6670           20.69s
         6           0.6456           20.30s
         7           0.6313           20.02s
         8           0.6202           19.78s
         9           0.6116           19.56s
        10           0.6041           19.42s
        20           0.5581           18.17s
        30           0.5400           17.10s
        40           0.5288           15.90s
        50           0.5201           14.88s
        60           0.5133           13.82s
        70           0.5041           12.85s
        80           0.4996           11.83s
        90           0.4951           10.82s
       100           0.4897            9.83s
       200           0.4506            0.00s


0.866334991708126

In [28]:
# Experiment: 200 stages, learning rate 0.3, depth 6, at least 25 samples per leaf
clf = GradientBoostingClassifier(max_depth=6, min_samples_leaf=25,
                                 learning_rate=0.3, n_estimators=200,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8980           23.29s
         2           0.7977           22.84s
         3           0.7351           22.16s
         4           0.6950           21.20s
         5           0.6656           20.78s
         6           0.6444           20.42s
         7           0.6305           20.09s
         8           0.6194           19.87s
         9           0.6081           19.58s
        10           0.6008           19.60s
        20           0.5550           18.50s
        30           0.5379           17.26s
        40           0.5269           16.11s
        50           0.5186           14.95s
        60           0.5105           13.90s
        70           0.5057           12.85s
        80           0.5000           11.84s
        90           0.4956           10.82s
       100           0.4893            9.83s
       200           0.4464            0.00s


0.869983416252073

In [None]:
# Experiment: 200 stages, learning rate 0.3, depth 5, at least 25 samples per leaf
clf = GradientBoostingClassifier(max_depth=5, min_samples_leaf=25,
                                 learning_rate=0.3, n_estimators=200,
                                 random_state=0, verbose=1)
predictions = clf.fit(X_train, y_train).predict(X_test)

accuracy_score(y_true=y_test, y_pred=predictions)