In [1]:
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [2]:
# Read the census CSV from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="census.csv")

In [3]:
# Separate the predictors from the 'income' target column.
# drop() returns a new frame, so `data` itself keeps the 'income' column.
features_raw = data.drop(columns=['income'])
income_raw = data['income']

In [4]:
# Apply log(x + 1) to the columns listed in `skewed`; the +1 keeps zero
# values finite, since log(0) is undefined.
skewed = ['capital-gain', 'capital-loss']

# Work on an explicit copy. pd.DataFrame(data=frame) does not copy by default,
# so on pandas versions without copy-on-write the column assignment below could
# also modify `features_raw`, and re-running this cell would then apply the
# log a second time.
features_log_transformed = features_raw.copy()
features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1))

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Initialize a scaler, then apply it to the numerical feature columns
scaler = MinMaxScaler() # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Scale into an explicit copy. pd.DataFrame(data=frame) does not copy by
# default, so on pandas versions without copy-on-write the assignment below
# could also overwrite the columns of `features_log_transformed`, making this
# cell non-idempotent.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in the notebook; confirm that this is intended.
features_log_minmax_transform = features_log_transformed.copy()
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

In [6]:
# One-hot encode the scaled feature frame with pandas.get_dummies
features_final = pd.get_dummies(features_log_minmax_transform)

# Encode the 'income_raw' data to numerical values.
# The labels are mapped to the integers 0 and 1 (not the strings '0' and '1'),
# so the target really is numeric.
income = income_raw.map({
    '<=50K': 0,
    '>50K': 1
})

# Series.map turns every label missing from the mapping into NaN; fail loudly
# here instead of passing NaN targets on to the model.
if income.isnull().any():
    raise ValueError("income_raw contains labels other than '<=50K' and '>50K'")

# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

In [7]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    features_final,
    income,
    random_state=0,
    test_size=0.2,
)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

In [13]:
# Gradient boosting run: 600 boosting stages at learning rate 0.3, seeded.
# NOTE(review): this and the later cells compare hyperparameters by accuracy
# on X_test / y_test; consider a validation split or cross-validation so the
# test set stays untouched.
clf = GradientBoostingClassifier(
    learning_rate=0.3,
    n_estimators=600,
    verbose=1,
    random_state=0,
)

clf.fit(X=X_train, y=y_train)

predictions = clf.predict(X=X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9320           32.81s
         2           0.8442           32.07s
         3           0.7910           32.57s
         4           0.7557           32.92s
         5           0.7304           32.16s
         6           0.7091           31.56s
         7           0.6939           31.24s
         8           0.6780           30.72s
         9           0.6673           30.68s
        10           0.6593           30.37s
        20           0.6119           29.38s
        30           0.5930           28.88s
        40           0.5828           28.33s
        50           0.5722           27.75s
        60           0.5645           27.12s
        70           0.5578           26.51s
        80           0.5532           26.04s
        90           0.5491           25.46s
       100           0.5458           24.90s
       200           0.5229           19.81s
       300           0.5072           14.85s
       40

0.871420674405749

In [29]:
# Gradient boosting run: 200 stages at learning rate 0.3, trees limited to
# depth 5 with at least 25 samples per leaf, seeded.
clf = GradientBoostingClassifier(
    learning_rate=0.3,
    n_estimators=200,
    max_depth=5,
    min_samples_leaf=25,
    verbose=1,
    random_state=0,
)

clf.fit(X=X_train, y=y_train)

predictions = clf.predict(X=X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_true=y_test, y_pred=predictions)

      Iter       Train Loss   Remaining Time 
         1           0.9091           18.42s
         2           0.8115           18.08s
         3           0.7509           17.96s
         4           0.7123           17.29s
         5           0.6834           16.88s
         6           0.6643           16.59s
         7           0.6492           16.47s
         8           0.6371           16.30s
         9           0.6282           16.18s
        10           0.6214           15.96s
        20           0.5762           15.18s
        30           0.5552           14.29s
        40           0.5452           13.28s
        50           0.5380           12.42s
        60           0.5313           11.54s
        70           0.5244           10.69s
        80           0.5182            9.89s
        90           0.5151            9.03s
       100           0.5096            8.20s
       200           0.4762            0.00s


0.8698728579325594

In [30]:
# Gradient boosting run: library-default number of stages, learning rate 0.3.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.3,
    verbose=1,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

0.8695411829740188

In [31]:
# Gradient boosting run: library-default number of stages, learning rate 0.6.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.6,
    verbose=1,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8182            5.72s
         2           0.7381            5.39s
         3           0.6944            5.40s
         4           0.6769            5.29s
         5           0.6549            5.37s
         6           0.6435            5.24s
         7           0.6336            5.14s
         8           0.6292            5.02s
         9           0.6225            4.97s
        10           0.6168            4.91s
        20           0.5799            4.27s
        30           0.5657            3.69s
        40           0.5529            3.12s
        50           0.5472            2.59s
        60           0.5417            2.06s
        70           0.5367            1.55s
        80           0.5331            1.03s
        90           0.5290            0.52s
       100           0.5243            0.00s


0.8688778330569376

In [32]:
# Gradient boosting run: library-default number of stages, learning rate 0.4.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.4,
    verbose=1,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8857            5.66s
         2           0.7966            5.52s
         3           0.7479            5.31s
         4           0.7143            5.31s
         5           0.6956            5.32s
         6           0.6767            5.21s
         7           0.6653            5.12s
         8           0.6535            5.08s
         9           0.6449            5.01s
        10           0.6373            4.96s
        20           0.6031            4.17s
        30           0.5826            3.61s
        40           0.5735            3.08s
        50           0.5635            2.56s
        60           0.5573            2.04s
        70           0.5516            1.52s
        80           0.5484            1.01s
        90           0.5428            0.51s
       100           0.5378            0.00s


0.8700939745715865

In [33]:
# Gradient boosting run: 200 boosting stages at learning rate 0.4.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.4,
    n_estimators=200,
    verbose=1,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8857           11.63s
         2           0.7966           10.94s
         3           0.7479           10.69s
         4           0.7143           10.74s
         5           0.6956           10.94s
         6           0.6767           10.73s
         7           0.6653           10.66s
         8           0.6535           10.74s
         9           0.6449           10.63s
        10           0.6373           10.49s
        20           0.6031            9.49s
        30           0.5826            8.87s
        40           0.5735            8.26s
        50           0.5635            7.74s
        60           0.5573            7.19s
        70           0.5516            6.65s
        80           0.5484            6.12s
        90           0.5428            5.59s
       100           0.5378            5.08s
       200           0.5112            0.00s


0.871199557766722

In [34]:
# Gradient boosting run: 400 boosting stages at learning rate 0.4.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.4,
    n_estimators=400,
    verbose=1,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8857           23.06s
         2           0.7966           21.91s
         3           0.7479           21.90s
         4           0.7143           22.02s
         5           0.6956           22.02s
         6           0.6767           21.75s
         7           0.6653           21.57s
         8           0.6535           21.55s
         9           0.6449           21.45s
        10           0.6373           21.41s
        20           0.6031           20.08s
        30           0.5826           19.29s
        40           0.5735           18.67s
        50           0.5635           18.17s
        60           0.5573           17.55s
        70           0.5516           16.99s
        80           0.5484           16.45s
        90           0.5428           15.94s
       100           0.5378           15.39s
       200           0.5112           10.10s
       300           0.4949            5.02s
       40

0.8689883913764511

In [35]:
# Gradient boosting run: 300 boosting stages at learning rate 0.4.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.4,
    n_estimators=300,
    verbose=1,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8857           17.26s
         2           0.7966           16.44s
         3           0.7479           16.09s
         4           0.7143           16.25s
         5           0.6956           16.49s
         6           0.6767           16.32s
         7           0.6653           16.05s
         8           0.6535           15.85s
         9           0.6449           15.71s
        10           0.6373           15.52s
        20           0.6031           14.44s
        30           0.5826           13.71s
        40           0.5735           13.22s
        50           0.5635           12.72s
        60           0.5573           12.22s
        70           0.5516           11.67s
        80           0.5484           11.13s
        90           0.5428           10.60s
       100           0.5378           10.08s
       200           0.5112            5.00s
       300           0.4949            0.00s


0.8692095080154781

In [37]:
# Gradient boosting run: 200 boosting stages at learning rate 0.45.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.45,
    n_estimators=200,
    verbose=1,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8658           11.02s
         2           0.7786           10.65s
         3           0.7323           10.63s
         4           0.7062           10.48s
         5           0.6804           10.60s
         6           0.6657           10.48s
         7           0.6514           10.42s
         8           0.6402           10.33s
         9           0.6322           10.32s
        10           0.6274           10.21s
        20           0.5965            9.47s
        30           0.5774            8.75s
        40           0.5675            8.16s
        50           0.5583            7.59s
        60           0.5511            7.04s
        70           0.5454            6.57s
        80           0.5413            6.05s
        90           0.5369            5.53s
       100           0.5342            5.02s
       200           0.5083            0.00s


0.8702045328911

In [40]:
# Gradient boosting run: 220 stages at learning rate 0.45, requiring at least
# 20 samples before a node may be split.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.45,
    verbose=1,
    n_estimators=220,
    min_samples_split=20,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8658           12.50s
         2           0.7786           11.96s
         3           0.7323           11.78s
         4           0.7062           11.66s
         5           0.6804           11.72s
         6           0.6657           11.57s
         7           0.6514           11.54s
         8           0.6402           11.45s
         9           0.6323           11.46s
        10           0.6275           11.31s
        20           0.5965           10.48s
        30           0.5797            9.82s
        40           0.5638            9.25s
        50           0.5569            8.68s
        60           0.5514            8.12s
        70           0.5464            7.60s
        80           0.5420            7.08s
        90           0.5389            6.55s
       100           0.5354            6.03s
       200           0.5100            1.00s


0.8706467661691543

In [41]:
# Gradient boosting run: 220 stages at learning rate 0.45, requiring at least
# 30 samples before a node may be split.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.45,
    verbose=1,
    n_estimators=220,
    min_samples_split=30,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8658           12.72s
         2           0.7786           12.11s
         3           0.7323           12.89s
         4           0.7062           12.78s
         5           0.6804           12.48s
         6           0.6657           12.21s
         7           0.6514           12.06s
         8           0.6402           11.89s
         9           0.6323           11.83s
        10           0.6275           11.71s
        20           0.5965           10.75s
        30           0.5797            9.99s
        40           0.5639            9.35s
        50           0.5567            8.74s
        60           0.5522            8.17s
        70           0.5473            7.65s
        80           0.5432            7.12s
        90           0.5393            6.60s
       100           0.5370            6.07s
       200           0.5118            1.00s


0.8704256495301271

In [None]:
# Gradient boosting run: 220 stages at learning rate 0.45, requiring at least
# 20 samples to split a node and at least 20 samples in every leaf.
# random_state is fixed (as in the seeded runs earlier in the notebook) so
# that re-running the cell reproduces the same model; without it the feature
# permutation used at each split is drawn from the global random state.
clf = GradientBoostingClassifier(
    learning_rate=0.45,
    verbose=1,
    n_estimators=220,
    min_samples_split=20,
    min_samples_leaf=20,
    random_state=0,
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Last expression of the cell: accuracy of the predictions against y_test
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8658           12.77s
         2           0.7786           12.23s
         3           0.7324           12.01s
         4           0.7064           11.91s
         5           0.6806           11.91s
         6           0.6659           11.76s
         7           0.6516           11.67s
         8           0.6398           11.54s
         9           0.6315           11.56s
        10           0.6244           11.53s
        20           0.5900           10.59s
        30           0.5769            9.82s
        40           0.5687            9.32s
        50           0.5594            8.76s
        60           0.5560            8.20s
        70           0.5498            7.67s
        80           0.5477            7.13s
        90           0.5440            6.62s
       100           0.5410            6.09s
