In [1]:
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [2]:
# Read the census records from the CSV file in the working directory
data = pd.read_csv(filepath_or_buffer="census.csv")

In [3]:
# Separate the 'income' target column from the remaining feature columns
features_raw = data.drop(columns=['income'])
income_raw = data['income']

In [4]:
# Columns that receive the log transform
skewed = ['capital-gain', 'capital-loss']

# Take an explicit copy. pd.DataFrame(data=features_raw) is not guaranteed to be
# independent of features_raw (copy defaults to False for DataFrame input), so
# assigning into it could also rewrite features_raw and make a re-run of this
# cell apply the log a second time.
features_log_transformed = features_raw.copy()

# log1p(x) computes log(x + 1), so zero values stay at zero
features_log_transformed[skewed] = np.log1p(features_raw[skewed])

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numerical columns with MinMaxScaler's default range of (0, 1)
scaler = MinMaxScaler()
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Take an explicit copy so that writing the scaled columns cannot also modify
# features_log_transformed (pd.DataFrame(data=frame) does not copy by default),
# which keeps this cell safe to re-run.
features_log_minmax_transform = features_log_transformed.copy()
# NOTE(review): the scaler is fitted on every row, before the train/test split.
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

In [6]:
# One-hot encode the categorical feature columns
features_final = pd.get_dummies(features_log_minmax_transform)

# Encode the 'income_raw' labels as integers: 0 for '<=50K', 1 for '>50K'
income = income_raw.map({
    '<=50K': 0,
    '>50K': 1
})

# Series.map yields NaN for any label that is missing from the mapping;
# stop here with a clear message instead of passing NaN targets downstream.
if income.isna().any():
    unexpected = income_raw[income.isna()].unique().tolist()
    raise ValueError("Unexpected income labels: {}".format(unexpected))

# Column names after one-hot encoding
encoded = list(features_final.columns)

In [7]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    features_final, income, test_size=0.2, random_state=0
)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import confusion_matrix, accuracy_score

In [42]:
# Gradient boosting run with 220 boosting stages
clf = GradientBoostingClassifier(
    learning_rate=0.45,
    verbose=1,              # print the per-iteration training loss
    n_estimators=220,
    min_samples_split=20,
    min_samples_leaf=20,
    random_state=0          # fixed seed (same value as the train/test split) so re-runs build the same model
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Accuracy on the held-out test split; the last expression is the cell's output.
# NOTE(review): if this score is used to choose n_estimators, prefer a validation
# split or cross-validation over the test set.
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8658           12.77s
         2           0.7786           12.23s
         3           0.7324           12.01s
         4           0.7064           11.91s
         5           0.6806           11.91s
         6           0.6659           11.76s
         7           0.6516           11.67s
         8           0.6398           11.54s
         9           0.6315           11.56s
        10           0.6244           11.53s
        20           0.5900           10.59s
        30           0.5769            9.82s
        40           0.5687            9.32s
        50           0.5594            8.76s
        60           0.5560            8.20s
        70           0.5498            7.67s
        80           0.5477            7.13s
        90           0.5440            6.62s
       100           0.5410            6.09s
       200           0.5170            1.01s


0.8717523493642897

In [44]:
# Gradient boosting run with 200 boosting stages
clf = GradientBoostingClassifier(
    learning_rate=0.45,
    verbose=1,              # print the per-iteration training loss
    n_estimators=200,
    min_samples_split=20,
    min_samples_leaf=20,
    random_state=0          # fixed seed (same value as the train/test split) so re-runs build the same model
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Accuracy on the held-out test split; the last expression is the cell's output.
# NOTE(review): if this score is used to choose n_estimators, prefer a validation
# split or cross-validation over the test set.
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8658           11.49s
         2           0.7786           10.97s
         3           0.7324           10.94s
         4           0.7064           10.80s
         5           0.6806           10.63s
         6           0.6659           10.51s
         7           0.6516           10.48s
         8           0.6398           10.37s
         9           0.6315           10.46s
        10           0.6244           10.38s
        20           0.5900            9.53s
        30           0.5769            8.92s
        40           0.5687            8.35s
        50           0.5594            7.79s
        60           0.5560            7.22s
        70           0.5498            6.69s
        80           0.5477            6.14s
        90           0.5440            5.61s
       100           0.5410            5.08s
       200           0.5170            0.00s


0.8718629076838033

In [47]:
# Gradient boosting run with 180 boosting stages
clf = GradientBoostingClassifier(
    learning_rate=0.45,
    verbose=1,              # print the per-iteration training loss
    n_estimators=180,
    min_samples_split=20,
    min_samples_leaf=20,
    random_state=0          # fixed seed (same value as the train/test split) so re-runs build the same model
)

clf.fit(X_train, y_train)

predictions = clf.predict(X_test)

# Accuracy on the held-out test split; the last expression is the cell's output.
# NOTE(review): if this score is used to choose n_estimators, prefer a validation
# split or cross-validation over the test set.
accuracy_score(y_test, predictions)

      Iter       Train Loss   Remaining Time 
         1           0.8658           10.08s
         2           0.7786            9.96s
         3           0.7324            9.74s
         4           0.7064            9.65s
         5           0.6806            9.68s
         6           0.6659            9.55s
         7           0.6516            9.50s
         8           0.6398            9.44s
         9           0.6315            9.38s
        10           0.6244            9.30s
        20           0.5900            8.55s
        30           0.5769            7.97s
        40           0.5687            7.34s
        50           0.5594            6.73s
        60           0.5560            6.16s
        70           0.5498            5.63s
        80           0.5477            5.09s
        90           0.5440            4.57s
       100           0.5410            4.05s


0.8727473742399116