In [1]:
import numpy as np
import pandas as pd
from time import time
from IPython.display import display # Allows the use of display() for DataFrames

# Import supplementary visualization code visuals.py
import visuals as vs

# Pretty display for notebooks
%matplotlib inline

In [2]:
# Read the Census CSV from the working directory into a DataFrame
data = pd.read_csv(filepath_or_buffer="census.csv")

In [3]:
# Separate the predictors from the target: every column except 'income'
# becomes a feature, and the 'income' column itself is the label.
features_raw = data.drop(columns='income')
income_raw = data['income']

In [4]:
# Columns that receive a log(x + 1) transform.
skewed = ['capital-gain', 'capital-loss']

# Work on an explicit copy. pd.DataFrame(data=features_raw) does not guarantee
# an independent copy (it can share the underlying data, depending on the
# pandas version), in which case the assignment below would also overwrite
# features_raw and every re-run of this cell would log already-logged values.
features_log_transformed = features_raw.copy()

# np.log1p(x) computes log(1 + x) and stays accurate for values close to zero.
features_log_transformed[skewed] = features_raw[skewed].apply(np.log1p)

In [5]:
from sklearn.preprocessing import MinMaxScaler

# Rescale the numeric columns with a MinMaxScaler using its default settings.
scaler = MinMaxScaler() # default=(0, 1)
numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']

# Start from an explicit copy so the scaled values are written into a new
# frame. pd.DataFrame(data=features_log_transformed) does not guarantee an
# independent copy, so the assignment below could otherwise overwrite the
# log-transformed frame that this cell reads from.
features_log_minmax_transform = features_log_transformed.copy()

# NOTE(review): the scaler is fitted on all rows, before the train/test split
# performed later in the notebook, so test rows influence the scaling range.
# Confirm this is acceptable for the evaluation.
features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical])

In [6]:
# One-hot encode the categorical columns with pandas' default get_dummies settings.
features_final = pd.get_dummies(features_log_minmax_transform)

# Encode the 'income_raw' data to numerical values
income_labels = {'<=50K': 0, '>50K': 1}
income = income_raw.map(income_labels)

# Series.map turns every value that is missing from the dict into NaN.
# Fail here with the offending labels instead of carrying NaN targets into
# the split and the model fit.
unmapped = income_raw[income.isna()].unique()
if len(unmapped) > 0:
    raise ValueError("Unexpected income labels: {}".format(list(unmapped)))

# Print the number of features after one-hot encoding
encoded = list(features_final.columns)
print("{} total features after one-hot encoding.".format(len(encoded)))

In [7]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state is fixed so the same
# split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    features_final, income, test_size=0.2, random_state=0
)

In [8]:
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.metrics import accuracy_score, fbeta_score

In [9]:
# Baseline: gradient boosting with default hyperparameters (fit returns the estimator itself).
clf = GradientBoostingClassifier().fit(X_train, y_train)

# Predict on the held-out rows
predictions = clf.predict(X_test)

# Cell output: (accuracy, F-beta with beta=0.5) on the test set
(
    accuracy_score(y_test, predictions),
    fbeta_score(y_test, predictions, beta=0.5),
)

(0.8630182421227197, 0.7395338561802719)

In [10]:
# Gradient boosting with hand-set hyperparameters; verbose=1 prints the
# training loss while the model is being fitted.
clf = GradientBoostingClassifier(
    n_estimators=180,
    learning_rate=0.45,
    min_samples_split=20,
    min_samples_leaf=20,
    verbose=1,
).fit(X_train, y_train)

# Predict on the held-out rows
predictions = clf.predict(X_test)

# Cell output: (accuracy, F-beta with beta=0.5) on the test set
(
    accuracy_score(y_test, predictions),
    fbeta_score(y_test, predictions, beta=0.5),
)

      Iter       Train Loss   Remaining Time 
         1           0.8658            9.79s
         2           0.7786            9.59s
         3           0.7324            9.50s
         4           0.7064            9.54s
         5           0.6806            9.41s
         6           0.6659            9.35s
         7           0.6516            9.28s
         8           0.6398            9.19s
         9           0.6315            9.23s
        10           0.6244            9.16s
        20           0.5900            8.36s
        30           0.5769            7.74s
        40           0.5687            7.16s
        50           0.5594            6.63s
        60           0.5560            6.11s
        70           0.5498            5.58s
        80           0.5477            5.06s
        90           0.5440            4.55s
       100           0.5410            4.03s


(0.8727473742399116, 0.7561841143930698)