In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.ensemble import HistGradientBoostingClassifier

In [2]:
# Column names for the census-income files. They are supplied explicitly
# because both files are read with header=None below.
CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]

# Training split.
train_data_file = "../../input/census_income/adult.data"
train_data = pd.read_csv(train_data_file, header=None, names=CSV_HEADER)

# Test split.
test_data_file = "../../input/census_income/adult.test"
test_data = pd.read_csv(test_data_file, header=None, names=CSV_HEADER)
# The test labels are expected to end with a "." that the training labels do
# not have (presumably " <=50K." vs " <=50K" -- confirm against the raw file).
# Strip only that trailing dot instead of slicing off the last character, so
# a file whose labels are already clean is not truncated into labels that can
# never match the training ones.
test_data['income_bracket'] = test_data['income_bracket'].str.rstrip('.')

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

Train dataset shape: (32561, 15)
Test dataset shape: (16281, 15)


In [3]:
# Preview the first five raw training rows as loaded from disk.
train_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
def _to_binary_flag(column, positive_value):
    """Encode a column as a 0/1 integer flag.

    Parameters
    ----------
    column : pandas.Series
        Column to encode.
    positive_value : str
        Raw value that maps to 1; every other value maps to 0.

    Returns
    -------
    pandas.Series
        int64 Series of 0/1 flags. A column that is already numeric is
        returned unchanged, so re-running this cell keeps the flags instead
        of comparing integers to a string and silently zeroing the feature.
    """
    if pd.api.types.is_numeric_dtype(column):
        return column
    return (column == positive_value).astype('int64')


# NOTE: the raw values carry a leading space (e.g. ' Male'), so the
# comparison strings must include it.
train_data['gender'] = _to_binary_flag(train_data['gender'], ' Male')
test_data['gender'] = _to_binary_flag(test_data['gender'], ' Male')

# native_country is collapsed to "United-States vs. everything else".
train_data['native_country'] = _to_binary_flag(train_data['native_country'], ' United-States')
test_data['native_country'] = _to_binary_flag(test_data['native_country'], ' United-States')

In [5]:
# Categorical columns that get replaced by integer codes.
cat_columns = ['workclass', 'marital_status', 'occupation', 'relationship', 'race']

for col in cat_columns:
    # A fresh encoder per column: the mapping is learned from the training
    # values only and then reused, unchanged, for the test values.
    encoder = LabelEncoder()
    train_data[col] = encoder.fit_transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])

# The fitted encoders are not kept around after the columns are replaced.
del encoder

In [6]:
# Preview the first five training rows after the in-place encoding steps.
train_data.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,39,7,77516,Bachelors,13,4,1,1,4,1,2174,0,40,1,<=50K
1,50,6,83311,Bachelors,13,2,4,0,4,1,0,0,13,1,<=50K
2,38,4,215646,HS-grad,9,0,6,1,4,1,0,0,40,1,<=50K
3,53,4,234721,11th,7,2,6,0,2,1,0,0,40,1,<=50K
4,28,4,338409,Bachelors,13,2,10,5,2,0,0,0,40,0,<=50K


In [7]:
# Feature columns fed to the model (all numeric after the encoding cells).
columns = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender', 'native_country',
           'workclass', 'marital_status', 'occupation', 'relationship', 'race']

# Baseline: default hyper-parameters. Per the scikit-learn documentation the
# default early_stopping='auto' turns early stopping on for training sets with
# more than 10,000 samples, which holds out a random validation split. The
# random_state is fixed so the reported accuracy is the same on every run.
clf = HistGradientBoostingClassifier(random_state=0)
clf.fit(train_data[columns].values, train_data['income_bracket'])
test_preds = clf.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.8721822983846201

In [8]:
# Same feature set as the baseline model.
columns = [
    'education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender', 'native_country',
    'workclass', 'marital_status', 'occupation', 'relationship', 'race',
]

# Tuned variant: smaller learning rate with a larger iteration budget and
# no depth limit. validation_fraction=None means no validation split is
# carved out of the training data (see the scikit-learn docs for how early
# stopping behaves in that case).
clf = HistGradientBoostingClassifier(
    learning_rate=0.01,
    max_iter=1500,
    max_depth=None,
    validation_fraction=None,
).fit(train_data[columns].values, train_data['income_bracket'])

test_preds = clf.predict(test_data[columns].values)
accuracy_score(y_true=test_data['income_bracket'], y_pred=test_preds)

0.8732264602911369