In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder, LabelBinarizer
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.neural_network import MLPClassifier

In [2]:
# Column names applied to both census files. The files are read with
# header=None, i.e. they are treated as having no header row of their own.
CSV_HEADER = [
    "age",
    "workclass",
    "fnlwgt",
    "education",
    "education_num",
    "marital_status",
    "occupation",
    "relationship",
    "race",
    "gender",
    "capital_gain",
    "capital_loss",
    "hours_per_week",
    "native_country",
    "income_bracket",
]


def read_census_csv(path):
    """Read one headerless census CSV file into a DataFrame.

    Parameters
    ----------
    path : str
        Location of the CSV file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with columns named after CSV_HEADER.
    """
    return pd.read_csv(path, header=None, names=CSV_HEADER)


train_data_file = "../../input/census_income/adult.data"
train_data = read_census_csv(train_data_file)

test_data_file = "../../input/census_income/adult.test"
test_data = read_census_csv(test_data_file)

# Remove every "." from the test labels.
# NOTE(review): presumably the test file spells its labels with a trailing
# period that the training file lacks - confirm against the raw data.
# regex=False is required: as a regular expression "." would match any
# character and blank out the whole label.
test_data["income_bracket"] = test_data["income_bracket"].str.replace(
    ".", "", regex=False
)

print(f"Train dataset shape: {train_data.shape}")
print(f"Test dataset shape: {test_data.shape}")

Train dataset shape: (32561, 15)
Test dataset shape: (16281, 15)


In [3]:
# Turn two categorical columns into 0/1 indicator columns, in both frames.
# Each pair is (column name, the single value that maps to 1); every other
# value maps to 0. The match strings start with a space exactly as written
# here - NOTE(review): presumably the raw CSV fields carry that leading
# space; verify against the loaded data.
#
# Caution: the source columns are overwritten. Running this cell a second
# time compares the already-converted integers with the strings below, so
# every value becomes 0. Reload the data before re-running.
INDICATOR_RULES = (
    ('gender', ' Male'),
    ('native_country', ' United-States'),
)

for frame in (train_data, test_data):
    for column_name, positive_value in INDICATOR_RULES:
        is_positive = frame[column_name] == positive_value
        frame[column_name] = is_positive.astype('int64')

In [4]:
def add_one_hot_columns(train_df, test_df, column):
    """Append indicator columns for one categorical column to both frames.

    A LabelBinarizer is fitted on the training values only and then applied
    to the test values, so both frames get the same set of columns. The new
    columns are named ``f"{column}{i}"`` where ``i`` is the position of the
    column in the binarizer output. The original ``column`` is kept.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames that both contain ``column``. They are not modified.
    column : str
        Name of the categorical column to encode.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        New train and test frames with the indicator columns appended.
    """
    binarizer = LabelBinarizer()
    train_encoded = binarizer.fit_transform(train_df[column].values)
    test_encoded = binarizer.transform(test_df[column].values)

    # Derive the names from the encoder output instead of typing them out,
    # so the number of names always matches the number of encoded columns.
    names = [f"{column}{i}" for i in range(train_encoded.shape[1])]

    extended = []
    for frame, encoded in ((train_df, train_encoded), (test_df, test_encoded)):
        # Reuse the frame's own index so concat lines the rows up one-to-one.
        indicators = pd.DataFrame(encoded, columns=names, index=frame.index)
        # Drop indicator columns left over from a previous run of this cell
        # so that re-running it does not create duplicate column names.
        base = frame.drop(columns=names, errors="ignore")
        extended.append(pd.concat([base, indicators], axis=1))
    return extended[0], extended[1]


# Encoded in this order so the appended columns appear in the same order as
# before: race*, relationship*, workclass*, marital_status*, occupation*.
CATEGORICAL_COLUMNS = ['race', 'relationship', 'workclass', 'marital_status', 'occupation']

for categorical_column in CATEGORICAL_COLUMNS:
    train_data, test_data = add_one_hot_columns(train_data, test_data, categorical_column)


In [5]:
# Feature columns fed to the models: five numeric columns, the 0/1 gender
# flag, then the numbered indicator columns of each encoded categorical.
BASE_FEATURES = ['education_num', 'age', 'capital_gain', 'capital_loss', 'hours_per_week', 'gender']

# (prefix, number of indicator columns) - expands to race0..race4,
# relationship0..relationship5, workclass0..workclass8,
# marital_status0..marital_status6 and occupation0..occupation14.
INDICATOR_WIDTHS = [
    ('race', 5),
    ('relationship', 6),
    ('workclass', 9),
    ('marital_status', 7),
    ('occupation', 15),
]

columns = BASE_FEATURES + [
    f'{prefix}{position}'
    for prefix, width in INDICATOR_WIDTHS
    for position in range(width)
]

# Baseline: MLPClassifier with default hyper-parameters. The seed is fixed
# (same value as the tuned model in the next cell) so the score can be
# reproduced; the accuracy recorded under this cell was produced without a
# seed and will differ after a re-run.
clf = MLPClassifier(random_state=2022)
clf.fit(train_data[columns].values, train_data['income_bracket'])
test_preds = clf.predict(test_data[columns].values)
accuracy_score(test_data['income_bracket'], test_preds)

0.814937657392052

In [6]:
# Second model: one hidden layer of 80 units, at most 100 training
# iterations, fixed seed. validation_fraction is passed through unchanged;
# per the scikit-learn documentation it is only used when early_stopping
# is True, which is not set here.
mlp_settings = {
    'hidden_layer_sizes': (80,),
    'max_iter': 100,
    'random_state': 2022,
    'validation_fraction': 0,
}
clf = MLPClassifier(**mlp_settings)

train_features = train_data[columns].values
test_features = test_data[columns].values

clf.fit(train_features, train_data['income_bracket'])
test_preds = clf.predict(test_features)

# Last expression of the cell: test-set accuracy, displayed as the output.
accuracy_score(test_data['income_bracket'], test_preds)

0.8503777409250046