# 1. Load the data

In [1]:
import numpy as np
import pandas as pd

headers_col = ['age', 'workclass', 'fnlwgt', 'education', 'education-num', 'marital-status', 'occupation', \
               'relationship', 'race', 'sex', 'capital-gain', 'capital-loss', 'hours-per-week', 'native-country', 'target']
df = pd.read_csv('adult.data', names=headers_col, index_col=False)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [2]:
print('number of samples and features', df.shape)

number of samples and features (32561, 15)


# 2. Missing values & duplicates

In [3]:
df.duplicated().sum()

24

There are missing values. Since we lack information, it might be best to remove them here:

In [4]:
df = df.drop_duplicates()

In [5]:
df.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
native-country    0
target            0
dtype: int64

No missing values

# 3. Split the data

In [6]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['target'])
y = df['target']==' >50K'
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2, stratify=y, random_state=0)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((26029, 14), (6508, 14), (26029,), (6508,))

# 4. Prepare the data

In [7]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba


In [8]:
quant_cols = ['fnlwgt', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week']
X_train_quant = X_train[quant_cols]
X_test_quant = X_test[quant_cols]

X_train_qual = X_train.drop(columns=quant_cols)
X_test_qual = X_test.drop(columns=quant_cols)

In [9]:
# Rescaling quantitative data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_quant = scaler.fit_transform(X_train_quant)
X_test_quant = scaler.transform(X_test_quant)

In [10]:
# Encoding qualitative data
X_train_qual = pd.get_dummies(X_train_qual, drop_first=True)
X_test_qual = pd.get_dummies(X_test_qual, drop_first=True)

In [11]:
X_train_qual.shape, X_test_qual.shape

((26029, 95), (6508, 92))

In [12]:
# Are there missing columns?
missing_cols = set(X_train_qual.columns) - set(X_test_qual.columns)
missing_cols

{'native-country_ Holand-Netherlands',
 'native-country_ Honduras',
 'workclass_ Never-worked'}

In [13]:
# Add zeros for those columns
for c in missing_cols:
    X_test_qual[c] = 0
# Check there are no more missing columns
print('number of missing columns:', len(set(X_train_qual.columns) - set(X_test_qual.columns)))

number of missing columns: 0


In [14]:
# Concatenate all of this back
X_train = np.concatenate([X_train_quant, X_train_qual], axis=1)
X_test = np.concatenate([X_test_quant, X_test_qual], axis=1)
X_train.shape, X_test.shape

((26029, 100), (6508, 100))

# 5. Model training and optimization

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
# Just a small grid for the sake of the example
grid_params = {'n_estimators': [10, 30, 100],
               'max_depth': [3, 5, 10]}

grid = GridSearchCV(RandomForestClassifier(), 
                    grid_params, 
                    scoring='accuracy',
                    cv=5)

grid.fit(X_train, y_train)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [3, 5, 10],
                         'n_estimators': [10, 30, 100]},
             scoring='accuracy')

In [16]:
print('best accuracy score on train:', grid.best_score_)
print('hyparams of the best score', grid.best_params_)

best accuracy score on train: 0.8566213446909519
hyparams of the best score {'max_depth': 10, 'n_estimators': 100}


# Evaluate the model

In [17]:
from sklearn.metrics import accuracy_score
y_pred = grid.predict(X_test)
accuracy_score(y_test, y_pred)

0.7984019668100799