# Load Packages

In [2]:
# Primary Packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [157]:
# Modelling
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Package to Save Model
import joblib

# Metrics
from sklearn.metrics import classification_report

In [41]:
# Pandas options
pd.options.display.max_columns = 999

# Load Data

In [5]:
data = pd.read_csv('data/census.csv')

In [6]:
data.head()

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K


In [7]:
dep_var = 'high_income'
cat_names = ['workclass', 'education_level', 'marital-status',
              'occupation', 'relationship', 'race', 'sex', 
             'native-country']
cont_names = ['age', 'capital-gain', 'capital-loss', 'hours-per-week']

# Feature Engineering

In [14]:
# Create Boolean Target
data['high_income'] = data['income'] == '>50K'

# Multihot encode categorical variables
df_cat = pd.get_dummies(data[cat_names].astype(str))

# Reassign numerical to diff df
df_cont = data[cont_names]

# Normalize numerical features
df_cont_norm = (df_cont-df_cont.min())/(df_cont.max()-df_cont.min())

# Concatenate features
X = pd.concat([df_cat, df_cont_norm], axis=1)

# Create target df
y = data[dep_var]

In [15]:
X.shape, y.shape

((45222, 102), (45222,))

In [25]:
X.columns

Index(['workclass_ Federal-gov', 'workclass_ Local-gov', 'workclass_ Private',
       'workclass_ Self-emp-inc', 'workclass_ Self-emp-not-inc',
       'workclass_ State-gov', 'workclass_ Without-pay',
       'education_level_ 10th', 'education_level_ 11th',
       'education_level_ 12th',
       ...
       'native-country_ Taiwan', 'native-country_ Thailand',
       'native-country_ Trinadad&Tobago', 'native-country_ United-States',
       'native-country_ Vietnam', 'native-country_ Yugoslavia', 'age',
       'capital-gain', 'capital-loss', 'hours-per-week'],
      dtype='object', length=102)

In [19]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 42)

model = RandomForestClassifier(n_estimators=300, random_state=42)
# model = GradientBoostingRegressor(n_estimators=300, random_state=42, )

# Fit Model
%time model.fit(X_train, y_train)

CPU times: user 11.2 s, sys: 284 ms, total: 11.4 s
Wall time: 14.3 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=300,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [23]:
y_pred = model.predict(X_test)

In [24]:
report = classification_report(y_pred, y_test)
print(report)

              precision    recall  f1-score   support

       False       0.91      0.88      0.90     14105
        True       0.63      0.70      0.67      3984

    accuracy                           0.84     18089
   macro avg       0.77      0.79      0.78     18089
weighted avg       0.85      0.84      0.85     18089



# Create A Simpler Model

For our WebApp, let's just use a few simple features like:
- Age
- Hours per Week
- Education Level
- Sex
- Race

In [39]:
# Get features we want and create a new dataframe
columns = ['age', 'hours-per-week', 'education_level', 'sex', 'race']
data_small = data[columns]

In [42]:
# Onehot encode categorical features
data_small_dummies = pd.get_dummies(data_small)
data_small_dummies.head()

Unnamed: 0,age,hours-per-week,education_level_ 10th,education_level_ 11th,education_level_ 12th,education_level_ 1st-4th,education_level_ 5th-6th,education_level_ 7th-8th,education_level_ 9th,education_level_ Assoc-acdm,education_level_ Assoc-voc,education_level_ Bachelors,education_level_ Doctorate,education_level_ HS-grad,education_level_ Masters,education_level_ Preschool,education_level_ Prof-school,education_level_ Some-college,sex_ Female,sex_ Male,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White
0,39,40.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
1,50,13.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
2,38,40.0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1
3,53,40.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0
4,28,40.0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0


# Generate Model

In [61]:
# Assign X and y
X_small, y_small = data_small_dummies, y

In [162]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_small, y_small, test_size = 0.4, random_state = 42)

model = RandomForestClassifier(n_estimators=10, random_state=42)
# model = GradientBoostingRegressor(n_estimators=300, random_state=42, )

# Fit Model
%time model.fit(X_train, y_train)

CPU times: user 194 ms, sys: 8.22 ms, total: 202 ms
Wall time: 227 ms


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [163]:
# Get Test Predictions
y_pred = model.predict(X_test)

# Get Metrics
report = classification_report(y_pred, y_test)
print(report)

              precision    recall  f1-score   support

       False       0.89      0.83      0.86     14636
        True       0.44      0.57      0.49      3453

    accuracy                           0.78     18089
   macro avg       0.66      0.70      0.68     18089
weighted avg       0.80      0.78      0.79     18089



# Save Model

In [164]:
# Save Model
joblib.dump(model, 'model/census_model.pkl')

['model/census_model.pkl']

# Generate Predictions Using Sample Inputs and Saved Model

### Education Levels

In [129]:
education_level_values.values.tolist()

['Bachelors',
 'HS-grad',
 '11th',
 'Masters',
 '9th',
 'Some-college',
 'Assoc-acdm',
 '7th-8th',
 'Doctorate',
 'Assoc-voc',
 'Prof-school',
 '5th-6th',
 '10th',
 'Preschool',
 '12th',
 '1st-4th']

In [83]:
education_level_values = pd.Series(data['education_level'].unique()).str.strip()
education_level_dummies = pd.get_dummies(education_level_values)
education_level_dummies

Unnamed: 0,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0
2,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
4,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
6,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0
7,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0
9,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0


### Race

In [84]:
race_values = pd.Series(data['race'].unique()).str.strip()
race_dummies = pd.get_dummies(race_values)
race_dummies

Unnamed: 0,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White
0,0,0,0,0,1
1,0,0,1,0,0
2,0,1,0,0,0
3,1,0,0,0,0
4,0,0,0,1,0


### Sex

In [85]:
sex_values = pd.Series(data['sex'].unique()).str.strip()
sex_dummies = pd.get_dummies(sex_values)
sex_dummies

Unnamed: 0,Female,Male
0,0,1
1,1,0


In [115]:
data_small_dummies.columns

Index(['age', 'hours-per-week', 'education_level_ 10th',
       'education_level_ 11th', 'education_level_ 12th',
       'education_level_ 1st-4th', 'education_level_ 5th-6th',
       'education_level_ 7th-8th', 'education_level_ 9th',
       'education_level_ Assoc-acdm', 'education_level_ Assoc-voc',
       'education_level_ Bachelors', 'education_level_ Doctorate',
       'education_level_ HS-grad', 'education_level_ Masters',
       'education_level_ Preschool', 'education_level_ Prof-school',
       'education_level_ Some-college', 'sex_ Female', 'sex_ Male',
       'race_ Amer-Indian-Eskimo', 'race_ Asian-Pac-Islander', 'race_ Black',
       'race_ Other', 'race_ White'],
      dtype='object')

### Load Model

In [152]:
model = joblib.load('model/census_model.pkl')

In [87]:
education_level_values

0        Bachelors
1          HS-grad
2             11th
3          Masters
4              9th
5     Some-college
6       Assoc-acdm
7          7th-8th
8        Doctorate
9        Assoc-voc
10     Prof-school
11         5th-6th
12            10th
13       Preschool
14            12th
15         1st-4th
dtype: object

In [88]:
np.where(education_level_values == education_level_sample)

(array([0]),)

In [96]:
race_values

0                 White
1                 Black
2    Asian-Pac-Islander
3    Amer-Indian-Eskimo
4                 Other
dtype: object

In [107]:
# Age
age = 21

# Hours per Pweek
hours = 80

# Education Level
education_level_sample = 'HS-grad'
education_level_sample_dummies = (education_level_dummies.loc[np.where(education_level_values.values == education_level_sample)[0]]
                                  .values.tolist()[0])

# Race
race_sample = 'White'
race_sample_dummies = race_dummies.loc[np.where(race_values.values == race_sample)[0]].values.tolist()[0]

# Gender/Sex
sex_sample = 'Male'
sex_sample_dummies = sex_dummies.loc[np.where(sex_values.values == sex_sample)[0]].values.tolist()[0]

In [122]:
# Concatenate features for sample prediction
sample_features = [age, hours] + education_level_sample_dummies + sex_sample_dummies + race_sample_dummies

In [123]:
len(sample_features)

25

In [160]:
# Sample Predictions
prediction = model.predict([sample_features])[0]
prediction

False