In [1]:
# Imports

import json # will be needed for saving preprocessing details

import numpy as np # for data manipulation
import pandas as pd # for data manipulation

from sklearn.model_selection import train_test_split # will be used for data split
from sklearn.preprocessing import LabelEncoder # for preprocessing
from sklearn.ensemble import RandomForestClassifier # for training the algorithm
from sklearn.ensemble import ExtraTreesClassifier # for training the algorithm

import joblib # for saving algorithm and preprocessing objects

import ssl
# WARNING (security): this replaces the ssl module's default HTTPS context
# factory with the unverified one, so for the rest of this kernel session
# HTTPS requests that rely on that default no longer validate certificates.
# NOTE(review): presumably a workaround for local certificate errors when
# downloading the dataset below — confirm, and prefer fixing the certificate
# store (or downloading the file once and reading it locally) instead.
ssl._create_default_https_context = ssl._create_unverified_context

## Reading in Data

In [4]:
# Download the adult dataset and preview its first rows

## Location of the raw CSV file
data_url = 'https://raw.githubusercontent.com/pplonski/datasets-for-start/master/adult/data.csv'

## skipinitialspace=True drops the blank that follows each delimiter
df = pd.read_csv(data_url, skipinitialspace=True)
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
# Separate the predictors (X) from the label to predict (y)

## Every column except 'income' is a feature
x_cols = df.columns[df.columns != 'income'].tolist()
X = df[x_cols]

## 'income' is the prediction target
y = df['income']

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split repeatable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

## Missing Values

In [10]:
# Impute missing training values with each feature's most frequent value

## First row of the mode table holds one most-frequent value per column
mode_row = X_train.mode().iloc[0]
train_mode = dict(mode_row)

## Replace missing entries column by column using those values
X_train = X_train.fillna(value=train_mode)

## Categorical Encoding

In [11]:
# Label-encode every categorical feature and keep the fitted encoders

## Names of the categorical columns to encode
categorical_columns = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country',
]

## Maps each column name to the LabelEncoder fitted on it
encoders = {}

for column in categorical_columns:

    ### Fit a fresh encoder on the column and overwrite it with the encoded labels
    label_encoder = LabelEncoder()
    X_train[column] = label_encoder.fit_transform(X_train[column])

    ### Remember the encoder under its column name
    encoders[column] = label_encoder

## Modeling

### Random Forest

In [12]:
# Fitting a 100-tree random forest on the preprocessed training data

## random_state pins the bootstrap sampling and feature selection so that
## re-running the notebook reproduces the same model (same seed value as the split)
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf = rf.fit(X_train, y_train)

### Extra Trees

In [13]:
# Fitting a 100-tree Extra-Trees classifier on the preprocessed training data

## random_state pins the random feature sampling and split draws so that
## re-running the notebook reproduces the same model (same seed value as the split)
et = ExtraTreesClassifier(n_estimators=100, random_state=0)
et = et.fit(X_train, y_train)

## Saving the Preprocessing Objects and Models

In [14]:
# Persist what is needed to repeat the preprocessing later:
# the per-column fill values and the fitted label encoders

joblib.dump(value=train_mode, filename="./train_mode.joblib", compress=True)
joblib.dump(value=encoders, filename="./encoders.joblib", compress=True)

['./encoders.joblib']

In [15]:
# Persist both fitted classifiers (random forest, then extra trees) as compressed joblib files

joblib.dump(value=rf, filename="./random_forest.joblib", compress=True)
joblib.dump(value=et, filename="./extra_trees.joblib", compress=True)

['./extra_trees.joblib']