# 2.1 Imports

In [1]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline

# from sklearn.preprocessing import OrdinalEncoder

In [2]:
# ignore warnings
import warnings
warnings.filterwarnings('ignore')

In [3]:
# load train data
train = pd.read_csv('data/train_cleaned.csv')

In [4]:
# load test data
test = pd.read_csv('data/test_cleaned.csv')

In [5]:
train.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
0,39,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [6]:
train.tail()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
32556,27,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,HS-grad,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [7]:
test.head()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
0,25.0,Private,11th,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
1,38.0,Private,HS-grad,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
2,28.0,Local-gov,Assoc-acdm,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
3,44.0,Private,Some-college,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
4,18.0,Private,Some-college,Never-married,Prof-specialty,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


In [8]:
test.tail()

Unnamed: 0,age,workclass,education,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,target
16276,39.0,Private,Bachelors,Divorced,Prof-specialty,Not-in-family,White,Female,0.0,0.0,36.0,United-States,<=50K.
16277,64.0,Private,HS-grad,Widowed,Prof-specialty,Other-relative,Black,Male,0.0,0.0,40.0,United-States,<=50K.
16278,38.0,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
16279,44.0,Private,Bachelors,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455.0,0.0,40.0,United-States,<=50K.
16280,35.0,Self-emp-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,60.0,United-States,>50K.


In [9]:
train.shape

(32561, 13)

In [10]:
test.shape

(16281, 13)

# 2.2 Data Encoding

# 2.2.1 'education'

In [11]:
train['education'].value_counts()

 HS-grad         10501
 Some-college     7291
 Bachelors        5355
 Masters          1723
 Assoc-voc        1382
 11th             1175
 Assoc-acdm       1067
 10th              933
 7th-8th           646
 Prof-school       576
 9th               514
 12th              433
 Doctorate         413
 5th-6th           333
 1st-4th           168
 Preschool          51
Name: education, dtype: int64

In [12]:
len(train['education'].value_counts())

16

In [13]:
test['education'].value_counts()

 HS-grad         5283
 Some-college    3587
 Bachelors       2670
 Masters          934
 Assoc-voc        679
 11th             637
 Assoc-acdm       534
 10th             456
 7th-8th          309
 Prof-school      258
 9th              242
 12th             224
 Doctorate        181
 5th-6th          176
 1st-4th           79
 Preschool         32
Name: education, dtype: int64

In [14]:
len(test['education'].value_counts())

16

In [15]:
test.education.dtype

dtype('O')

In [17]:
# create function
# ordinal encoder

def edcu_encoder(x):
    if x in [' Doctorate']:
        return 4
    elif x in [' Masters', ' Prof-school']:
        return 3
    elif x in [' Bachelors']:
        return 2
    elif x in [' Some-college', ' Assoc-voc', ' Assoc-acdm']:
        return 1
    else:
        return 0

In [18]:
# merging education level according education ladder in us

train['education'] = train['education'].apply(edcu_encoder)
test['education'] = test['education'].apply(edcu_encoder)

In [19]:
train['education'].value_counts()

0    14754
1     9740
2     5355
3     2299
4      413
Name: education, dtype: int64

In [21]:
test['education'].value_counts()

0    7438
1    4800
2    2670
3    1192
4     181
Name: education, dtype: int64

# 2.2.2 'capital_gain' and 'capital_loss'

In [46]:
# create function
# combine capital change

def capital_change(row):
    return row['capital_gain'] - row['capital_loss']

In [48]:
# combine two related features

train['capital_gain'] = train.apply(lambda row: capital_change(row), axis = 1)
test['capital_gain'] = test.apply(lambda row: capital_change(row), axis = 1)

In [62]:
# change column name to 'capital_change'
# remove 'capital_loss'

train.rename(columns={'capital_gain':'capital_change'}, inplace=True)
train.drop(columns=['capital_loss'], inplace=True)

test.rename(columns={'capital_gain':'capital_change'}, inplace=True)
test.drop(columns=['capital_loss'], inplace=True)

In [59]:
train.describe()

Unnamed: 0,age,education,capital_change,hours_per_week
count,32561.0,32561.0,32561.0,32561.0
mean,38.581647,0.890605,990.345014,40.437456
std,13.640433,1.00111,7408.986951,12.347429
min,17.0,0.0,-4356.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,1.0,0.0,40.0
75%,48.0,1.0,0.0,45.0
max,90.0,4.0,99999.0,99.0


In [63]:
test.describe()

Unnamed: 0,age,education,capital_change,hours_per_week
count,16281.0,16281.0,16281.0,16281.0
mean,38.767459,0.886923,994.005835,40.392236
std,13.849187,1.000517,7607.153716,12.479332
min,17.0,0.0,-3770.0,1.0
25%,28.0,0.0,0.0,40.0
50%,37.0,1.0,0.0,40.0
75%,48.0,1.0,0.0,45.0
max,90.0,4.0,99999.0,99.0


In [None]:
# have to amend: adding negative values into the encoder

In [43]:
# obserce max value
train[train['capital_gain'] == 99999].count()[0]

159

In [None]:
# all values over 99999 were presented as 99999

In [40]:
# obserce values between 50000 and 99999
train[(train['capital_gain'] >= 50000) & (train['capital_gain'] < 99999)].count()[0]

0

In [38]:
# observe values between 0 and 50000
train[(train['capital_gain'] > 0) & (train['capital_gain'] < 50000)].count()[0]

2553

In [None]:
# a small portion in this range

In [42]:
# observe data with no capital gain
train[train['capital_gain'] == 0].count()[0]

29849

In [None]:
# most data are with zero capital gain

In [None]:
# create function
# ordinal encoder

def gain_encoder(x):
    if x == 0:
        return 0
    elif (x > 0) and (x < 50000):
        return 1
    elif (x > 50000) and (x < 99999):
        return 2
    elif x == 99999:
        return 3

In [20]:
# END