# Census Income

## 1. Import Data

In [14]:
## Library Import and Initializations
# Data Library and Preprocessing
import pandas as pd
from pandas import DataFrame
import numpy as np
from collections import defaultdict
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import Imputer

# Algorithm
from sklearn.neural_network import MLPClassifier

# Test Type Algorithm
from sklearn.model_selection import cross_val_score

In [15]:
# Read CSV
data = pd.read_csv('data/CensusIncome.data.csv', header=None, sep = ",\s", engine="python", na_values=["?"])

# Turn string into integer, and return converted NaN to NaN
label = defaultdict(LabelEncoder)
data = data.fillna("NaN")
data = data.apply(lambda x: x if x.dtype != 'O' else label[x.name].fit_transform(x))

for x in label:
    if("NaN" in label[x].classes_):
        data[x] = data[x].replace(label[x].transform(["NaN"])[0], np.nan)

### Creating Imputed data

In [16]:
## Fill NaN values with dummy data, and then transform it using transform
imp = Imputer(strategy='mean', axis=0)

data_impute = data.copy(deep = True);
data_impute = data_impute.fillna("NaN")

imp.fit(data_impute)
data_impute = DataFrame(imp.transform(data_impute))

# Encode categorical data
data_impute = pd.get_dummies(data=data_impute, columns=list(label.keys()))
data_impute_x, data_impute_y = data_impute.iloc[:,:-1], data_impute.iloc[:, -1]

### Creating no NaN data

In [17]:
# Drop datas with NaN(null) values
data_drop = data.copy()
data_drop = data_drop.dropna(axis=0, how='any')

# Encode categorical data
data_drop = pd.get_dummies(data=data_drop, columns=list(label.keys()))
data_drop_x, data_drop_y = data_drop.iloc[:,:-1], data_drop.iloc[:, -1]

## 2. Training Model

### Multi Layer Perceptron

In [18]:
mlp = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5,2), random_state=1)

# Drop Data
print (cross_val_score(mlp, data_drop_x, data_drop_y, cv=10))

# Impute Data
print (cross_val_score(mlp, data_impute_x, data_impute_y, cv=10))

[ 0.75107723  0.75107723  0.75107723  0.75107723  0.75099469  0.75099469
  0.75099469  0.75099469  0.75124378  0.75124378]
[ 0.75928769  0.75982801  0.75952088  0.75921376  0.75952088  0.80251843
  0.75982801  0.75982801  0.75982801  0.75952088]


In [None]:
dt = tree.DecisionTreeClassifier(criterion='entropy')
print (cross_val_score(dt, data_drop_x, data_drop_y, cv=10))
print (cross_val_score(dt, data_impute_x, data_impute_y, cv=10))