In [1]:
import numpy as np
import pandas as pd
import re
import nltk
import matplotlib.pyplot as plt
%matplotlib inline
import mltools as ml
import math
import csv

In [2]:
from numpy import asarray as arr
from numpy import asmatrix as mat
from numpy import atleast_2d as twod
from itertools import islice

### Load data

In [3]:
# Both splits are parsed with identical options: comma-delimited, and spaces
# that directly follow a delimiter are skipped.
csv_options = dict(delimiter=',', skipinitialspace=True)
data_train = pd.read_csv('./data/adult.data', **csv_options)
data_test = pd.read_csv('./data/adult.test', **csv_options)

In [4]:
# Column labels of the training frame as loaded, captured before any derived
# columns are added.
orig_keys = data_train.columns

In [5]:
# Report, per column of the training frame, how many cells hold the '?'
# placeholder that marks a missing value.
for key in data_train.keys():
    # Vectorized count of True entries; the builtin sum() would walk the
    # boolean Series one element at a time in Python.
    n_missing = (data_train[key] == '?').sum()
    print('number of missing values (%s): %d' % (key, n_missing))

number of missing values (age): 0
number of missing values (workclass): 1836
number of missing values (fnlwgt): 0
number of missing values (education): 0
number of missing values (education-num): 0
number of missing values (marital-status): 0
number of missing values (occupation): 1843
number of missing values (relationship): 0
number of missing values (race): 0
number of missing values (sex): 0
number of missing values (capital-gain): 0
number of missing values (capital-loss): 0
number of missing values (hours-per-week): 0
number of missing values (native-country): 583
number of missing values (salary): 0


  result = method(y)


### Remove missing data

In [6]:
def _without_missing_rows(frame, marker='?'):
    """Return `frame` restricted to the rows in which no column equals `marker`."""
    kept = frame
    # Filter one column at a time; each pass narrows the previous result.
    for column in frame.keys():
        kept = kept[kept[column] != marker]
    return kept


# remove rows with missing values for training data
data_train = _without_missing_rows(data_train)

# remove rows with missing values for test data
data_test = _without_missing_rows(data_test)

In [7]:
# Row counts of the two filtered splits; n_train is reused further down as the
# positional boundary when the combined frame is split again.
n_train = data_train.shape[0]
n_test = data_test.shape[0]
print('number of train: %d' % n_train)
print('number of test: %d' % n_test)

number of train: 30162
number of test: 15060


In [8]:
# Stack the training rows on top of the test rows (row-wise concatenation is
# the default) so the encoding steps below see every category from both splits
# and produce the same columns for each.
data = pd.concat([data_train, data_test])

### Convert categorical data to binary

In [9]:
# Every non-numeric column of the data set, including the target ('salary').
categorical_columns_all = [
    'salary',
    'sex',
    'education',
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'native-country',
]

In [10]:
# convert salary to binary values
# The class is derived from the label text itself rather than from the position
# of the label in `unique()`: the positional `i % 2` encoding silently flips or
# mixes classes whenever the labels first appear in a different order.
# NOTE(review): assumes the high-income labels start with '>' and the
# low-income labels with '<' (e.g. '>50K', '<=50K.') - confirm against the data.
salary_labels = data['salary'].unique()

salary_text = data['salary'].astype(str).str.strip()
is_high_income = salary_text.str.startswith('>')
is_low_income = salary_text.str.startswith('<')

# Fail loudly instead of writing an arbitrary class for labels we do not know.
unrecognized = ~(is_high_income | is_low_income)
if unrecognized.any():
    raise ValueError('unexpected salary labels: %s' % sorted(set(salary_text[unrecognized])))

# Stored as float (0.0 / 1.0), matching the dtype the column had before.
data['salary_binary'] = is_high_income.astype(float)

In [11]:
# convert sex to binary values
# Each distinct label gets the position at which it first appears in `data`
# (0.0, 1.0, ...), applied with a single vectorized lookup instead of one
# `.loc` assignment per label.
gender_labels = data['sex'].unique()

# Missing labels are left out of the lookup so they stay NaN in the new column.
gender_codes = {
    label: float(code)
    for code, label in enumerate(gender_labels)
    if pd.notna(label)
}
data['sex_binary'] = data['sex'].map(gender_codes)

### Convert categorical data to ordinal data

In [12]:
# Education levels listed from lowest to highest; the position in this list is
# the rank written to 'education_ord'.
education_labels_sorted = [
    'Preschool', '1st-4th', '5th-6th', '7th-8th', '9th',
    '10th', '11th', '12th',
    'HS-grad', 'Some-college', 'Assoc-acdm', 'Assoc-voc',
    'Bachelors', 'Masters', 'Prof-school', 'Doctorate',
]

# convert education to ranks
# Labels missing from the list above map to NaN; ranks are stored as floats.
education_rank = {
    label: float(rank) for rank, label in enumerate(education_labels_sorted)
}
data['education_ord'] = data['education'].map(education_rank)

### Convert categorical data with one-hot-encoding

In [13]:
# Nominal columns that are expanded into one indicator column per category.
categorical_columns = [
    'workclass',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'native-country',
]

In [14]:
# Build the indicator columns for every nominal column, then join them in a
# single concat. Growing a DataFrame with concat inside the loop re-copies all
# previously added columns on every iteration.
encoded_frames = []
for col in categorical_columns:
    # Passing a Categorical (not the Series) gives the result a fresh 0..n-1
    # index, i.e. rows are identified by position rather than by data's index.
    one_hot_columns = pd.get_dummies(pd.Categorical(data[col]), prefix = col)
    print('number of added columns (%s): %d' % (col, one_hot_columns.shape[1]))
    encoded_frames.append(one_hot_columns)

# pd.concat rejects an empty list, so keep the empty-frame result for that case.
one_hot_df = pd.concat(encoded_frames, axis=1) if encoded_frames else pd.DataFrame()

number of added columns (workclass): 7
number of added columns (marital-status): 7
number of added columns (occupation): 14
number of added columns (relationship): 6
number of added columns (race): 5
number of added columns (native-country): 41


In [15]:
# Attach the one-hot columns to `data` by row position.
# reset_index(drop=True) renumbers the rows 0..n-1 without materializing the old
# index as a temporary 'index' column that then has to be dropped again.
data = data.reset_index(drop=True)

# A row-count mismatch would make the column-wise concat pad with NaN silently.
if len(one_hot_df.columns) and len(one_hot_df) != len(data):
    raise ValueError('one_hot_df has %d rows but data has %d' % (len(one_hot_df), len(data)))

# Only add columns that are not present yet, so re-running this cell does not
# append a second copy of every indicator column.
new_columns = [c for c in one_hot_df.columns if c not in data.columns]
data = pd.concat([data, one_hot_df[new_columns]], axis=1)

### Split data into training and test set according to indices

In [16]:
# The first n_train rows of the combined frame are the training rows; everything
# after that position belongs to the test split.
data_train_new, data_test_new = data.iloc[:n_train], data.iloc[n_train:]

In [17]:
# Write each preprocessed split to its own CSV file in the working directory
# (the row index is written as the first column, as is to_csv's default).
for split_frame, csv_path in ((data_train_new, 'training_preprocessed.csv'),
                              (data_test_new, 'test_preprocessed.csv')):
    split_frame.to_csv(csv_path)