In [128]:
import numpy  as np
import pandas as pd
import matplotlib.pyplot as plt

import statsmodels.formula.api as smf
import statsmodels.api as sm
import statsmodels.graphics as smgraph
import scipy.stats as stats

%matplotlib inline
plt.style.use("ggplot")


# read raw data
remote_train_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data'
remote_test_data = 'https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test'
# column names 
data_columns=['age','workclass','fnlwgt','education','education_num','marital_status','occupation',
              'relationship','race','sex','capital_gain','capital_loss','hours_per_week',
              'native_country','income_range']
continuous_columns = ['age','fnlwgt', 'education_num', 'capital_gain','capital_loss', 'hours_per_week']
categorical_columns=['workclass','education','marital_status','occupation',
              'relationship','race','sex','native_country','income_range']

data_df_train = pd.read_csv(remote_train_data, names=data_columns, sep=",\s+", engine='python')
data_df_test = pd.read_csv(remote_test_data, skiprows=1, names=data_columns, sep=",\s+", engine='python')

#remove the first row from the test data
#data_df_test=data_df_test.drop([0])

#print data_df_train
data_df_train.shape

(32561, 15)

In [129]:
# trim the income_range col in data_df_test for denotation consistency
for index, row in data_df_test.iterrows():
    if row["income_range"]=='<=50K.':
        data_df_test.loc[index,"income_range"]='<=50K'
    elif row["income_range"]=='>50K.':
        data_df_test.loc[index,"income_range"]='>50K'

data_df_test

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,income_range
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K
5,34,Private,198693,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K
6,29,?,227026,HS-grad,9,Never-married,?,Unmarried,Black,Male,0,0,40,United-States,<=50K
7,63,Self-emp-not-inc,104626,Prof-school,15,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103,0,32,United-States,>50K
8,24,Private,369667,Some-college,10,Never-married,Other-service,Unmarried,White,Female,0,0,40,United-States,<=50K
9,55,Private,104996,7th-8th,4,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,10,United-States,<=50K


In [130]:
# Step 1: Check for missing data
# for categorical data type, we replace ? with the most common value 
# by printing out the value_counts() for each column, we observe that there is no missing value in 
# col education, marital_status, relationship, race, sex, and income_range

#create a copy of train dataframe
df_train_processed =data_df_train.copy()

# for col workclass
print data_df_train["workclass"].value_counts()
# the most common value is Private
df_train_processed["workclass"] = data_df_train["workclass"].replace("?", 'Private')
print df_train_processed["workclass"].value_counts()

# for col occupation
print data_df_train["occupation"].value_counts()
# the most common value is Prof-specialty 
df_train_processed["occupation"] = data_df_train["occupation"].replace("?", 'Prof-specialty')
print df_train_processed["occupation"].value_counts()

# for col occupation
print data_df_train["native_country"].value_counts()
# the most common value is United-States 
df_train_processed["native_country"] = data_df_train["native_country"].replace("?", 'United-States')
print df_train_processed["native_country"].value_counts()

Private             22696
Self-emp-not-inc     2541
Local-gov            2093
?                    1836
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64
Private             24532
Self-emp-not-inc     2541
Local-gov            2093
State-gov            1298
Self-emp-inc         1116
Federal-gov           960
Without-pay            14
Never-worked            7
Name: workclass, dtype: int64
Prof-specialty       4140
Craft-repair         4099
Exec-managerial      4066
Adm-clerical         3770
Sales                3650
Other-service        3295
Machine-op-inspct    2002
?                    1843
Transport-moving     1597
Handlers-cleaners    1370
Farming-fishing       994
Tech-support          928
Protective-serv       649
Priv-house-serv       149
Armed-Forces            9
Name: occupation, dtype: int64
Prof-specialty       5983
Craft-repair         4099
Exec-managerial      4066

In [131]:
# Step 2 :now we need to encode categorical values with integer numbers by making use of 
# LabelEncoder provided in sklearn.preprocessing
 
# referred to http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html
from sklearn import preprocessing

#create a df to store integers that represent categorical value
data_train_categorized = pd.DataFrame("?", columns=categorical_columns, index=np.arange(32561))

# LabelEncoder in sklearn.preprocessing
le = preprocessing.LabelEncoder()

# fill values in data_train_categorized by transforming categorical strings to matching integer
for col in categorical_columns:
    le.fit(df_train_processed[col])
    data_train_categorized[col] = le.transform(df_train_processed[col]) 

    
['workclass','education','marital_status','occupation',
              'relationship','race','sex','native_country','income_range']
# such integer representation cannot be directly used by scikit-learn estimators, so 
# we need to transform them into one-hot encoding
enc = preprocessing.OneHotEncoder()

enc.fit(data_train_categorized)
one_hot_encode = enc.transform(data_train_categorized).toarray()
data_train_one_hot = pd.DataFrame(data=one_hot_encode)
data_train_one_hot.head()
# the columns are the enumetation of possible values in all categorical columns
# we dont need to know the order of them if we process test data with the same approach

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [132]:
# Step 3: Standardize continuous columns
# we can use scale function provided in Scikit-Learn
continuous_columns = ['age','fnlwgt', 'education_num', 'capital_gain','capital_loss', 'hours_per_week']
data_train = data_df_train[continuous_columns]

train_scaled = preprocessing.scale(data_train)
                                      
data_train_standardized = pd.DataFrame(train_scaled, columns=continuous_columns)
data_train_standardized.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429


In [133]:
# we can now combine the standardized df with the one-hot encoded categorized df to form a df for ml algorithm

data_train =  data_train_standardized.join(data_train_one_hot)
data_train.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,0,1,2,3,...,91,92,93,94,95,96,97,98,99,100
0,0.030671,-1.063611,1.134739,0.148453,-0.21666,-0.035429,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.837109,-1.008707,1.134739,-0.14592,-0.21666,-2.222153,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.042642,0.245079,-0.42006,-0.14592,-0.21666,-0.035429,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,1.057047,0.425801,-1.197459,-0.14592,-0.21666,-0.035429,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
4,-0.775768,1.408176,1.134739,-0.14592,-0.21666,-0.035429,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [134]:
# Now we need to preprocess the test data using the same method
# Check for missing data
# for categorical data type, we replace ? with the most common value 
# by printing out the value_counts() for each column, we observe that there is no missing value in 
# col education, marital_status, relationship, race, sex, and income_range

#create a copy of test dataframe
df_test_processed =data_df_test.copy()

# for col workclass
print data_df_test["workclass"].value_counts()
# the most common value is Private
df_test_processed["workclass"] = data_df_test["workclass"].replace("?", 'Private')
print df_test_processed["workclass"].value_counts()

# for col occupation
print data_df_test["occupation"].value_counts()
# the most common value is Prof-specialty
df_test_processed["occupation"] = data_df_test["occupation"].replace("?", 'Prof-specialty')
print df_test_processed["occupation"].value_counts()

# for col native_country
print data_df_test["native_country"].value_counts()
# the most common value is United-States
df_test_processed["native_country"] = data_df_test["native_country"].replace("?", 'United-States')
print df_test_processed["native_country"].value_counts()


Private             11210
Self-emp-not-inc     1321
Local-gov            1043
?                     963
State-gov             683
Self-emp-inc          579
Federal-gov           472
Without-pay             7
Never-worked            3
Name: workclass, dtype: int64
Private             12173
Self-emp-not-inc     1321
Local-gov            1043
State-gov             683
Self-emp-inc          579
Federal-gov           472
Without-pay             7
Never-worked            3
Name: workclass, dtype: int64
Prof-specialty       2032
Exec-managerial      2020
Craft-repair         2013
Sales                1854
Adm-clerical         1841
Other-service        1628
Machine-op-inspct    1020
?                     966
Transport-moving      758
Handlers-cleaners     702
Tech-support          518
Farming-fishing       496
Protective-serv       334
Priv-house-serv        93
Armed-Forces            6
Name: occupation, dtype: int64
Prof-specialty       2998
Exec-managerial      2020
Craft-repair         2013

In [135]:
# encode categorical values with integer numbers 
#create a df to store integers that represent categorical value
data_test_categorized = pd.DataFrame("?", columns=categorical_columns, index=np.arange(16281))

# LabelEncoder in sklearn.preprocessing
le = preprocessing.LabelEncoder()

# fill values in data_train_categorized by transforming categorical strings to matching integer
for col in categorical_columns:
    le.fit(df_train_processed[col])
    data_test_categorized[col] = le.transform(df_test_processed[col]) 

# transform them into one-hot encoding
enc = preprocessing.OneHotEncoder()

enc.fit(data_train_categorized)
one_hot_encode = enc.transform(data_test_categorized).toarray()
data_test_one_hot = pd.DataFrame(data=one_hot_encode)
data_test_one_hot.head()



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [136]:
#Standardize continuous columns
# we can use scale function provided in Scikit-Learn
continuous_columns = ['age','fnlwgt', 'education_num', 'capital_gain','capital_loss', 'hours_per_week']
data_test = data_df_test[continuous_columns]

test_scaled = preprocessing.scale(data_test)
                                      
data_test_standardized = pd.DataFrame(test_scaled, columns=continuous_columns)
data_test_standardized.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,-0.994129,0.353474,-1.196864,-0.142662,-0.218062,-0.031432
1,-0.055417,-0.942391,-0.417886,-0.142662,-0.218062,0.769918
2,-0.777503,1.39545,0.750582,-0.142662,-0.218062,-0.031432
3,0.377835,-0.275397,-0.028397,0.871091,-0.218062,-0.031432
4,-1.49959,-0.812954,-0.028397,-0.142662,-0.218062,-0.832781


In [137]:
#combine the standardized df with the one-hot encoded categorized df to form a df for ml algorithm

data_test =  data_test_standardized.join(data_test_one_hot)
data_test.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week,0,1,2,3,...,91,92,93,94,95,96,97,98,99,100
0,-0.994129,0.353474,-1.196864,-0.142662,-0.218062,-0.031432,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
1,-0.055417,-0.942391,-0.417886,-0.142662,-0.218062,0.769918,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0
2,-0.777503,1.39545,0.750582,-0.142662,-0.218062,-0.031432,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
3,0.377835,-0.275397,-0.028397,0.871091,-0.218062,-0.031432,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,-1.49959,-0.812954,-0.028397,-0.142662,-0.218062,-0.832781,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [138]:
# A function to slice the training data
# referred to lab5 material
from numpy.random      import permutation
def fold(df,proportion=.2): 
    random_indices = permutation(df.index)                 # Randomly reorder the index of nba.
    test_cutoff    = int(np.floor(len(df) * proportion))
    test           = df.loc[random_indices[1:test_cutoff]] # A random test set
    train          = df.loc[random_indices[test_cutoff:]]  # A random training set
    
    return test,train

In [139]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier

# Three machine learning algorithms will be used here:
# 1. K nearest neighbors strategy
# 2. Neural Net
# 3. Decision Tree
# referred to CSCI4146 lab5 and 
# http://scikit-learn.org/stable/auto_examples/classification/plot_classifier_comparison.html
# 
x_cols= list(data_train.columns)
x_cols.pop()
x_cols.pop()
# print x_cols
y_col=[99, 100]

test,train = fold(data_train)


In [140]:
# try to find the best performance by adjusting number of neighbors
# referred to lab5 

# KMAX    = 25 # Max K model
# for K in range(5,KMAX):
#     print("Running 5 folds with %i-nearest neighbors..." % K)
#     knn = KNeighborsClassifier(n_neighbors=K)
    
#     for f in range(5):
#         test,train = fold(data_train)

#         # Train and test the model:
#         knn.fit(train[x_cols], train[y_col])
#         # And calculate the Accuracy
#         prediction_score = knn.score(test[x_cols], test[y_col])
#         print "Fold:", f, "Accuracy: ", prediction_score

# this may take a while...

Running 5 folds with 5-nearest neighbors...


KeyboardInterrupt: 

In [141]:
# the Accuracy performance is better when number of neighbors set at 17
# test it on the test dataset
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(train[x_cols], train[y_col]) 

accuracy =knn.score(data_test[x_cols], data_test[y_col])
print accuracy

0.842331552116


In [None]:
# Method 2 Neural Net
test,train = fold(data_train)
mlpc = MLPClassifier() # use default setting
mlpc.fit(train[x_cols], train[y_col])
accuracy =mlpc.score(test[x_cols], test[y_col])
print accuracy

In [126]:
# Method 3  Decision Tree
test,train = fold(data_train)
clf = DecisionTreeClassifier()
clf.fit(train[x_cols], train[y_col])
accuracy = clf.score(test[x_cols], test[y_col])
print accuracy

0.812624788819
