In [1]:
import numpy as np
import pandas as pd

# Loading Data

And preliminary wrangling to fix column names and previously noted that '?' appears as a Nan value in the `capital-gain` column so set `na_values` to account for it.

In [108]:
census = pd.read_csv('Datasets/census.data', header=None, names=['education', 'age', 'capital-gain', 
                                                             'race', 'capital-loss', 'hours-per-week', 
                                                             'sex', 'classification'], na_values='?')

# Exploring and Cleaning

In [109]:
census.head()

Unnamed: 0,education,age,capital-gain,race,capital-loss,hours-per-week,sex,classification
0,Bachelors,39,2174.0,White,0,40,Male,<=50K
1,Bachelors,50,,White,0,13,Male,<=50K
2,HS-grad,38,,White,0,40,Male,<=50K
3,11th,53,,Black,0,40,Male,<=50K
4,Bachelors,28,0.0,Black,0,40,Female,<=50K


In [119]:
print census['capital-loss'].hasnans
print census['capital-gain'].hasnans

False
True


In [110]:
set(census.race)

{'Amer-Indian-Eskimo', 'Asian-Pac-Islander', 'Black', 'Other', 'White'}

In [111]:
census.age.describe()

count    29536.000000
mean        38.506094
std         13.811739
min         17.000000
25%         27.000000
50%         37.000000
75%         48.000000
max         90.000000
Name: age, dtype: float64

In [112]:
census.age.hasnans

False

In [113]:
print census['capital-loss'].dtype
print census['capital-loss'].hasnans

int64
False


In [114]:
print census['hours-per-week'].dtype

int64


In [115]:
census.classification.value_counts()

<=50K    22744
>50K      6792
Name: classification, dtype: int64

## Convert classification into numerical category

In [60]:
census.classification = census.classification.astype('category').cat.codes

In [61]:
census.head()

Unnamed: 0,education,age,capital-gain,race,capital-loss,hours-per-week,sex,classification
0,Bachelors,39,2174.0,White,0,40,Male,0
1,Bachelors,50,,White,0,13,Male,0
2,HS-grad,38,,White,0,40,Male,0
3,11th,53,,Black,0,40,Male,0
4,Bachelors,28,0.0,Black,0,40,Female,0


In [62]:
census.classification.value_counts()

0    22744
1     6792
Name: classification, dtype: int64

## Seems like using 0.0 for capital-gain Nan is reasonable?

In [63]:
census = census.fillna(0.0)

In [64]:
census.head()

Unnamed: 0,education,age,capital-gain,race,capital-loss,hours-per-week,sex,classification
0,Bachelors,39,2174.0,White,0,40,Male,0
1,Bachelors,50,0.0,White,0,13,Male,0
2,HS-grad,38,0.0,White,0,40,Male,0
3,11th,53,0.0,Black,0,40,Male,0
4,Bachelors,28,0.0,Black,0,40,Female,0


## Properly encode any ordinal features using the method discussed in the chapter.

I don't think any are ordinal

## Properly encode any nominal features by exploding them out into new, separate, boolean features.

>Caution:  They want new and separate features

In [65]:
census.columns

Index([u'education', u'age', u'capital-gain', u'race', u'capital-loss',
       u'hours-per-week', u'sex', u'classification'],
      dtype='object')

In [98]:
census2 = pd.concat([census, pd.get_dummies(census.race).astype(np.int),
                    pd.get_dummies(census.sex).astype(np.int)], axis=1)

In [99]:
census2.head()

Unnamed: 0,education,age,capital-gain,race,capital-loss,hours-per-week,sex,classification,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male
0,Bachelors,39,2174.0,White,0,40,Male,0,0,0,0,0,1,0,1
1,Bachelors,50,0.0,White,0,13,Male,0,0,0,0,0,1,0,1
2,HS-grad,38,0.0,White,0,40,Male,0,0,0,0,0,1,0,1
3,11th,53,0.0,Black,0,40,Male,0,0,0,1,0,0,0,1
4,Bachelors,28,0.0,Black,0,40,Female,0,0,0,1,0,0,1,0


In [100]:
census2.education.value_counts()

HS-grad         10501
Some-college     7291
Bachelors        5355
Masters          1723
11th             1175
10th              933
7th-8th           646
9th               514
12th              433
Doctorate         413
5th-6th           333
1st-4th           168
Preschool          51
Name: education, dtype: int64

In [101]:
ordered_education = ['Preschool', '1st-4th', '5th-6th', '7th-8th', '9th', 
                    '10th', '11th', '12th', 'HS-grad', 'Some-college', 'Bachelors', 'Masters', 'Doctorate']
print len(ordered_education)
print len(set(census2.education))

13
13


In [102]:
len(set(ordered_education)&set(census2.education))

13

In [103]:
census2.education = census2.education.astype('category', ordered=True, categories=ordered_education).cat.codes

In [104]:
census2.head()

Unnamed: 0,education,age,capital-gain,race,capital-loss,hours-per-week,sex,classification,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male
0,10,39,2174.0,White,0,40,Male,0,0,0,0,0,1,0,1
1,10,50,0.0,White,0,13,Male,0,0,0,0,0,1,0,1
2,8,38,0.0,White,0,40,Male,0,0,0,0,0,1,0,1
3,6,53,0.0,Black,0,40,Male,0,0,0,1,0,0,0,1
4,10,28,0.0,Black,0,40,Female,0,0,0,1,0,0,1,0


In [105]:
print len(census2.columns)
print len(census2.columns) - len(census.columns) 

15
7


In [106]:
census2.columns

Index([u'education', u'age', u'capital-gain', u'race', u'capital-loss',
       u'hours-per-week', u'sex', u'classification', u'Amer-Indian-Eskimo',
       u'Asian-Pac-Islander', u'Black', u'Other', u'White', u'Female',
       u'Male'],
      dtype='object')

In [107]:
census.columns

Index([u'education', u'age', u'capital-gain', u'race', u'capital-loss',
       u'hours-per-week', u'sex', u'classification'],
      dtype='object')

In [123]:
age_sex = pd.concat([pd.get_dummies(census.race), pd.get_dummies(census.sex)], axis=1)

In [124]:
age_sex.head()

Unnamed: 0,Amer-Indian-Eskimo,Asian-Pac-Islander,Black,Other,White,Female,Male
0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0


In [125]:
census3 = pd.get_dummies(census, columns=['race', 'sex', 'classification'])

In [126]:
census3.education = census3.education.astype('category', ordered=True, categories=ordered_education).cat.codes

In [127]:
census3.head()

Unnamed: 0,education,age,capital-gain,capital-loss,hours-per-week,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,classification_<=50K,classification_>50K
0,10,39,2174.0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,10,50,,0,13,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,8,38,,0,40,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,6,53,,0,40,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,10,28,0.0,0,40,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


# Answers

There's a lot to be learned here believe it or not.  I was thinking that all I had to do was encode 'classification' as 1 or zero but that still makes it ordinal and so although we ultimately only need to treat `classification_>50K` as the label for prediction, we do need to one-hot encode `classification`.  Doing so produces the correct answer for the course lab questions which is that 9 new binary features are produced.

And the lab assignment does expect you to keep only the encoded features in the final dataset.  Somehow I misread the directions as if the original features should be kept as well.  That didn't seem useful to me as input to scikit-learn models but I thought oh well, whatever they want.  

>Issue:  pd.get_dummies produces floats so should probably convert all new columns to ints

In [129]:
census3 = census3.fillna(0.0)

In [130]:
census3.head()

Unnamed: 0,education,age,capital-gain,capital-loss,hours-per-week,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,classification_<=50K,classification_>50K
0,10,39,2174.0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
1,10,50,0.0,0,13,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
2,8,38,0.0,0,40,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
3,6,53,0.0,0,40,0.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0
4,10,28,0.0,0,40,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0


In [131]:
len(census3.columns)

14

In [133]:
float_cols = list(set(census3.columns) - set(census.columns))
float_cols

['race_Amer-Indian-Eskimo',
 'classification_<=50K',
 'race_White',
 'sex_Male',
 'sex_Female',
 'race_Black',
 'race_Asian-Pac-Islander',
 'classification_>50K',
 'race_Other']

In [134]:
for c in float_cols:
    census3[c] = census3[c].astype(np.int)

In [135]:
census3.head()

Unnamed: 0,education,age,capital-gain,capital-loss,hours-per-week,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,classification_<=50K,classification_>50K
0,10,39,2174.0,0,40,0,0,0,0,1,0,1,1,0
1,10,50,0.0,0,13,0,0,0,0,1,0,1,1,0
2,8,38,0.0,0,40,0,0,0,0,1,0,1,1,0
3,6,53,0.0,0,40,0,0,1,0,0,0,1,1,0
4,10,28,0.0,0,40,0,0,1,0,0,1,0,1,0
