In [1]:
import pandas as pd
import numpy as np
import pylab as plt

# Process Input

In [2]:
# Column labels for the header-less CSV, listed in file order (42 entries).
# The list previously used 'INDUSTRY' and 'OCCUPATION' twice; duplicate labels
# make census['INDUSTRY'] return a two-column DataFrame instead of a Series,
# so the second pair is renamed to keep every label unique.
# NOTE(review): presumably the first pair holds the detailed recodes and the
# second pair the major group codes -- confirm against the dataset documentation.
CENSUS_COLUMNS = [
    'AGE', 'CLSWKR', 'INDUSTRY', 'OCCUPATION', 'EDU', 'WAGE', 'SCHOOL ATTENDING',
    'MARRIAGE', 'MAJOR INDUSTRY', 'MAJOR OCCUPATION', 'RACE', 'HISPANIC ORI', 'SEX', 'MEM UNION',
    'UNEMPLOY REASON', 'EMPLOYMENT STATUS', 'CAPGAIN', 'CAPLOSS', 'DIVIDENT',
    'FEDTAX', 'TAX FILLER', 'PREVIOUS STATE', 'HOUSEHOLD STATUS',
    'FAMILY STATUS', 'WEIGHT', 'MIGRANT  (MSA)', 'MIGRANT (REG)',
    'MIGRANT (WITHIN REG)', 'LIVE IN HOUSE', 'LIVE IN SUNBELT', 'NUM EMPLOYEES',
    'PARENT PRESENTED', 'FATHER COUNTRY', 'MOTHER COUNTRY', 'SELF COUNTRY',
    'CITIZEN', 'OWN BUSINESS', 'VETERAN QUESIONAIRES', 'VETERAN BENEFIT',
    'WORK WEEKs IN YEAR', 'YEAR', 'EARN',
]

# The file has no header row, so labels are attached after loading.
# Assigning .columns raises if the number of labels does not match the file.
census = pd.read_csv('census_income_learn.csv', header=None)
census.columns = CENSUS_COLUMNS

# Summary statistics of the numeric columns
print(census.describe())

                 AGE           WAGE       CAPGAIN        CAPLOSS  \
count  199523.000000  199523.000000  199523.00000  199523.000000   
mean       34.494199      55.426908     434.71899      37.313788   
std        22.310895     274.896454    4697.53128     271.896428   
min         0.000000       0.000000       0.00000       0.000000   
25%        15.000000       0.000000       0.00000       0.000000   
50%        33.000000       0.000000       0.00000       0.000000   
75%        50.000000       0.000000       0.00000       0.000000   
max        90.000000    9999.000000   99999.00000    4608.000000   

            DIVIDENT         WEIGHT  NUM EMPLOYEES   OWN BUSINESS  \
count  199523.000000  199523.000000  199523.000000  199523.000000   
mean      197.529533    1740.380269       1.956180       0.175438   
std      1984.163658     993.768156       2.365126       0.553694   
min         0.000000      37.870000       0.000000       0.000000   
25%         0.000000    1061.615000       

There are 13 columns with numeric values, and none of these 13 columns has missing values. I will use these 13 numeric columns for the initial model, and will include more features if the accuracy is low. The 42nd column ('EARN'), which serves as the label, is inspected more closely below.

Change the 'EARN' column to [1, 0] for classification

In [3]:
# Encode the income label as an integer class: 0 = ' - 50000.', 1 = ' 50000+.'
EARN_CODES = {' - 50000.': 0, ' 50000+.': 1}

# Values that are not raw labels (e.g. 0/1 from a previous run of this cell)
# pass through unchanged, so re-running the cell is harmless.
earn_encoded = census['EARN'].map(lambda label: EARN_CODES.get(label, label))

# Fail loudly if anything other than the two known labels is present, instead
# of silently leaving stray strings in the target column.
unexpected = set(earn_encoded.unique()) - {0, 1}
if unexpected:
    raise ValueError('Unexpected EARN labels: {!r}'.format(list(unexpected)))

# Store as a real integer column rather than an object column holding ints
census['EARN'] = earn_encoded.astype(int)
print(census['EARN'].unique())

[0 1]


In [4]:
# The data is heavily skewed toward not making 50,000. A model that always
# predicts the majority class would already reach the accuracy printed below.
earn_counts = census['EARN'].value_counts()
print(earn_counts)

# Majority-class share derived from the data itself rather than from
# hand-copied counts; float() keeps true division on Python 2 as well.
print(float(earn_counts.max()) / float(earn_counts.sum()))

0    187141
1     12382
Name: EARN, dtype: int64
0.93794199165


# Simple Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression
# sklearn.cross_validation was removed from scikit-learn; KFold now lives in
# sklearn.model_selection and takes n_splits instead of (n, n_folds).
from sklearn.model_selection import KFold

predictors = ["AGE", "WAGE", "DIVIDENT", "NUM EMPLOYEES", "OWN BUSINESS", "VETERAN BENEFIT", "WORK WEEKs IN YEAR"]

# Initialize our algorithm class
alg = LinearRegression()

# Generate cross validation folds.
# Three contiguous, unshuffled folds: the per-fold predictions are later
# concatenated and compared against census["EARN"] by position, so the test
# folds must cover the rows in their original order. random_state is omitted
# because it has no effect without shuffling (and recent scikit-learn rejects
# it in that case). The splits are materialised as a list so kf can still be
# iterated as (train, test) index pairs, more than once if needed.
kf = list(KFold(n_splits=3).split(census))

predictions = []
for train, test in kf:
    train_predictors = census[predictors].iloc[train, :]
    train_target = census["EARN"].iloc[train]
    # Training the algorithm on the rows outside the current test fold
    alg.fit(train_predictors, train_target)
    # Make predictions on the test fold
    test_predictions = alg.predict(census[predictors].iloc[test, :])
    predictions.append(test_predictions)

In [6]:
# Stitch the per-fold outputs back into a single flat array. The accuracy
# check below compares it with census["EARN"] position by position.
predictions = np.concatenate(predictions, axis=0)

# Turn the continuous regression output into class labels with a 0.5 cut-off
# (only possible outcomes are 1 and 0). Both masks are taken before any value
# is overwritten.
is_positive = predictions > .5
is_negative = predictions <= .5
predictions[is_positive] = 1
predictions[is_negative] = 0

# Share of rows whose thresholded prediction equals the true label
n_correct = (predictions == census["EARN"]).sum()
accuracy = float(n_correct) / len(predictions)
print(accuracy)

0.938778987886


In [7]:
# Check how many rows the model assigns to each class (0 and 1)
unique, counts = np.unique(predictions, return_counts=True)
# print() in call form: valid on both Python 2 and Python 3, and consistent
# with the other cells. Each output row is [class value, number of rows].
print(np.asarray((unique, counts)).T)

[[  0.00000000e+00   1.99240000e+05]
 [  1.00000000e+00   2.83000000e+02]]
