In [2]:
# Import the necessary modules and libraries
import numpy as np
from sklearn import linear_model, datasets
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix as cm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('ggplot')

In [3]:
#Disclosure:I don't know how to use random_state.
np.random.seed(5)
train = pd.read_csv('adult_train.csv', header=0)
test = pd.read_csv('adult_test.csv', header=0)

In [4]:
train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [5]:
#data cleaning, remove white space
train['workclass'] = train['workclass'].str.strip()
train['marital-status'] = train['marital-status'].str.strip()
train['race'] = train['race'].str.strip()
train['sex'] = train['sex'].str.strip()
train['relationship'] = train['relationship'].str.strip()

In [6]:
train['workclass'].unique()

array(['Private', 'Self-emp-not-inc', '?', 'Local-gov', 'Federal-gov',
       'Self-emp-inc', 'State-gov', 'Without-pay', 'Never-worked'],
      dtype=object)

Logistic regression model to see which features contribute more to the income.
Source:https://www.kaggle.com/jiashenliu/who-can-earn-more-than-50k-per-year

summary(lg)
 
Call:
 glm(formula = income ~ ., family = binomial(link = "logit"), 
    data = train)
 
Deviance Residuals: 
    Min       1Q   Median       3Q      Max  
-5.4190  -0.6200  -0.3423  -0.0905   3.2819  
 
 Coefficients:
                Estimate Std. Error z value Pr(>|z|)    
 (Intercept)   -8.971e+00  2.704e-01 -33.174  < 2e-16 ***
 age            3.320e-02  1.630e-03  20.370  < 2e-16 ***
 workclass      1.750e-02  1.146e-02   1.528  0.12657    
 fnlwgt         5.551e-07  1.826e-07   3.040  0.00237 ** 
education      1.432e-02  6.170e-03   2.322  0.02025 *  
 education.num  3.231e-01  8.613e-03  37.512  < 2e-16 ***
 Marital       -2.330e-01  1.472e-02 -15.827  < 2e-16 ***
 occupation     1.272e-02  4.614e-03   2.756  0.00585 ** 
 relationship  -1.314e-01  1.726e-02  -7.615 2.64e-14 ***
 race           1.051e-01  2.542e-02   4.134 3.57e-05 ***
 sex            8.868e-01  6.072e-02  14.606  < 2e-16 ***
 capital.gain   3.261e-04  1.189e-05  27.422  < 2e-16 ***
CapitalLoss    6.436e-04  3.973e-05  16.197  < 2e-16 ***
Hours          2.864e-02  1.665e-03  17.200  < 2e-16 ***
 Country        3.877e-03  2.565e-03   1.511  0.13070    

 Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
 
(Dispersion parameter for binomial family taken to be 1)

     Null deviance: 25165  on 22792  degrees of freedom
Residual deviance: 17457  on 22778  degrees of freedom
 AIC: 17487
 
 Number of Fisher Scoring iterations: 7

 Conclusion: We can see that Maritial statues, Working Hours and Sex really matters if you want to earn more than 50K per year. In contrary, the working class is not that important. Generally speaking, you will get equal opportunity if you work hard enough, no matter what kinds of job are you doing.

In [7]:
train = train[train['workclass'] != '?']

In [23]:
#change categorical var to numerical var
train['workclass_num'] = train.workclass.map({'Private':0, 'State-gov':1, 'Federal-gov':2, 'Self-emp-not-inc':3, 'Self-emp-inc':4, 'Local-gov':5, 'Without-pay':6, 'Never-worked':7})
train['marital_num'] = train['marital-status'].map({'Widowed':0, 'Divorced':1, 'Separated':2, 'Never-married':3, 'Married-civ-spouse':4, 'Married-AF-spouse':4, 'Married-spouse-absent':5})
train['race_num'] = train.race.map({'White':1, 'Black':0, 'Asian-Pac-Islander':0, 'Amer-Indian-Eskimo':0, 'Other':0})
train['sex_num'] = np.where(train.sex == 'Female', 0, 1)
train['rel_num'] = train.relationship.map({'Not-in-family':0, 'Unmarried':0, 'Own-child':0, 'Other-relative':0, 'Husband':1, 'Wife':1})

#What if dummy variables are used?

In [39]:
#one at a time worked
# dummify marital status
#dummy_marital_st = pd.get_dummies(train['marital-status'], prefix='marital-status')
#dummy_marital_st.head()



Unnamed: 0,marital-status_Divorced,marital-status_Married-AF-spouse,marital-status_Married-civ-spouse,marital-status_Married-spouse-absent,marital-status_Never-married,marital-status_Separated,marital-status_Widowed
0,0,0,1,0,0,0,0
1,0,0,1,0,0,0,0
2,0,0,1,0,0,0,0
3,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0


In [8]:
#make a loop to create multiple dummy variables
#what is wrong with my loop???
dummycol={'marital-status','race','relationship','sex'}
for col in list(dummycol):
            train= pd.concat([train,pd.get_dummies(train[col],prefix=col)],axis=0)
            train[col].fillna(0,inplace=True)

train.columns

Index(['age', 'capital-gain', 'capital-loss', 'education', 'education-num',
       'fnlwgt', 'hours-per-week', 'income', 'marital-status',
       'marital-status_Divorced', 'marital-status_Married-AF-spouse',
       'marital-status_Married-civ-spouse',
       'marital-status_Married-spouse-absent', 'marital-status_Never-married',
       'marital-status_Separated', 'marital-status_Widowed', 'native-country',
       'occupation', 'race', 'race_Amer-Indian-Eskimo',
       'race_Asian-Pac-Islander', 'race_Black', 'race_Other', 'race_White',
       'relationship', 'relationship_Husband', 'relationship_Not-in-family',
       'relationship_Other-relative', 'relationship_Own-child',
       'relationship_Unmarried', 'relationship_Wife', 'sex', 'sex_Female',
       'sex_Male', 'workclass'],
      dtype='object')

In [9]:
#why are my dummy variables are all NANs???
train.head()

Unnamed: 0,age,capital-gain,capital-loss,education,education-num,fnlwgt,hours-per-week,income,marital-status,marital-status_Divorced,...,relationship_Husband,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried,relationship_Wife,sex,sex_Female,sex_Male,workclass
0,24.0,0.0,0.0,Some-college,10.0,220323.0,40.0,0.0,Married-civ-spouse,,...,,,,,,,Male,,,Private
1,42.0,0.0,0.0,HS-grad,9.0,89073.0,48.0,0.0,Married-civ-spouse,,...,,,,,,,Male,,,Private
2,28.0,0.0,1887.0,Bachelors,13.0,51461.0,40.0,1.0,Married-civ-spouse,,...,,,,,,,Male,,,Private
3,41.0,0.0,0.0,10th,6.0,139907.0,50.0,0.0,Never-married,,...,,,,,,,Male,,,Private
4,23.0,0.0,0.0,Some-college,10.0,211678.0,40.0,0.0,Never-married,,...,,,,,,,Male,,,Private


In [10]:
train['workclass_num'].unique()

array([0, 3, 5, 2, 4, 1, 6, 7])

In [11]:
train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_num,marital_num,race_num,sex_num,rel_num
count,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0
mean,38.545611,189954.1,10.137461,1115.437224,87.633902,40.992836,0.252547,0.838582,3.105643,0.857701,0.677164,0.4636
std,13.156199,105629.1,2.573996,7432.478043,403.120147,11.998497,0.434483,1.580578,1.155582,0.349365,0.467571,0.498685
min,17.0,13769.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,117789.0,9.0,0.0,0.0,40.0,0.0,0.0,3.0,1.0,0.0,0.0
50%,37.0,178615.0,10.0,0.0,0.0,40.0,0.0,0.0,3.0,1.0,1.0,0.0
75%,47.0,237608.0,13.0,0.0,0.0,45.0,1.0,1.0,4.0,1.0,1.0,1.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0,7.0,5.0,1.0,1.0,1.0


In [12]:
#train logistic regression classifier
#X = train[['age','education-num','hours-per-week','capital-gain','capital-loss','workclass_num','marital_num','race_num','sex_num','rel_num']]
#without workclass_num
X = train[['age','education-num','hours-per-week','capital-gain','capital-loss','marital_num','race_num','sex_num','rel_num']]
#X = train[['age','education-num','hours-per-week','capital-gain','capital-loss']]
y = train['income']
 


In [13]:
# create a base classifier used to evaluate a subset of attributes
from sklearn.linear_model import LogisticRegression



In [14]:
lin_clf = linear_model.LogisticRegression().fit(X, y)

In [15]:
#estimate accuracy for logreg
lin_clf.score(X,y)

0.839559008233707

In [16]:
# 10-fold cross-validation with logistic regression
#from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()
print(cross_val_score(logreg, X, y, cv=10, scoring='accuracy').mean())

0.8378843523970619


In [17]:
#Decision tree
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()
tree_clf = tree_clf.fit(X, y)

In [18]:
#est accu for decision tree
tree_clf.score(X,y)

0.9291063869377122

In [19]:
# 10-fold cross-validation with decision tree
print(cross_val_score(tree_clf, X, y, cv=10, scoring='accuracy').mean())

0.8205797319521551


In [20]:
#random forest
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [156]:
forest_clf=rf.fit(X, y)

In [21]:
#get random forest accuracy, since rf already randomly select train sets, k fold is not used
print(rf.score(X,y)) 

0.7265739263214138


In [22]:
# GradientBoosting
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier().fit(X, y)

print('GBC %s' % gbc.score(X, y))

GBC 0.8606317160534028


In [23]:
# 10-fold cross-validation with GradientBoosting
boosting=GradientBoostingClassifier()
print(cross_val_score(boosting, X, y, cv=20, scoring='accuracy').mean())

0.8567719756438039


In [24]:
#KNN
X_knntrain, X_knntest, y_knntrain, y_knntest = train_test_split(X, y, random_state=6)

# check classification accuracy of KNN with K=5
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_knntrain, y_knntrain)
y_pred = knn.predict(X_knntest)
metrics.accuracy_score(y_knntest, y_pred)

0.8403720930232558

In [25]:
# 10-fold cross-validation with KNN

print(cross_val_score(knn, X, y, cv=10, scoring='accuracy').mean())

0.8376056075423277


Feature engineering for test dataset:

In [21]:
test.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [66]:
#stripe the white space
test['workclass'] = test['workclass'].str.strip()
test['marital-status'] = test['marital-status'].str.strip()
test['race'] = test['race'].str.strip()
test['sex'] = test['sex'].str.strip()
test['relationship'] = test['relationship'].str.strip()

In [67]:
test['workclass'].unique()

array(['Private', 'Self-emp-not-inc', 'Local-gov', 'Federal-gov',
       'Self-emp-inc', '?', 'State-gov', 'Never-worked', 'Without-pay'],
      dtype=object)

In [24]:
#test = test[test['workclass'] != '?']

In [68]:
#change categorical var to numerical var
test['workclass_num'] = test.workclass.map({'Private':0, 'State-gov':1, 'Federal-gov':2, 'Self-emp-not-inc':3, 'Self-emp-inc':4, 'Local-gov':5, 'Without-pay':6, 'Never-worked':7,'?':7})
test['marital_num'] = test['marital-status'].map({'Widowed':0, 'Divorced':1, 'Separated':2, 'Never-married':3, 'Married-civ-spouse':4, 'Married-AF-spouse':4, 'Married-spouse-absent':5})
test['race_num'] = test.race.map({'White':0, 'Black':1, 'Asian-Pac-Islander':2, 'Amer-Indian-Eskimo':3, 'Other':4})
test['sex_num'] = np.where(test.sex == 'Female', 0, 1)
test['rel_num'] = test.relationship.map({'Not-in-family':0, 'Unmarried':0, 'Own-child':0, 'Other-relative':0, 'Husband':1, 'Wife':1})

In [69]:
#get test set classificers
X_test = test[['age','education-num','hours-per-week','capital-gain','capital-loss','workclass_num','marital_num','race_num','sex_num','rel_num']]


In [70]:
# Use the forest's predict method on the test data
forest_preds = rf.predict(X_test)


In [81]:
lin_preds = pd.DataFrame(lin_clf.predict(X_test))

In [83]:
tree_preds = pd.DataFrame(tree_clf.predict(X_test))

In [79]:
forest_preds = pd.DataFrame(forest_clf.predict(X_test))

In [77]:
gradient_preds =pd.DataFrame(gbc.predict(X_test))

In [33]:
#save preds to csv for leaderboard scoring
lin_preds.to_csv('predictions1_xa.csv', header=False)

In [37]:
#save preds to csv for leaderboard scoring
tree_preds.to_csv('predictions4_xa.csv', header=False)

In [38]:
#save preds to csv for leaderboard scoring
forest_preds.to_csv('predictions5_xa.csv', header=False)

In [86]:
#save GradientBoosting preds to csv for leaderboard scoring
gradient_preds.to_csv('predictions6_xa.csv', header=False)