In [66]:
# Import the necessary modules and libraries
import numpy as np
from sklearn import linear_model, datasets
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix as cm
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import RFE
import matplotlib.pyplot as plt
import pandas as pd

plt.style.use('ggplot')

In [41]:
np.random.seed(5)
train = pd.read_csv('adult_train.csv', header=0)
test = pd.read_csv('adult_test.csv', header=0)

In [42]:
train.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'income'],
      dtype='object')

In [43]:
train['workclass'] = train['workclass'].str.strip()
train['marital-status'] = train['marital-status'].str.strip()
train['race'] = train['race'].str.strip()
train['sex'] = train['sex'].str.strip()
train['relationship'] = train['relationship'].str.strip()

In [53]:
train['workclass'].unique()

array(['Private', 'Self-emp-not-inc', 'Local-gov', 'Federal-gov',
       'Self-emp-inc', 'State-gov', 'Without-pay', 'Never-worked'],
      dtype=object)

In [50]:
train = train[train['workclass'] != '?']

In [54]:
#change categorical var to numerical var
train['workclass_num'] = train.workclass.map({'Private':0, 'State-gov':1, 'Federal-gov':2, 'Self-emp-not-inc':3, 'Self-emp-inc':4, 'Local-gov':5, 'Without-pay':6, 'Never-worked':7})
train['marital_num'] = train['marital-status'].map({'Widowed':0, 'Divorced':1, 'Separated':2, 'Never-married':3, 'Married-civ-spouse':4, 'Married-AF-spouse':4, 'Married-spouse-absent':5})
train['race_num'] = train.race.map({'White':0, 'Black':1, 'Asian-Pac-Islander':2, 'Amer-Indian-Eskimo':3, 'Other':4})
train['sex_num'] = np.where(train.sex == 'Female', 0, 1)
train['rel_num'] = train.relationship.map({'Not-in-family':0, 'Unmarried':0, 'Own-child':0, 'Other-relative':0, 'Husband':1, 'Wife':1})

In [55]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income,workclass_num,marital_num,race_num,sex_num,rel_num
0,24,Private,220323,Some-college,10,Married-civ-spouse,Prof-specialty,Husband,Black,Male,0,0,40,United-States,0,0,4,1,1,1
1,42,Private,89073,HS-grad,9,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,48,United-States,0,0,4,0,1,1
2,28,Private,51461,Bachelors,13,Married-civ-spouse,Adm-clerical,Husband,White,Male,0,1887,40,United-States,1,0,4,0,1,1
3,41,Private,139907,10th,6,Never-married,Handlers-cleaners,Unmarried,White,Male,0,0,50,United-States,0,0,3,0,1,0
4,23,Private,211678,Some-college,10,Never-married,Machine-op-inspct,Not-in-family,White,Male,0,0,40,United-States,0,0,3,0,1,0


In [56]:
train['workclass_num'].unique()

array([0, 3, 5, 2, 4, 1, 6, 7], dtype=int64)

In [58]:
train.describe()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,income,workclass_num,marital_num,race_num,sex_num,rel_num
count,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0,21497.0
mean,38.545611,189954.1,10.137461,1115.437224,87.633902,40.992836,0.252547,0.838582,3.105643,0.217426,0.677164,0.4636
std,13.156199,105629.1,2.573996,7432.478043,403.120147,11.998497,0.434483,1.580578,1.155582,0.622273,0.467571,0.498685
min,17.0,13769.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,28.0,117789.0,9.0,0.0,0.0,40.0,0.0,0.0,3.0,0.0,0.0,0.0
50%,37.0,178615.0,10.0,0.0,0.0,40.0,0.0,0.0,3.0,0.0,1.0,0.0
75%,47.0,237608.0,13.0,0.0,0.0,45.0,1.0,1.0,4.0,0.0,1.0,1.0
max,90.0,1484705.0,16.0,99999.0,4356.0,99.0,1.0,7.0,5.0,4.0,1.0,1.0


In [59]:
#train logistic regression classifier
X = train[['age','education-num','hours-per-week','capital-gain','capital-loss','workclass_num','marital_num','race_num','sex_num','rel_num']]
y = train['income']
 


In [60]:
# create a base classifier used to evaluate a subset of attributes
from sklearn.linear_model import LogisticRegression
logreg = LogisticRegression()

# create the RFE model and select 3 attributes
rfe = RFE(logreg, 3)
rfe = rfe.fit(X, y)


In [61]:
lin_clf = linear_model.LogisticRegression().fit(X, y)

In [62]:
#estimate accuracy
lin_clf.score(X,y)

0.8393729357584779

In [64]:
#Decision tree
from sklearn import tree
tree_clf = tree.DecisionTreeClassifier()
tree_clf = tree_clf.fit(X, y)

In [65]:
#est accu for decision tree
tree_clf.score(X,y)

0.9494813229752989

In [68]:
#random forest
# Import the model we are using
from sklearn.ensemble import RandomForestRegressor
# Instantiate model with 1000 decision trees
rf = RandomForestRegressor(n_estimators = 1000, random_state = 42)
# Train the model on training data
rf.fit(X, y)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [69]:
forest_clf=rf.fit(X, y)

In [70]:
forest_clf

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
           oob_score=False, random_state=42, verbose=0, warm_start=False)

In [72]:
# Use the forest's predict method on the test data
forest_preds = rf.predict(X_test)


In [73]:
forest_preds

array([0.        , 0.07062399, 0.49741007, ..., 0.20689212, 0.18185897,
       1.        ])

In [74]:
forest_clf.predict(X_test)

array([0.        , 0.07062399, 0.49741007, ..., 0.20689212, 0.18185897,
       1.        ])

In [43]:
#get test set predictions
X_test = test[['age','education-num','hours-per-week','capital-gain']]


In [79]:
lin_preds = pd.DataFrame(lin_clf.predict(X_test))

In [80]:
tree_preds = pd.DataFrame(tree_clf.predict(X_test))

In [81]:
forest_preds = pd.DataFrame(forest_clf.predict(X_test))

In [78]:
preds

Unnamed: 0,0
0,0.000000
1,0.070624
2,0.497410
3,0.415393
4,0.010133
5,0.136565
6,0.173742
7,0.423317
8,0.000000
9,0.000000


In [83]:
#save preds to csv for leaderboard scoring
lin_preds.to_csv('predictions1_xa.csv', header=False)

In [84]:
#save preds to csv for leaderboard scoring
tree_preds.to_csv('predictions2_xa.csv', header=False)

In [85]:
#save preds to csv for leaderboard scoring
forest_preds.to_csv('predictions3_xa.csv', header=False)

In [38]:
test.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,23,Private,122272,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States
1,32,Private,186824,HS-grad,9,Never-married,Machine-op-inspct,Unmarried,White,Male,0,0,40,United-States
2,43,Self-emp-not-inc,292175,Masters,14,Divorced,Exec-managerial,Unmarried,White,Female,0,0,45,United-States
3,59,Private,109015,HS-grad,9,Divorced,Tech-support,Unmarried,White,Female,0,0,40,United-States
4,23,Local-gov,190709,Assoc-acdm,12,Never-married,Protective-serv,Not-in-family,White,Male,0,0,52,United-States
