In [1]:
import pdb
import numpy as np
import pandas as pd
import sklearn
import sklearn.linear_model
import string

## Read in the datasets

Read in the csv dataset as a pandas dataframe for ease of manipulation.

In [2]:
#get feature names
names = pd.read_table('train.names', delimiter = ':', names = ['features','options'])
feature_names = np.asarray(names['features'])
feature_options = np.asarray(names['options'])

#get feature data
train_x = pd.read_table('train-features.csv', delimiter = ',', names = feature_names[:-1])
#get label data (income bool)
train_y = np.genfromtxt('train-output.csv')

#get test feature data 
test_x = pd.read_table('test-features.csv', delimiter = ',', names = feature_names[:-1])


## Feature engineering

Encode the categorical features with one hot encoding for numerical classification techniques.

In [3]:
#get non continuous options
mask_continuous = np.asarray(names['options']) == ' continuous.'
noncon_feat = feature_names[np.logical_not(mask_continuous)][:-1] #non continuous feature names

#iterate across all categorical features and one hot encode them
for feat in noncon_feat:
    #get the different options for this feature
    ind = np.where(feature_names == feat)[0][0]
#     options = feature_options[ind].split(',')
#     options = [option.strip() for option in options]
#     print(options)

    #train
    add = pd.get_dummies(train_x[feat], prefix = feat)
    train_x = train_x.join(add)
    
    #test 
    add = pd.get_dummies(test_x[feat], prefix = feat)
    test_x = test_x.join(add)

#align the two dataframes, so that the test frame has the same shape as the train frame
train_x, test_x = train_x.align(test_x, join ='left', axis = 1, fill_value = 0)

#drop the unknown work class 
train_x = train_x.drop('workclass_ ?', axis=1)
test_x = test_x.drop('workclass_ ?', axis=1)

In [4]:
#check the shape and columns of the features
print(np.shape(test_x))
print(np.shape(train_x))
# print(train_x.columns)
# print(test_x.columns)

(16281, 115)
(32561, 115)


In [5]:
# get rid of original categorical data
train_vars = train_x.columns.values.tolist()
to_keep = [i for i in train_vars if i not in noncon_feat]
train_x_final = train_x[to_keep]

test_vars = test_x.columns.values.tolist()
to_keep = [i for i in test_vars if i not in noncon_feat]
test_x_final = test_x[to_keep]

Normalize the features using sklearn RobustScaler. This scaler defaults to using the interquartile range for the scaling statistics. 

In [6]:
from sklearn.preprocessing import RobustScaler
cols = train_x_final.columns
scaler = RobustScaler()
train_x_final = scaler.fit_transform(train_x_final)
test_x_final = scaler.transform(test_x_final)
train_x_final = pd.DataFrame(train_x_final, columns=[cols])
test_x_final = pd.DataFrame(test_x_final, columns=[cols])


In [7]:
train_y = pd.Series(train_y)

#keep the original pandas dataframe 
train_x_final_pd = train_x_final
test_x_final_pd = test_x_final

#convert the pandas dataframe to numpy array 
train_x_final = train_x_final.to_numpy()
test_x_final = test_x_final.to_numpy()


## Testing Cross validation

Run the Random Forest classifier from scikit learn to get an idea of what the useful features are and what we can expect for preliminary accuracy. Accuracy was determined from K folds cross validation. Here I also tried simpler models such as logistic regression, svms, K-nearest neighbors all to little/moderate success.

In [8]:

#try out k folds 
n_splits = 3 #number of folds
acc_arr = np.zeros(n_splits) #preallocate accuracy of each fold 

kf = sklearn.model_selection.KFold(n_splits = n_splits) #initialize kfolds
kf.get_n_splits(train_x_final,train_y) 

i = 0 
for train_index, test_index in kf.split(train_x_final, train_y):
    X_train, X_test = train_x_final[train_index], train_x_final[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
            
    #test the model 
    
#     #logistic regression
#     logreg_model = sklearn.linear_model.LogisticRegression(penalty = 'l2', C = 0.5)
#     logreg_model.fit(X_train,y_train)
#     model = logreg_model
    
#     #svm
#     model = sklearn.svm.SVC(gamma = 'auto')
#     model.fit(X_train,y_train)
    
#     #  K nearest neighbor
#     import sklearn.neighbors
#     model = sklearn.neighbors.KNeighborsClassifier(n_neighbors =100, weights='distance', p=1)
#     model.fit(X_train, y_train)
    
#     #adaboost #86.4
#     from sklearn.ensemble import AdaBoostClassifier
#     model = AdaBoostClassifier(n_estimators=100,learning_rate= 1)
#     model.fit(X_train, y_train)
    
    # random forest #84.2
    from sklearn.ensemble import RandomForestClassifier
    model = RandomForestClassifier(n_estimators = 100)
    model.fit(X_train, y_train)
    
#     from sklearn.neural_network import MLPClassifier
#     model = MLPClassifier()
#     model.fit(X_train, y_train)
    
#     from sklearn.gaussian_process import GaussianProcessClassifier
#     from sklearn.gaussian_process.kernels import RBF
#     kernel = 1.0 * RBF(1.0)
#     gpc = GaussianProcessClassifier(kernel=kernel,
#            random_state=0).fit(X_train, y_train)
    
#     #try out naive bayes
#     from sklearn.naive_bayes import GaussianNB
#     model = GaussianNB()
#     model.fit(X_train, y_train)
#     #gradient boosting
#     from sklearn.ensemble import GradientBoostingClassifier
#     model = GradientBoostingClassifier()
#     model.fit(X_train, y_train)
    
#     from sklearn.linear_model import RidgeCV, LassoCV
#     from sklearn.svm import SVR
    
#     estimators = [('ridge', RidgeCV()),
#                   ('lasso', LassoCV(random_state=42)),
#                   ('svr', SVR(C=1, gamma=1e-6))]
    
#     from sklearn.ensemble import GradientBoostingRegressor
#     from sklearn.ensemble import StackingRegressor
#     model = StackingRegressor(
#         estimators=estimators,
#         final_estimator=GradientBoostingRegressor(random_state=42))
#     model.fit(X_train, y_train)
    
    #determine accuracy of this fold
    acc =  sklearn.metrics.accuracy_score(model.predict(X_test), y_test)
    acc_arr[i] = acc
    print(acc)
    i+=1      
# print(acc_arr)
print('mean')
print(np.mean(acc_arr))

0.8544315459738345
0.8557213930348259
0.85911729475721
mean
0.8564234112552902


Drop the lowest weighted scores according to the random forest classifier feature weights. In addition to speeding up run time, this may also improve accuracy.

In [9]:

# view the feature scores
feature_scores = pd.Series(model.feature_importances_, index=train_x_final_pd.columns).sort_values(ascending=False)
print(feature_scores)
# feature_scores[-10:]
# train_x_final_pd.columns[-10:]

#drop least important features
train_x_final_pd_2 = train_x_final_pd
test_x_final_pd_2 = test_x_final_pd
for feature in train_x_final_pd.columns[-5:]:
#     pdb.set_trace()
    train_x_final_pd_2 = train_x_final_pd_2.drop(feature[0], axis = 1)
    test_x_final_pd_2 = test_x_final_pd_2.drop(feature[0], axis = 1)

train_x_final = train_x_final_pd_2.to_numpy()
test_x_final = test_x_final_pd_2.to_numpy()

fnlwgt                                        1.568733e-01
age                                           1.512121e-01
capital-gain                                  9.571370e-02
hours-per-week                                8.308781e-02
marital-status_ Married-civ-spouse            6.173552e-02
                                                  ...     
native-country_ Honduras                      1.590997e-05
workclass_ Without-pay                        1.408628e-05
native-country_ Outlying-US(Guam-USVI-etc)    1.280102e-05
workclass_ Never-worked                       2.107648e-07
native-country_ Holand-Netherlands            0.000000e+00
Length: 107, dtype: float64


  new_axis = axis.drop(labels, errors=errors)


Retry training the model with much more advanced techniques on the pruned feature set with the same k folds cross validation on the training set. Eventually I settled on using the gradient boosting classifier due to the benefits of ensemble learning and its robustness from overfitting.


In [10]:

#try out k folds 
n_splits = 3
acc_arr = np.zeros(n_splits)
kf = sklearn.model_selection.KFold(n_splits = n_splits)
kf.get_n_splits(train_x_final,train_y)
i = 0 
for train_index, test_index in kf.split(train_x_final, train_y):
    
    X_train, X_test = train_x_final[train_index], train_x_final[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
             
#     # random forest #84.2
#     from sklearn.ensemble import RandomForestClassifier
#     model = RandomForestClassifier(n_estimators = 300)
#     model.fit(X_train, y_train)
    
    #gradient boosting classsifier 87.1 
    from sklearn.ensemble import GradientBoostingClassifier
#     model = GradientBoostingClassifier(n_estimators = 300) #this is the kaggle sub
    model = GradientBoostingClassifier(n_estimators = 500)
    model.fit(X_train, y_train)
    
    #determine accuracy of this fold
    acc =  sklearn.metrics.accuracy_score(model.predict(X_test), y_test)
    acc_arr[i] = acc
    print(acc)
    i+=1      
# print(acc_arr)
print('mean')
print(np.mean(acc_arr))

0.8703703703703703
0.8725815367606412
0.8729383580576799
mean
0.8719634217295639


Here I train the model on the full dataset. 

In [11]:
# #train big model 
model.fit(train_x_final,train_y)


GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=500,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

## Export results 

Run the model on the test set and output the results in the desired csv format.

In [12]:
#Export the results
category = model.predict(test_x_final)
category = [int(x) for x in category]
output = {'Id':range(len(category)), 'Category': category}

output = pd.DataFrame(output)

output.to_csv('submission.csv', columns = ['Id', 'Category'],index = False)

In [13]:
# check that the output is sane 
output

Unnamed: 0,Category,Id
0,0,0
1,0,1
2,0,2
3,1,3
4,0,4
...,...,...
16276,0,16276
16277,0,16277
16278,1,16278
16279,0,16279
