In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
df= pd.read_csv('data/income_data_2017_clean_zeros.csv.bz2', compression='bz2', index_col='peridnum')

List of features to consider

In [3]:
features_list = ['peioocc','ern_val', 'a_hga', 'H_NUMPER', 'MARSUPWT', 'FKIND', 'FPERSONS', 
                 'FOWNU6', 'FOWNU18', 'a_age']
features_list = [str.lower(v) for v in features_list]

1. Filtering for individuals eraning between \$1,000 and \$200,000 per year, 
2. Getting the features from the DataFrame, and droping nan's

In [4]:
df2 = df[(df['ern_val'] > 8000) & (df['ern_val'] < 250000) &(df['a_hrs1'] >= 40)]
features_df = df2.loc[:,features_list]
features_df = features_df.dropna()

In [5]:
# Examining the dataset
features_df.describe()

Unnamed: 0,peioocc,ern_val,a_hga,h_numper,marsupwt,fkind,fpersons,fownu6,fownu18,a_age
count,60045.0,60045.0,60045.0,60045.0,60045.0,60045.0,60045.0,60045.0,60045.0,60045.0
mean,4243.000583,45468.161845,40.554301,3.242568,149723.51115,1.492914,2.971055,0.254043,0.874877,41.733883
std,2727.607514,31153.957657,2.62345,1.516722,95081.60086,0.76562,1.51111,0.576198,1.101314,11.391952
min,10.0,8001.0,31.0,1.0,9281.0,1.0,1.0,0.0,0.0,18.0
25%,2020.0,24960.0,39.0,2.0,74180.0,1.0,2.0,0.0,0.0,33.0
50%,4600.0,37239.0,40.0,3.0,143622.0,1.0,3.0,0.0,0.0,42.0
75%,6200.0,57000.0,43.0,4.0,191769.0,2.0,4.0,0.0,2.0,50.0
max,9750.0,200000.0,46.0,15.0,978808.0,3.0,15.0,5.0,9.0,74.0


In [6]:
features_list.remove('ern_val')
X = features_df[features_list]
Y = ((features_df['ern_val'] > 40000).values)
ratio=sum(features_df['ern_val'] > 40000)/len(features_df['ern_val'])
print('True class to Flase class ratio = {:2.4f}'.format(ratio))

True class to Flase class ratio = 0.4328


Splitting Train/Test

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=2)

Extracting sample weights from the data 

In [44]:
weights = X_train['marsupwt']
X_train = X_train.drop('marsupwt',axis=1)
X_test = X_test.drop('marsupwt', axis=1)

Trying a desicsion tree classifier

In [45]:
from sklearn.tree import DecisionTreeClassifier
_=model = DecisionTreeClassifier(max_depth=8)
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, Y_test))

0.724658598868


While the score is not terribly bad, the confusion matrix whows a different picture

In [46]:
import sklearn.metrics as met
Y_pred = model.predict(X_test)
print(met.confusion_matrix(Y_test,Y_pred, labels=[0,1]))
print(met.cohen_kappa_score(Y_test,Y_pred))

[[4082 1086]
 [1394 2445]]
0.431193413735


Trying to adaptive boosting classifier

In [47]:
from sklearn.ensemble import AdaBoostClassifier
seed = 9
num_trees = 500
model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, Y_test))

0.737870545132


In [42]:
Y_pred = model.predict(X_test)
print(met.confusion_matrix(Y_test,Y_pred, labels=[0,1]))
print(met.f1_score(Y_test,Y_pred))

[[1348  386]
 [ 397  872]]
0.690146418678


A bit better, but still problematic,
Trying a begging classifier

In [13]:
from sklearn.ensemble import BaggingClassifier
model = BaggingClassifier(n_estimators=num_trees, random_state=seed)
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, Y_test))

0.712572855953


In [14]:
Y_pred = model.predict(X_test)
print(met.confusion_matrix(Y_test,Y_pred, labels=[0,1]))
print(met.cohen_kappa_score(Y_test,Y_pred))

[[2635  857]
 [ 869 1644]]
0.409053880018


Finally, trying a simple naive Bayes  and logistic regression classifiers

In [15]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, Y_train,sample_weight=weights)
model.score(X_test, Y_test)

0.68892589508742719

In [16]:
Y_pred = model.predict(X_test)
met.confusion_matrix(Y_test,Y_pred, labels=[0,1])

array([[2549,  943],
       [ 925, 1588]])

In [17]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, Y_test))

0.672606161532


In [48]:
Y_pred = model.predict(X_test)
met.confusion_matrix(Y_test,Y_pred, labels=[0,1])

array([[4024, 1144],
       [1217, 2622]])

Suprisingly SVM is the worst of all

from sklearn.svm import SVC
model = SVC()
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, y_test))

In [49]:
import xgboost as xgb
