In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
df= pd.read_csv('data/income_data_2017_clean_zeros.csv.bz2', compression='bz2', index_col='peridnum')

List of features to consider

In [24]:
features_list = ['peioocc','ern_val', 'a_hga', 'H_NUMPER', 'MARSUPWT', 'FKIND', 'FPERSONS', 
                 'FOWNU6', 'FOWNU18', 'a_age']
features_list = [str.lower(v) for v in features_list]

1. Filtering for individuals eraning between \$1,000 and \$200,000 per year, 
2. Getting the features from the DataFrame, and droping nan's

In [25]:
df2 = df[(df['ern_val'] > 8000) & (df['ern_val'] < 200000) &(df['a_hrs1'] >= 40)]
features_df = df2.loc[:,features_list]
features_df = features_df.dropna()

In [26]:
# Examining the dataset
features_df.describe()

Unnamed: 0,peioocc,ern_val,a_hga,h_numper,marsupwt,fkind,fpersons,fownu6,fownu18,a_age
count,59816.0,59816.0,59816.0,59816.0,59816.0,59816.0,59816.0,59816.0,59816.0,59816.0
mean,4251.411161,44876.551057,40.544353,3.242076,149708.252223,1.493948,2.969757,0.253845,0.87393,41.717885
std,2726.533737,29707.080522,2.620122,1.517304,95094.908008,0.766142,1.511373,0.575832,1.101171,11.397607
min,10.0,8001.0,31.0,1.0,9281.0,1.0,1.0,0.0,0.0,18.0
25%,2020.0,24500.0,39.0,2.0,74204.75,1.0,2.0,0.0,0.0,33.0
50%,4600.0,37000.0,40.0,3.0,143596.5,1.0,3.0,0.0,0.0,42.0
75%,6200.0,56000.0,43.0,4.0,191713.0,2.0,4.0,0.0,2.0,50.0
max,9750.0,199000.0,46.0,15.0,978808.0,3.0,15.0,5.0,9.0,74.0


In [27]:
features_list.remove('ern_val')
X = features_df[features_list]
Y = ((features_df['ern_val'] > 55000).values)
ratio=sum(features_df['ern_val'] > 55000)/len(features_df['ern_val'])
print('True class to Flase class ratio = {:2.4f}'.format(ratio))

True class to Flase class ratio = 0.2533


Splitting Train/Test

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=0)

Extracting sample weights from the data 

In [29]:
weights = X_train['marsupwt']
X_train = X_train.drop('marsupwt',axis=1)
X_test = X_test.drop('marsupwt', axis=1)

Trying a desicsion tree classifier

In [30]:
from sklearn.tree import DecisionTreeClassifier
_=model = DecisionTreeClassifier(max_depth=8)
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, Y_test))

0.798883241942


While the score is not terribly bad, the confusion matrix whows a different picture

In [31]:
import sklearn.metrics as met
Y_pred = model.predict(X_test)
met.confusion_matrix(Y_test,Y_pred, labels=[0,1])

array([[20627,  1736],
       [ 4279,  3266]])

Trying to adaptive boosting classifier

In [35]:
from sklearn.ensemble import AdaBoostClassifier
seed = 7
num_trees = 150
_model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, Y_test))

0.798883241942


In [36]:
Y_pred = model.predict(X_test)
met.confusion_matrix(Y_test,Y_pred, labels=[0,1])

array([[20625,  1738],
       [ 4277,  3268]])

A bit better, but still problematic,
Trying a begging classifier

In [38]:
from sklearn.ensemble import BaggingClassifier
_model = BaggingClassifier(n_estimators=num_trees, random_state=seed)
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, Y_test))

0.798883241942


In [40]:
Y_pred = model.predict(X_test)
met.confusion_matrix(Y_test,Y_pred, labels=[0,1])

array([[20627,  1736],
       [ 4279,  3266]])

Finally, trying a simple naive Bayes  and logistic regression classifiers

In [42]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X_train, Y_train,sample_weight=weights)
model.score(X_test, Y_test)

0.76116758058044676

In [44]:
Y_pred = model.predict(X_test)
met.confusion_matrix(Y_test,Y_pred, labels=[0,1])

array([[18991,  3372],
       [ 3771,  3774]])

In [45]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, Y_test))

0.764343988231


In [46]:
Y_pred = model.predict(X_test)
met.confusion_matrix(Y_test,Y_pred, labels=[0,1])

array([[20816,  1547],
       [ 5501,  2044]])

Suprisingly SVM is the worst of all

In [None]:
from sklearn.svm import SVC
model = SVC()
_=model.fit(X_train, Y_train, sample_weight=weights)
print(model.score(X_test, y_test))