In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [2]:
cols = ['Age','Workclass','FinalWeight','Education','EduNumber','MaritalStatus','Job','Family','Race','Gender','CapitalGain','CapitalLoss','HrsWeek','NativeCountry','Salary']

In [135]:
data_ =pd.read_csv("adult.data",names=cols)
test_ = pd.read_csv("adult.test",names=cols)

In [136]:
data_.tail()

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [137]:
data_.loc[data_.Salary == " >50K","CapitalGain"].shape

(7841,)

In [138]:
data_.loc[data_.Salary != " >50K","CapitalGain"].shape

(24720,)

In [139]:
test_.head()

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [140]:
#drop first row with bad data
test_.drop(0, inplace=True)
test_.reset_index(drop=True,inplace=True)

In [141]:
data_.isnull().sum()

Age              0
Workclass        0
FinalWeight      0
Education        0
EduNumber        0
MaritalStatus    0
Job              0
Family           0
Race             0
Gender           0
CapitalGain      0
CapitalLoss      0
HrsWeek          0
NativeCountry    0
Salary           0
dtype: int64

In [142]:
#let's look at the data to see which are the important values

In [143]:
data_['Education'] = data_['Education'].str.strip()

In [144]:
data_.Education.unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [145]:
data_.loc[(data_.Salary ==' >50K')&(data_.Education == 'Masters'),:].shape[0]/data_.shape[0]

0.02945241239519671

In [146]:
for p in data_.Workclass.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Workclass == p),:].shape[0]/data_.shape[0]))

 State-gov  0.010841
 Self-emp-not-inc  0.022235
 Private  0.152422
 Federal-gov  0.011394
 Local-gov  0.018949
 ?  0.005866
 Self-emp-inc  0.019103
 Without-pay  0.000000
 Never-worked  0.000000


In [147]:
for p in data_.Education.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Education == p),:].shape[0]/data_.shape[0]))

Bachelors  0.068210
HS-grad  0.051442
11th  0.001843
Masters  0.029452
9th  0.000829
Some-college  0.042597
Assoc-acdm  0.008139
Assoc-voc  0.011087
7th-8th  0.001228
Doctorate  0.009398
Prof-school  0.012991
5th-6th  0.000491
10th  0.001904
1st-4th  0.000184
Preschool  0.000000
12th  0.001013


In [148]:
for p in data_.MaritalStatus.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.MaritalStatus == p),:].shape[0]/data_.shape[0]))

 Never-married  0.015079
 Married-civ-spouse  0.205522
 Divorced  0.014219
 Married-spouse-absent  0.001044
 Separated  0.002027
 Married-AF-spouse  0.000307
 Widowed  0.002610


In [16]:
for p in data_.Job.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Job == p),:].shape[0]/data_.shape[0]))

 Adm-clerical  0.015571
 Exec-managerial  0.060440
 Handlers-cleaners  0.002641
 Prof-specialty  0.057093
 Other-service  0.004207
 Sales  0.030189
 Craft-repair  0.028531
 Transport-moving  0.009828
 Farming-fishing  0.003532
 Machine-op-inspct  0.007678
 Tech-support  0.008691
 ?  0.005866
 Protective-serv  0.006480
 Armed-Forces  0.000031
 Priv-house-serv  0.000031


In [17]:
for p in data_.Family.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Family == p),:].shape[0]/data_.shape[0]))

 Not-in-family  0.026289
 Husband  0.181751
 Wife  0.022880
 Own-child  0.002058
 Unmarried  0.006695
 Other-relative  0.001136


In [18]:
for p in data_.Gender.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Gender == p),:].shape[0]/data_.shape[0]))

 Male  0.204601
 Female  0.036209


In [19]:
for p in data_.Race.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Race == p),:].shape[0]/data_.shape[0]))

 White  0.218574
 Black  0.011885
 Asian-Pac-Islander  0.008476
 Amer-Indian-Eskimo  0.001106
 Other  0.000768


In [20]:
for p in data_.NativeCountry.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.NativeCountry == p),:].shape[0]/data_.shape[0]))

 United-States  0.220233
 Cuba  0.000768
 Jamaica  0.000307
 India  0.001228
 ?  0.004484
 Mexico  0.001013
 South  0.000491
 Puerto-Rico  0.000369
 Honduras  0.000031
 England  0.000921
 Canada  0.001198
 Germany  0.001351
 Iran  0.000553
 Philippines  0.001873
 Italy  0.000768
 Poland  0.000369
 Columbia  0.000061
 Cambodia  0.000215
 Thailand  0.000092
 Ecuador  0.000123
 Laos  0.000061
 Taiwan  0.000614
 Haiti  0.000123
 Portugal  0.000123
 Dominican-Republic  0.000061
 El-Salvador  0.000276
 France  0.000369
 Guatemala  0.000092
 China  0.000614
 Japan  0.000737
 Yugoslavia  0.000184
 Peru  0.000061
 Outlying-US(Guam-USVI-etc)  0.000000
 Scotland  0.000092
 Trinadad&Tobago  0.000061
 Greece  0.000246
 Nicaragua  0.000061
 Vietnam  0.000154
 Hong  0.000184
 Ireland  0.000154
 Hungary  0.000092
 Holand-Netherlands  0.000000


In [21]:
'''
Looking at the data lets look at all the factors that are >=0.10 or 10%
Workclass.Private
MaritalStatus.Married-civ-spouse
Family.Husband
Gender.Male
Race.White
NativeCountry.United-States
So 6 factors are above 10%.  Seems like a good number for prediction.
'''

'\nLooking at the data lets look at all the factors that are >=0.10 or 10%\nWorkclass.Private\nWorkclass.Local-gov\nMaritalStatus.Married-civ-spouse\nFamily.Husband\nGender.Male\nRace.White\nNativeCountry.United-States\nEduClass.NoDegree\nWorkRate.WorkALot\nSo 9 factors are above 10%.  Seems like a good number for prediction.\n'

In [22]:
#for some reason the Age in test data is string so we'll have to cast to int
test_.Age = pd.to_numeric(test_.Age, errors='coerce')

In [23]:
test_.head()

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary
0,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
1,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
2,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
3,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.
4,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.


In [24]:
#Gender is the only column with only two values, so I'll pull that out and replace values with numbers.
#Male =1, Female = 0
dfGender=data_.Gender
dfGender=dfGender.str.strip()
dfGenTest=test_.Gender
dfGenTest=dfGenTest.str.strip()
ser1 = pd.Series(np.where(dfGender == 'Male', 1,0))
ser2 = pd.Series(np.where(dfGenTest == 'Male', 1,0))
dfGender = pd.DataFrame(data=ser1,columns=['Gender'])
dfGenTest = pd.DataFrame(data=ser2, columns=['Gender'])

In [37]:
data_.drop("Gender",axis=1,inplace=True)
test_.drop("Gender",axis=1,inplace=True)

In [26]:
dfGenTest.tail()

Unnamed: 0,Gender
16276,0
16277,1
16278,1
16279,1
16280,1


In [25]:
#handle Salary columns  <=50K =0 , >50 = 1
dfSalary = data_.Salary.str.strip()
dfSalTest = test_.Salary.str.strip()
dfSalary = pd.Series(np.where(dfSalary == '<=50K', 0,1))
dfSalTest = pd.Series(np.where(dfSalTest == '<=50K', 0,1))

In [27]:
data_.drop('Salary',axis=1,inplace=True)
test_.drop('Salary',axis=1,inplace=True)

In [38]:
data_.head()

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,CapitalGain,CapitalLoss,HrsWeek,NativeCountry
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,0,0,40,Cuba


In [41]:
Cat_cols = [i for i in data_.columns if type(data_[i][0])==str]
CatTest_cols = [i for i in test_.columns if type(data_[i][0])==str]

In [42]:
Num_cols = list(set(data_.columns.tolist())- set(Cat_cols))
NumTest_cols = list(set(test_.columns.tolist())- set(CatTest_cols))

In [46]:
data_[Cat_cols].head()

Unnamed: 0,Workclass,Education,MaritalStatus,Job,Family,Race,NativeCountry
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Cuba


In [43]:
dfStrEncode = pd.get_dummies(data=data_[Cat_cols])
dfStrEncode.astype('int32',copy=True)
dfStrEncTest = pd.get_dummies(data=test_[CatTest_cols])
dfStrEncTest.astype('int32',copy=True)
dfStrEncTest.tail()

Unnamed: 0,Workclass_ ?,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,Education_ 10th,...,NativeCountry_ Portugal,NativeCountry_ Puerto-Rico,NativeCountry_ Scotland,NativeCountry_ South,NativeCountry_ Taiwan,NativeCountry_ Thailand,NativeCountry_ Trinadad&Tobago,NativeCountry_ United-States,NativeCountry_ Vietnam,NativeCountry_ Yugoslavia
16276,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16277,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16278,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16279,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16280,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [49]:
#now that we have the categorical data in a new DF, let's remove those columns from data and test
data_.drop(Cat_cols, axis=1, inplace=True)
test_.drop(CatTest_cols,axis=1, inplace=True)

In [50]:
#the test data is missing an entry for NativeCountry_ Holand-Netherlands, so I'll insert a column in test data with all zeros
a = np.zeros(shape=(16281,1))
dfHoland = pd.DataFrame(a,columns=['NativeCountry_ Holand-Netherlands'])

In [51]:
dfStrEncode.columns.get_loc('NativeCountry_ Holand-Netherlands')

73

In [52]:
#not sure if column order is important for decision trees so I'm going to put them back in order data vs test
temp1 = dfStrEncTest.iloc[:,0:73]
temp2 = dfStrEncTest.iloc[:,73:]

In [53]:
dfStrEncTest2 = pd.concat([temp1,dfHoland,temp2],axis=1)


In [63]:
#deal with missing data/columns from missing data(?)
dfStrEncTest2.rename(columns={"NativeCountry_ ?":"NativeCountry_NoInfo","Job_ ?":"Job_NoInfo",'Workclass_ ?':"Workclass_ NoInfo"},inplace=True)
dfStrEncode.rename(columns={"NativeCountry_ ?":"NativeCountry_NoInfo","Job_ ?":"Job_NoInfo",'Workclass_ ?':"Workclass_ NoInfo"},inplace=True)

In [67]:
data_.head()

Unnamed: 0,Age,FinalWeight,EduNumber,CapitalGain,CapitalLoss,HrsWeek
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


In [68]:
#let's normalize the numerical data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() # default=(0, 1)
data2 =scaler.fit_transform(data_)
test2 =scaler.fit_transform(test_)

In [71]:
data3 = pd.DataFrame(data2,columns=data_.columns)
test3 = pd.DataFrame(test2,columns=test_.columns)

In [73]:
test3.head()

Unnamed: 0,Age,FinalWeight,EduNumber,CapitalGain,CapitalLoss,HrsWeek
0,0.109589,0.14443,0.4,0.0,0.0,0.397959
1,0.287671,0.051677,0.533333,0.0,0.0,0.5
2,0.150685,0.219011,0.733333,0.0,0.0,0.397959
3,0.369863,0.099418,0.6,0.076881,0.0,0.397959
4,0.013699,0.060942,0.6,0.0,0.0,0.295918


In [74]:
#combine dataframes and assign to testing variable
x_train = pd.concat([data3,dfStrEncode,dfGender], axis=1)
x_test = pd.concat([test3,dfStrEncTest2,dfGenTest], axis=1)
y_train = dfSalary
y_test = dfSalTest

In [80]:
x_train.head()

Unnamed: 0,Age,FinalWeight,EduNumber,CapitalGain,CapitalLoss,HrsWeek,Workclass_ NoInfo,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,...,NativeCountry_ Puerto-Rico,NativeCountry_ Scotland,NativeCountry_ South,NativeCountry_ Taiwan,NativeCountry_ Thailand,NativeCountry_ Trinadad&Tobago,NativeCountry_ United-States,NativeCountry_ Vietnam,NativeCountry_ Yugoslavia,Gender
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
1,0.452055,0.048238,0.8,0.0,0.0,0.122449,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
3,0.493151,0.151068,0.4,0.0,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
4,0.150685,0.221488,0.8,0.0,0.0,0.397959,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [81]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

  from numpy.core.umath_tests import inner1d


In [82]:
m1=DecisionTreeClassifier()
m2=GaussianNB()
m3=RandomForestClassifier()
m4=LogisticRegression()

In [88]:
#Decision Tree
m1.fit(x_train,y_train)
print(recall_score(y_test,m1.predict(x_test)))
print(accuracy_score(y_test,m1.predict(x_test)))

0.23278668386462747
0.23278668386462747


In [89]:
#NB
m2.fit(x_train,y_train)
print(recall_score(y_test,m2.predict(x_test)))
print(accuracy_score(y_test,m2.predict(x_test)))

0.6702905226951661
0.6702905226951661


In [91]:
#Random Forest
m3.fit(x_train,y_train)
print(recall_score(y_test,m3.predict(x_test)))
print(accuracy_score(y_test,m3.predict(x_test)))

0.18543087033965971
0.18543087033965971


So these scores are really bad. Naive Bayes is the best single method.  Since these are so bad, let's pull out the important features and see if we can't improve them.

In [115]:
x_train.columns[np.where(m1.feature_importances_!=0)]

Index(['Age', 'FinalWeight', 'EduNumber', 'CapitalGain', 'CapitalLoss',
       'HrsWeek', 'Workclass_ NoInfo', 'Workclass_ Federal-gov',
       'Workclass_ Local-gov', 'Workclass_ Private', 'Workclass_ Self-emp-inc',
       'Workclass_ Self-emp-not-inc', 'Workclass_ State-gov', 'Education_10th',
       'Education_11th', 'Education_12th', 'Education_1st-4th',
       'Education_5th-6th', 'Education_7th-8th', 'Education_9th',
       'Education_Assoc-acdm', 'Education_Assoc-voc', 'Education_Bachelors',
       'Education_Doctorate', 'Education_HS-grad', 'Education_Masters',
       'Education_Prof-school', 'Education_Some-college',
       'MaritalStatus_ Divorced', 'MaritalStatus_ Married-AF-spouse',
       'MaritalStatus_ Married-civ-spouse',
       'MaritalStatus_ Married-spouse-absent', 'MaritalStatus_ Never-married',
       'MaritalStatus_ Separated', 'MaritalStatus_ Widowed', 'Job_NoInfo',
       'Job_ Adm-clerical', 'Job_ Craft-repair', 'Job_ Exec-managerial',
       'Job_ Farming-fish

95 out of 100 columns are important.  Not very helpful.  

In [120]:
#let's try ensemble
Ensemble_Train = pd.DataFrame({'DT':list(m1.predict(x_train)),'NB':list(m2.predict(x_train)),'RF':list(m3.predict(x_train)),'Actual':list(y_train)})
Ensemble_Test = pd.DataFrame({'DT':list(m1.predict(x_test)),'NB':list(m2.predict(x_test)),'RF':list(m3.predict(x_test)),'Actual':list(y_test)})

In [125]:
EnsembleXTrain = Ensemble_Train.iloc[:,:2].values
EnsembleYTrain = Ensemble_Train.iloc[:,3].values

In [126]:
EnsembleXTest = Ensemble_Test.iloc[:,:2].values
EnsembleYTest = Ensemble_Test.iloc[:,3].values

In [127]:
m4.fit(EnsembleXTrain,EnsembleYTrain)
print(accuracy_score(EnsembleYTest,m4.predict(EnsembleXTest)))
print(recall_score(EnsembleYTest,m4.predict(EnsembleXTest)))

0.23278668386462747
0.23278668386462747


So an ensemble Logistic Regression didn't help that much either.  I think the test data is just a bad subset.  
Naive Bayes was the best algorythm for this data.  The most important features varied depending on how you look at it.

In [128]:
#lets try a bagging KNN for fun
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(), n_estimators=10)

In [129]:
bag_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=5),
n_estimators=10, max_samples=0.5,
bootstrap=True, random_state=3)

In [130]:
bag_knn.fit(x_train, y_train)
bag_knn.score(x_test, y_test)
#accuracy went from 23% to 31% so that's a good improvement

0.19839076223819177

No luck with BagKNN either.

In [131]:
#Let's try xgboost
import xgboost as xgb

In [132]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)

In [133]:
xg_reg.fit(x_train.values,y_train.values)
preds = xg_reg.predict(x_test.values)

In [134]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.694335
