In [86]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

In [87]:
cols = ['Age','Workclass','FinalWeight','Education','EduNumber','MaritalStatus','Job','Family','Race','Gender','CapitalGain','CapitalLoss','HrsWeek','NativeCountry','Salary']

In [88]:
data_ =pd.read_csv("adult.data",names=cols)
test_ = pd.read_csv("adult.test",names=cols)

In [89]:
data_.tail()

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary
32556,27,Private,257302,Assoc-acdm,12,Married-civ-spouse,Tech-support,Wife,White,Female,0,0,38,United-States,<=50K
32557,40,Private,154374,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K
32558,58,Private,151910,HS-grad,9,Widowed,Adm-clerical,Unmarried,White,Female,0,0,40,United-States,<=50K
32559,22,Private,201490,HS-grad,9,Never-married,Adm-clerical,Own-child,White,Male,0,0,20,United-States,<=50K
32560,52,Self-emp-inc,287927,HS-grad,9,Married-civ-spouse,Exec-managerial,Wife,White,Female,15024,0,40,United-States,>50K


In [90]:
data_.loc[data_.Salary == " >50K","CapitalGain"].shape

(7841,)

In [83]:
data_.loc[data_.Salary != " >50K","CapitalGain"].shape

(24720,)

In [91]:
test_.head()

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary
0,|1x3 Cross validator,,,,,,,,,,,,,,
1,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.
2,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.
3,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.
4,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.


In [92]:
#drop first row with bad data
test_.drop(0, inplace=True)
test_.reset_index(drop=True,inplace=True)

In [93]:
data_.isnull().sum()

Age              0
Workclass        0
FinalWeight      0
Education        0
EduNumber        0
MaritalStatus    0
Job              0
Family           0
Race             0
Gender           0
CapitalGain      0
CapitalLoss      0
HrsWeek          0
NativeCountry    0
Salary           0
dtype: int64

In [94]:
target =data_.Salary

In [95]:
#let's look at the data to see which are the important values

In [96]:
data_['Education'] = data_['Education'].str.strip()

In [97]:
data_.Education.unique()

array(['Bachelors', 'HS-grad', '11th', 'Masters', '9th', 'Some-college',
       'Assoc-acdm', 'Assoc-voc', '7th-8th', 'Doctorate', 'Prof-school',
       '5th-6th', '10th', '1st-4th', 'Preschool', '12th'], dtype=object)

In [98]:
data_.loc[(data_.Salary ==' >50K')&(data_.Education == 'Masters'),:].shape[0]/data_.shape[0]

0.02945241239519671

In [99]:
for p in data_.Workclass.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Workclass == p),:].shape[0]/data_.shape[0]))

 State-gov  0.010841
 Self-emp-not-inc  0.022235
 Private  0.152422
 Federal-gov  0.011394
 Local-gov  0.018949
 ?  0.005866
 Self-emp-inc  0.019103
 Without-pay  0.000000
 Never-worked  0.000000


In [100]:
for p in data_.Education.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Education == p),:].shape[0]/data_.shape[0]))

Bachelors  0.068210
HS-grad  0.051442
11th  0.001843
Masters  0.029452
9th  0.000829
Some-college  0.042597
Assoc-acdm  0.008139
Assoc-voc  0.011087
7th-8th  0.001228
Doctorate  0.009398
Prof-school  0.012991
5th-6th  0.000491
10th  0.001904
1st-4th  0.000184
Preschool  0.000000
12th  0.001013


In [101]:
#I'm going to classify Education by EdNum into groups: <Bachelors, Bachelors,Advanced degrees to see if there is more info derived
data_.loc[data_['Education'] == 'Bachelors']

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K
9,42,Private,159449,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178,0,40,United-States,>50K
11,30,State-gov,141297,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,Asian-Pac-Islander,Male,0,0,40,India,>50K
12,23,Private,122272,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,<=50K
25,56,Local-gov,216851,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,40,United-States,>50K
32,45,Private,386940,Bachelors,13,Divorced,Exec-managerial,Own-child,White,Male,0,1408,40,United-States,<=50K
41,53,Self-emp-not-inc,88506,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,40,United-States,<=50K
42,24,Private,172987,Bachelors,13,Married-civ-spouse,Tech-support,Husband,White,Male,0,0,50,United-States,<=50K


In [102]:
#less than Bachelors will be 0, Bachelors = 1, Masters & Doctorate = 2
def f(row):
    if row['EduNumber'] < 13:
        val = 'NoDegree'
    elif row['EduNumber'] == 13:
        val = 'Bachelors'
    else:
        val = 'AdvDegree'
    return val

In [103]:
data_['EduClass'] = data_.apply(f, axis=1)

In [104]:
data_.head()

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary,EduClass
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,Bachelors
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,Bachelors
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,NoDegree
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,NoDegree
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Bachelors


In [105]:
#ok, this looks like more useful information
for p in data_.EduClass.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.EduClass == p),:].shape[0]/data_.shape[0]))

Bachelors  0.068210
NoDegree  0.120758
AdvDegree  0.051841


In [106]:
data_.drop('Education',axis=1,inplace=True)

In [107]:
data_.drop('EduNumber',axis=1,inplace=True)

In [108]:
#going to classify HrsWeek the same way
def hrs(row):
    if row['HrsWeek'] < 40:
        val = 'PartTime'
    elif row['HrsWeek'] == 40:
        val = 'FullTime'
    else:
        val = 'WorksALot'
    return val

In [109]:
data_['WorkRate'] = data_.apply(hrs, axis=1)

In [110]:
data_.head()

Unnamed: 0,Age,Workclass,FinalWeight,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary,EduClass,WorkRate
0,39,State-gov,77516,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K,Bachelors,FullTime
1,50,Self-emp-not-inc,83311,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K,Bachelors,PartTime
2,38,Private,215646,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K,NoDegree,FullTime
3,53,Private,234721,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K,NoDegree,FullTime
4,28,Private,338409,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K,Bachelors,FullTime


In [111]:
#definitely a difference between PartTime and Fulltime or better
for p in data_.WorkRate.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.WorkRate == p),:].shape[0]/data_.shape[0]))

FullTime  0.099721
PartTime  0.022665
WorksALot  0.118424


In [112]:
data_.drop('HrsWeek',axis=1,inplace=True)

In [113]:
#let's classify/bucket Age the same way
def age_(row):
    if row['Age'] < 20:
        val = 'LessThan20'
    elif row['Age'] == 20 and row['Age'] <30:
        val = 'Twenties'
    elif row['Age'] >= 30 and row['Age'] <40:
        val = 'Thirties'
    elif row['Age'] >= 40 and row['Age'] <50:
        val = 'Forties'
    elif row['Age'] >= 50 and row['Age'] <60:
        val = 'Fifties'
    elif row['Age'] >= 60 and row['Age'] <70:
        val = 'Sixties'
    else:
        val = 'Elderly'
    return val

In [114]:
data_['AgeClass'] = data_.apply(age_, axis=1)

In [115]:
data_.head()

Unnamed: 0,Age,Workclass,FinalWeight,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,NativeCountry,Salary,EduClass,WorkRate,AgeClass
0,39,State-gov,77516,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,United-States,<=50K,Bachelors,FullTime,Thirties
1,50,Self-emp-not-inc,83311,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,United-States,<=50K,Bachelors,PartTime,Fifties
2,38,Private,215646,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,United-States,<=50K,NoDegree,FullTime,Thirties
3,53,Private,234721,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,United-States,<=50K,NoDegree,FullTime,Fifties
4,28,Private,338409,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,Cuba,<=50K,Bachelors,FullTime,Elderly


In [116]:
#not much help even with the classification
for p in data_.AgeClass.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.AgeClass == p),:].shape[0]/data_.shape[0]))

Thirties  0.070913
Fifties  0.052425
Elderly  0.019103
Forties  0.081754
LessThan20  0.000061
Twenties  0.000000
Sixties  0.016554


In [117]:
data_.drop('Age',axis=1,inplace=True)

In [118]:
for p in data_.Job.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Job == p),:].shape[0]/data_.shape[0]))

 Adm-clerical  0.015571
 Exec-managerial  0.060440
 Handlers-cleaners  0.002641
 Prof-specialty  0.057093
 Other-service  0.004207
 Sales  0.030189
 Craft-repair  0.028531
 Transport-moving  0.009828
 Farming-fishing  0.003532
 Machine-op-inspct  0.007678
 Tech-support  0.008691
 ?  0.005866
 Protective-serv  0.006480
 Armed-Forces  0.000031
 Priv-house-serv  0.000031


In [119]:
for p in data_.Family.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Family == p),:].shape[0]/data_.shape[0]))

 Not-in-family  0.026289
 Husband  0.181751
 Wife  0.022880
 Own-child  0.002058
 Unmarried  0.006695
 Other-relative  0.001136


In [120]:
for p in data_.Gender.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Gender == p),:].shape[0]/data_.shape[0]))

 Male  0.204601
 Female  0.036209


In [121]:
for p in data_.Race.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.Race == p),:].shape[0]/data_.shape[0]))

 White  0.218574
 Black  0.011885
 Asian-Pac-Islander  0.008476
 Amer-Indian-Eskimo  0.001106
 Other  0.000768


In [122]:
for p in data_.NativeCountry.unique():
    print(p + "  {0:.6f}".format(data_.loc[(data_.Salary ==' >50K')&(data_.NativeCountry == p),:].shape[0]/data_.shape[0]))

 United-States  0.220233
 Cuba  0.000768
 Jamaica  0.000307
 India  0.001228
 ?  0.004484
 Mexico  0.001013
 South  0.000491
 Puerto-Rico  0.000369
 Honduras  0.000031
 England  0.000921
 Canada  0.001198
 Germany  0.001351
 Iran  0.000553
 Philippines  0.001873
 Italy  0.000768
 Poland  0.000369
 Columbia  0.000061
 Cambodia  0.000215
 Thailand  0.000092
 Ecuador  0.000123
 Laos  0.000061
 Taiwan  0.000614
 Haiti  0.000123
 Portugal  0.000123
 Dominican-Republic  0.000061
 El-Salvador  0.000276
 France  0.000369
 Guatemala  0.000092
 China  0.000614
 Japan  0.000737
 Yugoslavia  0.000184
 Peru  0.000061
 Outlying-US(Guam-USVI-etc)  0.000000
 Scotland  0.000092
 Trinadad&Tobago  0.000061
 Greece  0.000246
 Nicaragua  0.000061
 Vietnam  0.000154
 Hong  0.000184
 Ireland  0.000154
 Hungary  0.000092
 Holand-Netherlands  0.000000


In [123]:
'''Answer to Problem #2: which factors are important
Looking at the data lets look at all the factors that are >=0.10 or 10%
Workclass.Private
Workclass.Local-gov
MaritalStatus.Married-civ-spouse
Family.Husband
Gender.Male
Race.White
NativeCountry.United-States
EduClass.NoDegree
WorkRate.WorkALot
So 9 factors are above 10%.  Seems like a good number for prediction.
'''

'Answer to Problem #2: which factors are important\nLooking at the data lets look at all the factors that are >=0.10 or 10%\nWorkclass.Private\nWorkclass.Local-gov\nMaritalStatus.Married-civ-spouse\nFamily.Husband\nGender.Male\nRace.White\nNativeCountry.United-States\nEduClass.NoDegree\nWorkRate.WorkALot\nSo 9 factors are above 10%.  Seems like a good number for prediction.\n'

In [38]:
#drop usless columns
data_.drop('FinalWeight',axis=1,inplace=True)
data_.drop('CapitalGain',axis=1,inplace=True)
data_.drop('CapitalLoss',axis=1,inplace=True)

In [124]:
data_.head()

Unnamed: 0,Workclass,FinalWeight,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,NativeCountry,Salary,EduClass,WorkRate,AgeClass
0,State-gov,77516,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,United-States,<=50K,Bachelors,FullTime,Thirties
1,Self-emp-not-inc,83311,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,United-States,<=50K,Bachelors,PartTime,Fifties
2,Private,215646,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,United-States,<=50K,NoDegree,FullTime,Thirties
3,Private,234721,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,United-States,<=50K,NoDegree,FullTime,Fifties
4,Private,338409,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,Cuba,<=50K,Bachelors,FullTime,Elderly


In [41]:
#transform the test data as well
test_.drop('FinalWeight',axis=1,inplace=True)
test_.drop('CapitalGain',axis=1,inplace=True)
test_.drop('CapitalLoss',axis=1,inplace=True)
test_['WorkRate'] = test_.apply(hrs, axis=1)
test_['EduClass'] = test_.apply(f, axis=1)

In [42]:
test_.head()

Unnamed: 0,Age,Workclass,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,HrsWeek,NativeCountry,Salary,WorkRate,EduClass
0,25,Private,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,40.0,United-States,<=50K.,FullTime,NoDegree
1,38,Private,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,50.0,United-States,<=50K.,WorksALot,NoDegree
2,28,Local-gov,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,40.0,United-States,>50K.,FullTime,NoDegree
3,44,Private,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,40.0,United-States,>50K.,FullTime,NoDegree
4,18,?,Some-college,10.0,Never-married,?,Own-child,White,Female,30.0,United-States,<=50K.,PartTime,NoDegree


In [125]:
#for some reason the Age in test data is string so we'll have to cast to int
test_.Age = pd.to_numeric(test_.Age, errors='coerce')

In [126]:
test_['AgeClass'] = test_.apply(age_, axis=1)

In [127]:
test_.head()

Unnamed: 0,Age,Workclass,FinalWeight,Education,EduNumber,MaritalStatus,Job,Family,Race,Gender,CapitalGain,CapitalLoss,HrsWeek,NativeCountry,Salary,AgeClass
0,25,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K.,Elderly
1,38,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K.,Thirties
2,28,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K.,Elderly
3,44,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K.,Forties
4,18,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K.,LessThan20


In [128]:
test_.drop('Age',axis=1,inplace=True)
test_.drop('Education',axis=1,inplace=True)
test_.drop('EduNumber',axis=1,inplace=True)
test_.drop('HrsWeek',axis=1,inplace=True)

In [129]:
#Gender is the only column with only two values, so I'll pull that out and replace values with numbers.
#Male =1, Female = 0
dfGender=data_.Gender
dfGender=dfGender.str.strip()
dfGenTest=test_.Gender
dfGenTest=dfGenTest.str.strip()
ser1 = pd.Series(np.where(dfGender == 'Male', 1,0))
ser2 = pd.Series(np.where(dfGenTest == 'Male', 1,0))
dfGender = pd.DataFrame(data=ser1,columns=['Gender'])
dfGenTest = pd.DataFrame(data=ser2, columns=['Gender'])

In [130]:
dfGenTest.tail()

Unnamed: 0,Gender
16276,0
16277,1
16278,1
16279,1
16280,1


In [131]:
#handle Salary columns  <=50K =1 , >50 = 0
dfSalary = data_.Salary.str.strip()
dfSalTest = test_.Salary.str.strip()
dfSalary = pd.Series(np.where(dfSalary == '<=50K', 1,0))
dfSalTest = pd.Series(np.where(dfSalTest == '<=50K', 1,0))

In [132]:
data_.drop('Salary',axis=1,inplace=True)
test_.drop('Salary',axis=1,inplace=True)

In [133]:
#encode data
dfStrEncode = pd.get_dummies(data=data_)
dfStrEncode.astype('int32',copy=True)
dfStrEncTest = pd.get_dummies(data=test_)
dfStrEncTest.astype('int32',copy=True)
dfStrEncTest.tail()

Unnamed: 0,FinalWeight,CapitalGain,CapitalLoss,Workclass_ ?,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,...,NativeCountry_ United-States,NativeCountry_ Vietnam,NativeCountry_ Yugoslavia,AgeClass_Elderly,AgeClass_Fifties,AgeClass_Forties,AgeClass_LessThan20,AgeClass_Sixties,AgeClass_Thirties,AgeClass_Twenties
16276,215419.0,0.0,0.0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
16277,321403.0,0.0,0.0,1,0,0,0,0,0,0,...,1,0,0,0,0,0,0,1,0,0
16278,374983.0,0.0,0.0,0,0,0,0,1,0,0,...,1,0,0,0,0,0,0,0,1,0
16279,83891.0,5455.0,0.0,0,0,0,0,1,0,0,...,1,0,0,0,0,1,0,0,0,0
16280,182148.0,0.0,0.0,0,0,0,0,0,1,0,...,1,0,0,0,0,0,0,0,1,0


In [134]:
#the test data is missing an entry for NativeCountry_ Holand-Netherlands, so I'll insert a column in test data with all zeros
a = np.zeros(shape=(16281,1))
dfHoland = pd.DataFrame(a,columns=['NativeCountry_ Holand-Netherlands'])

In [135]:
dfStrEncode.columns.get_loc('NativeCountry_ Holand-Netherlands')

62

In [136]:
#not sure if column order is important for decision trees so I'm going to put them back in order data vs test
temp1 = dfStrEncTest.iloc[:,0:59]
temp2 = dfStrEncTest.iloc[:,59:]

In [137]:
dfStrEncTest2 = pd.concat([temp1,dfHoland,temp2],axis=1)


In [138]:
#combine dataframes and assign to testing variable
x_train = pd.concat([dfStrEncode,dfGender], axis=1)
x_test = pd.concat([dfStrEncTest2,dfGenTest], axis=1)
y_train = dfSalary
y_test = dfSalTest

In [139]:
x_test.tail()

Unnamed: 0,FinalWeight,CapitalGain,CapitalLoss,Workclass_ ?,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,...,NativeCountry_ Vietnam,NativeCountry_ Yugoslavia,AgeClass_Elderly,AgeClass_Fifties,AgeClass_Forties,AgeClass_LessThan20,AgeClass_Sixties,AgeClass_Thirties,AgeClass_Twenties,Gender
16276,215419.0,0.0,0.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
16277,321403.0,0.0,0.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,1
16278,374983.0,0.0,0.0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
16279,83891.0,5455.0,0.0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
16280,182148.0,0.0,0.0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1


In [58]:
# train the decision tree
dtree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=50)
dtree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=50,
            splitter='best')

In [59]:
y_pred = dtree.predict(x_test)

In [67]:
x_train.columns[np.where(dtree.feature_importances_!=0)]

Index(['MaritalStatus_ Married-civ-spouse', 'EduClass_NoDegree',
       'WorkRate_WorksALot', 'AgeClass_Elderly'],
      dtype='object')

In [62]:
len(dtree.feature_importances_)

100

In [60]:
#check accuracy
from sklearn import metrics
count_misclassified = (y_test != y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 12491
Accuracy: 0.23


In [84]:
#So that's a pretty terrible accuracy score, but it was run on the entire data set.  I'm going to pull out the previously
#identified important columns and run just against those
x_train2 = x_train[["Workclass_ Private","Workclass_ Local-gov","MaritalStatus_ Married-civ-spouse","Family_ Husband","Gender_ Male","Race_ White","NativeCountry_ United-States","EduClass_NoDegree","WorkRate_WorksALot"]]
x_test2 = x_test[["Workclass_ Private","Workclass_ Local-gov","MaritalStatus_ Married-civ-spouse","Family_ Husband","Gender_ Male","Race_ White","NativeCountry_ United-States","EduClass_NoDegree","WorkRate_WorksALot"]]

In [91]:
# train the decision tree
dtree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=3, random_state=50)
dtree.fit(x_train2, y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=50,
            splitter='best')

In [92]:
y_pred = dtree.predict(x_test2)

In [93]:
#check accuracy again
#wow, that's worse.  how is that possible.
from sklearn import metrics
count_misclassified = (y_test != y_pred).sum()
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 14088
Accuracy: 0.13


In [95]:
#lets move on to looking at different models
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
bagging = BaggingClassifier(KNeighborsClassifier(), n_estimators=10)

  from numpy.core.umath_tests import inner1d


In [96]:
bag_knn = BaggingClassifier(KNeighborsClassifier(n_neighbors=5),
n_estimators=10, max_samples=0.5,
bootstrap=True, random_state=3)

In [98]:
#let's try bagging with the full data set
bag_knn.fit(x_train, y_train)
bag_knn.score(x_test, y_test)
#accuracy went from 23% to 31% so that's a good improvement

0.3163196363859714

In [102]:
#can't try the targeted training data because bagging wants the same number of features
#so let's try bagging regressor
from sklearn.ensemble import BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
bag_tree = BaggingRegressor(DecisionTreeRegressor(),max_features=0.5, n_estimators=10,random_state=3)

In [103]:
#well that didn't work at all.
bag_tree.fit(x_train, y_train)
bag_tree.score(x_test, y_test)

0.0

In [105]:
#random forest trial
from sklearn.ensemble import RandomForestClassifier

In [106]:
clf=RandomForestClassifier(n_estimators=20)
clf.fit(x_train,y_train)
y_pred2=clf.predict(x_test)

In [112]:
#check accuracy again
#so with random forest getting 2.5% better accuracy than bagging
count_misclassified = (y_test != y_pred2).sum()
print('Misclassified samples: {}'.format(count_misclassified))
accuracy = metrics.accuracy_score(y_test, y_pred2)
print('Accuracy: {:.2f}'.format(accuracy))

Misclassified samples: 10755
Accuracy: 0.34


In [114]:
#Let's try xgboost
import xgboost as xgb

In [119]:
xg_reg = xgb.XGBRegressor(objective ='reg:linear', learning_rate = 0.1, max_depth = 5, alpha = 10, n_estimators = 10)

In [126]:
xg_reg.fit(x_train.values,y_train.values)
preds = xg_reg.predict(x_test.values)

In [128]:
from sklearn.metrics import mean_squared_error
rmse = np.sqrt(mean_squared_error(y_test, preds))
print("RMSE: %f" % (rmse))

RMSE: 0.583782


In [129]:
#Problem 3 - out of the different methods, random forests gave the highest accuracy

In [160]:
data_ =pd.read_csv("adult.data",names=cols,sep=', ')
test_ = pd.read_csv("adult.test",names=cols)

  """Entry point for launching an IPython kernel.


In [153]:
#drop first row with bad data
test_.drop(0, inplace=True)
test_.reset_index(drop=True,inplace=True)

In [180]:
Categorical_cols = [i for i in data_.columns if type(data_[i][0])==str]

In [181]:
Numeric_cols = list(set(data_.columns.tolist())- set(Categorical_cols))

In [184]:
data_[Numeric_cols].head()

Unnamed: 0,HrsWeek,CapitalLoss,CapitalGain,Age,FinalWeight,EduNumber
0,40,0,2174,39,77516,13
1,13,0,0,50,83311,13
2,40,0,0,38,215646,9
3,40,0,0,53,234721,7
4,40,0,0,28,338409,13


In [186]:
dfStrEncode = pd.get_dummies(data=data_[Categorical_cols])
dfStrEncode.astype('int32',copy=True)
dfStrEncTest = pd.get_dummies(data=test_[Categorical_cols])
dfStrEncTest.astype('int32',copy=True)
dfStrEncTest.tail()

Unnamed: 0,Workclass_ ?,Workclass_ Federal-gov,Workclass_ Local-gov,Workclass_ Never-worked,Workclass_ Private,Workclass_ Self-emp-inc,Workclass_ Self-emp-not-inc,Workclass_ State-gov,Workclass_ Without-pay,Education_ 10th,...,NativeCountry_ Scotland,NativeCountry_ South,NativeCountry_ Taiwan,NativeCountry_ Thailand,NativeCountry_ Trinadad&Tobago,NativeCountry_ United-States,NativeCountry_ Vietnam,NativeCountry_ Yugoslavia,Salary_ <=50K.,Salary_ >50K.
16277,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
16278,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
16279,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
16280,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
16281,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,1,0,0,0,1


In [204]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() # default=(0, 1)
Numeric_cols2 =scaler.fit_transform(data_[Numeric_cols])


In [211]:
df = pd.DataFrame(Numeric_cols2,columns=data_[Numeric_cols].columns)


In [212]:
df.head()

Unnamed: 0,HrsWeek,CapitalLoss,CapitalGain,Age,FinalWeight,EduNumber
0,0.397959,0.0,0.02174,0.30137,0.044302,0.8
1,0.122449,0.0,0.0,0.452055,0.048238,0.8
2,0.397959,0.0,0.0,0.287671,0.138113,0.533333
3,0.397959,0.0,0.0,0.493151,0.151068,0.4
4,0.397959,0.0,0.0,0.150685,0.221488,0.8


In [208]:
Numeric_cols2[5:]

array([[0.39795918, 0.        , 0.        , 0.2739726 , 0.18493161,
        0.86666667],
       [0.15306122, 0.        , 0.        , 0.43835616, 0.10044824,
        0.26666667],
       [0.44897959, 0.        , 0.        , 0.47945205, 0.13403581,
        0.53333333],
       ...,
       [0.39795918, 0.        , 0.        , 0.56164384, 0.09482688,
        0.53333333],
       [0.19387755, 0.        , 0.        , 0.06849315, 0.12849934,
        0.53333333],
       [0.39795918, 0.        , 0.1502415 , 0.47945205, 0.18720338,
        0.53333333]])

In [213]:
datanew= pd.concat([df,dfStrEncode],axis=1)

In [220]:
datanew["Workclass_?"].sum()

0

In [218]:
datanew=datanew.loc[datanew["Workclass_?"]!=1,:]

In [222]:
datanew.drop(["Workclass_?"],axis=1,inplace=True)

In [198]:
help(pd.concat)

Help on function concat in module pandas.core.reshape.concat:

concat(objs, axis=0, join='outer', join_axes=None, ignore_index=False, keys=None, levels=None, names=None, verify_integrity=False, sort=None, copy=True)
    Concatenate pandas objects along a particular axis with optional set logic
    along the other axes.
    
    Can also add a layer of hierarchical indexing on the concatenation axis,
    which may be useful if the labels are the same (or overlapping) on
    the passed axis number.
    
    Parameters
    ----------
    objs : a sequence or mapping of Series, DataFrame, or Panel objects
        If a dict is passed, the sorted keys will be used as the `keys`
        argument, unless it is passed, in which case the values will be
        selected (see below). Any None objects will be dropped silently unless
        they are all None in which case a ValueError will be raised
    axis : {0/'index', 1/'columns'}, default 0
        The axis to concatenate along
    join : {'in

In [189]:
datanew= pd.concat(data_.Categorical_cols,data_.Numeric_cols,axis=1)

AttributeError: 'DataFrame' object has no attribute 'Categorical_cols'

In [188]:
dfStrEncode.head()

Unnamed: 0,Workclass_?,Workclass_Federal-gov,Workclass_Local-gov,Workclass_Never-worked,Workclass_Private,Workclass_Self-emp-inc,Workclass_Self-emp-not-inc,Workclass_State-gov,Workclass_Without-pay,Education_10th,...,NativeCountry_Scotland,NativeCountry_South,NativeCountry_Taiwan,NativeCountry_Thailand,NativeCountry_Trinadad&Tobago,NativeCountry_United-States,NativeCountry_Vietnam,NativeCountry_Yugoslavia,Salary_<=50K,Salary_>50K
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,1,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [85]:
from sklearn.ensemble import AdaBoostClassifier
clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters={'n_estimators':[120], 
              'learning_rate':[0.1],
              'base_estimator__min_samples_split' : np.arange(2, 8, 2),
              'base_estimator__max_depth' : np.arange(1, 4, 1)}


  from numpy.core.umath_tests import inner1d


In [224]:
datanew.columns.tolist()

['HrsWeek',
 'CapitalLoss',
 'CapitalGain',
 'Age',
 'FinalWeight',
 'EduNumber',
 'Workclass_Federal-gov',
 'Workclass_Local-gov',
 'Workclass_Never-worked',
 'Workclass_Private',
 'Workclass_Self-emp-inc',
 'Workclass_Self-emp-not-inc',
 'Workclass_State-gov',
 'Workclass_Without-pay',
 'Education_10th',
 'Education_11th',
 'Education_12th',
 'Education_1st-4th',
 'Education_5th-6th',
 'Education_7th-8th',
 'Education_9th',
 'Education_Assoc-acdm',
 'Education_Assoc-voc',
 'Education_Bachelors',
 'Education_Doctorate',
 'Education_HS-grad',
 'Education_Masters',
 'Education_Preschool',
 'Education_Prof-school',
 'Education_Some-college',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married-AF-spouse',
 'MaritalStatus_Married-civ-spouse',
 'MaritalStatus_Married-spouse-absent',
 'MaritalStatus_Never-married',
 'MaritalStatus_Separated',
 'MaritalStatus_Widowed',
 'Job_?',
 'Job_Adm-clerical',
 'Job_Armed-Forces',
 'Job_Craft-repair',
 'Job_Exec-managerial',
 'Job_Farming-fishing',
 'Job

In [226]:
NativeCountry_save=datanew["NativeCountry_?"]
Job_save=datanew["Job_?"]

In [228]:
datanew.loc[datanew["Job_?"]!=1,:].shape[0]

30718

In [234]:
datanew.rename(columns={"NativeCountry_?":"NativeCountry_NoInfo","Job_?":"Job_NoInfo"},inplace=True)

In [235]:
datanew.columns.tolist()

['HrsWeek',
 'CapitalLoss',
 'CapitalGain',
 'Age',
 'FinalWeight',
 'EduNumber',
 'Workclass_Federal-gov',
 'Workclass_Local-gov',
 'Workclass_Never-worked',
 'Workclass_Private',
 'Workclass_Self-emp-inc',
 'Workclass_Self-emp-not-inc',
 'Workclass_State-gov',
 'Workclass_Without-pay',
 'Education_10th',
 'Education_11th',
 'Education_12th',
 'Education_1st-4th',
 'Education_5th-6th',
 'Education_7th-8th',
 'Education_9th',
 'Education_Assoc-acdm',
 'Education_Assoc-voc',
 'Education_Bachelors',
 'Education_Doctorate',
 'Education_HS-grad',
 'Education_Masters',
 'Education_Preschool',
 'Education_Prof-school',
 'Education_Some-college',
 'MaritalStatus_Divorced',
 'MaritalStatus_Married-AF-spouse',
 'MaritalStatus_Married-civ-spouse',
 'MaritalStatus_Married-spouse-absent',
 'MaritalStatus_Never-married',
 'MaritalStatus_Separated',
 'MaritalStatus_Widowed',
 'Job_NoInfo',
 'Job_Adm-clerical',
 'Job_Armed-Forces',
 'Job_Craft-repair',
 'Job_Exec-managerial',
 'Job_Farming-fishing',


In [240]:
datanew.drop("Salary_<=50K",axis=1,inplace=True)

In [None]:
datanew.rename(columns={"Job_?":"Job_NoInfo"}).columns.tolist()

In [None]:
datanew=datanew.loc[datanew["NativeCountry_?"]!=1,:]
datanew=datanew.loc[datanew["Job_?"]!=1,:]

In [297]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [298]:
m1=DecisionTreeClassifier()
m2=GaussianNB()
m3=RandomForestClassifier()

m4 = LogisticRegression()

In [251]:
input_cols = [i for i in datanew.columns if i!= 'Salary_>50K']
Target_ = 'Salary_>50K'

In [253]:
X = datanew[input_cols].values
y = datanew[Target_].values

In [258]:
from sklearn.model_selection import train_test_split

In [261]:
X_Train,X_Test,Y_train,Y_Test = train_test_split(X,y,test_size = 0.3,random_state = 1627)

In [None]:
#Decision Tree

In [299]:
m1.fit(X_Train,Y_train)
print(recall_score(Y_Test,m1.predict(X_Test)))
print(accuracy_score(Y_Test,m1.predict(X_Test)))

0.6191923577941815
0.8078758949880668


In [300]:
#KNN

In [301]:
m2.fit(X_Train,Y_train)
print(recall_score(Y_Test,m2.predict(X_Test)))
print(accuracy_score(Y_Test,m2.predict(X_Test)))

0.9652627008250109
0.4206986331091343


In [283]:
# Random Forest

In [302]:
m3.fit(X_Train,Y_train)
print(recall_score(Y_Test,m3.predict(X_Test)))
print(accuracy_score(Y_Test,m3.predict(X_Test)))

0.592270950933565
0.8393360815795183


In [323]:
Ensemble_DF_Train = pd.DataFrame({'DT':list(m1.predict(X_Train)),'NB':list(m2.predict(X_Train)),'RF':list(m3.predict(X_Train)),'Actual':list(Y_train)})

In [321]:
X_Ensemble = Ensemble_DF_Train.iloc[:,:2].values
Y_Ensemble = Ensemble_DF_Train.iloc[:,3].values

In [324]:
Ensemble_DF_Test = pd.DataFrame({'DT':list(m1.predict(X_Test)),'NB':list(m2.predict(X_Test)),'RF':list(m3.predict(X_Test)),'Actual':list(Y_Test)})

In [325]:
Ensemble_X_Test = Ensemble_DF_Test.iloc[:,:2].values
Ensemble_Y_Test = Ensemble_DF_Test.iloc[:,3].values

In [322]:
m4.fit(X_Ensemble,Y_Ensemble)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [328]:
print(accuracy_score(Ensemble_Y_Test,m4.predict(Ensemble_X_Test)))
print(recall_score(Ensemble_Y_Test,m4.predict(Ensemble_X_Test)))

0.8078758949880668
0.6191923577941815


In [145]:
from sklearn.metrics import accuracy_score,fbeta_score
from sklearn.metrics import make_scorer
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV

clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters={'n_estimators':[120], 
              'learning_rate':[0.1],
              'base_estimator__min_samples_split' : np.arange(2, 8, 2),
              'base_estimator__max_depth' : np.arange(1, 4, 1)}
scorer = make_scorer(fbeta_score, beta=0.5)
grid_obj = GridSearchCV(clf, parameters, scoring=scorer)
grid_fit = grid_obj.fit(datanew, test_)
best_clf = grid_fit.best_estimator_
predictions = (clf.fit(datanew, test_)).predict(x_test)
best_predictions = best_clf.predict(x_test)
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(fbeta_score(y_test, predictions, beta = 0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(fb))



TypeError: '<' not supported between instances of 'float' and 'str'