In [19]:
import numpy as np
import pandas as pd

train_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data', header = None)

test_set = pd.read_csv('http://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test', skiprows = 1, header = None)

col_labels = ['age', 'workclass', 'fnlwgt', 'education', 'education_num', 'marital_status','occupation','relationship', 'race', 'sex', 'capital_gain', 'capital_loss', 'hours_per_week','native_country', 'wage_class']
train_set.columns = col_labels
test_set.columns = col_labels

In [20]:
test_set.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,wage_class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K.
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K.
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K.
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K.
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K.


In [36]:
#handle Salary columns  <=50K =1 , >50 = 0
traintemp = train_set.wage_class.str.strip()
testtemp = test_set.wage_class.str.strip()
traintemp = pd.Series(np.where(traintemp == '<=50K', 0,1))
testtemp = pd.Series(np.where(testtemp == '<=50K', 0,1))
y_train = pd.DataFrame(data=traintemp,columns=["wage_class"])
y_test = pd.DataFrame(data=testtemp,columns=["wage_class"])

In [38]:
#remove wage_class columns
train_set.drop("wage_class",axis=1,inplace=True)
test_set.drop("wage_class",axis=1,inplace=True)

In [39]:
#XGBoost only takes numeric data, so we'll have to split things up and then dummify the string columns
Cat_cols = [i for i in train_set.columns if type(train_set[i][0])==str]
CatTest_cols = [i for i in test_set.columns if type(train_set[i][0])==str]
Num_cols = list(set(train_set.columns.tolist())- set(Cat_cols))
NumTest_cols = list(set(test_set.columns.tolist())- set(CatTest_cols))

In [40]:
dfStrEncode = pd.get_dummies(data=train_set[Cat_cols])
dfStrEncode.astype('int32',copy=True)
dfStrEncTest = pd.get_dummies(data=test_set[CatTest_cols])
dfStrEncTest.astype('int32',copy=True)
dfStrEncTest.tail()

Unnamed: 0,workclass_ ?,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
16276,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16277,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16278,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16279,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
16280,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [41]:
#now that we have the categorical data in a new DF, let's remove those columns from data and test
train_set.drop(Cat_cols, axis=1, inplace=True)
test_set.drop(CatTest_cols,axis=1, inplace=True)

In [42]:
#the test data is missing an entry for NativeCountry_ Holand-Netherlands, so I'll insert a column in test data with all zeros
a = np.zeros(shape=(16281,1))
dfHoland = pd.DataFrame(a,columns=['native_country_ Holand-Netherlands'])

In [43]:
dfStrEncode.columns.get_loc('native_country_ Holand-Netherlands')

75

In [44]:
#not sure if column order is important for decision trees so I'm going to put them back in order data vs test
temp1 = dfStrEncTest.iloc[:,0:75]
temp2 = dfStrEncTest.iloc[:,75:]

In [45]:
dfStrEncTest2 = pd.concat([temp1,dfHoland,temp2],axis=1)

In [46]:
#deal with missing data/columns from missing data(?)
dfStrEncTest2.rename(columns={"native_country_ ?":"native_country_NoInfo","job_ ?":"job_NoInfo",'workclass_ ?':"workclass_ NoInfo"},inplace=True)
dfStrEncode.rename(columns={"native_country_ ?":"native_country_NoInfo","job_ ?":"job_NoInfo",'workclass_ ?':"workclass_ NoInfo"},inplace=True)

In [47]:
dfStrEncode.head()

Unnamed: 0,workclass_ NoInfo,workclass_ Federal-gov,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,education_ 10th,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
train_set.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,39,77516,13,2174,0,40
1,50,83311,13,0,0,13
2,38,215646,9,0,0,40
3,53,234721,7,0,0,40
4,28,338409,13,0,0,40


In [49]:
#let's normalize the numerical data
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler() # default=(0, 1)
train2 =pd.DataFrame(data=scaler.fit_transform(train_set),columns=train_set.columns)
test2 =pd.DataFrame(data=scaler.fit_transform(test_set),columns=test_set.columns)

In [50]:
train2.head()

Unnamed: 0,age,fnlwgt,education_num,capital_gain,capital_loss,hours_per_week
0,0.30137,0.044302,0.8,0.02174,0.0,0.397959
1,0.452055,0.048238,0.8,0.0,0.0,0.122449
2,0.287671,0.138113,0.533333,0.0,0.0,0.397959
3,0.493151,0.151068,0.4,0.0,0.0,0.397959
4,0.150685,0.221488,0.8,0.0,0.0,0.397959


In [55]:
x_train = pd.concat([dfStrEncode,train2],axis=1)
x_test  = pd.concat([dfStrEncTest2,test2],axis=1)

In [56]:
from sklearn.metrics import accuracy_score
from xgboost.sklearn import XGBClassifier

In [57]:
params = {
'objective':'binary:logistic',
'max_depth':3,
'silent':1,
'eta':1
}
num_rounds = 10

In [58]:
bst = XGBClassifier(**params).fit(x_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


In [59]:
preds = bst.predict(x_test)
preds

  if diff:


array([0, 0, 0, ..., 1, 0, 1])

In [76]:
correct = 0
for i in range(len(preds)):
    if y_test.iloc[i]["wage_class"] == preds[i]:
        correct += 1
acc = accuracy_score(y_test, preds)
print('Predicted correctly: {0}/{1}'.format(correct, len(preds)))
print('Error: {0:.4f}'.format(1-acc))

Predicted correctly: 2875/16281
Error: 0.8234


Terrible results.  We worked with this same dataset in Project 3 which also had terrible results.  I think the testing dataset is bad.