In [1]:
import pandas as pd
from scipy import io
import numpy as np

In [2]:
arff_file = io.arff.loadarff('phpMawTba.arff')

In [3]:
adult_census = pd.DataFrame(arff_file[0])

# convert bytes columns to strings

str_df = adult_census.select_dtypes([object])
str_df = str_df.stack().str.decode('utf-8').unstack()

for col in str_df:
    adult_census[col] = str_df[col]

In [4]:
#select only numerical columns to fit model + target
adult_census_num = adult_census[['age', 'capital-gain','capital-loss','hours-per-week', 'class']]

In [5]:
#create training and test set, number chosen based on in class notebook
data = adult_census_num.sample(n = 39073)
outer = adult_census_num.merge(data, how = 'outer', indicator = True)
test = outer[(outer._merge=='left_only')].drop('_merge', axis = 1)

In [6]:
target_name = "class"
target = data[target_name]
target

36958    <=50K
21845    <=50K
24778     >50K
43814    <=50K
11235     >50K
         ...  
34081    <=50K
23156    <=50K
14426     >50K
87        >50K
26736     >50K
Name: class, Length: 39073, dtype: object

In [7]:
data = data.drop(columns = [target_name])
data.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
36958,24.0,0.0,0.0,17.0
21845,46.0,0.0,0.0,40.0
24778,35.0,0.0,0.0,40.0
43814,23.0,0.0,0.0,50.0
11235,36.0,0.0,0.0,40.0


In [8]:
from sklearn.neighbors import KNeighborsClassifier

In [9]:
model = KNeighborsClassifier(n_neighbors = 50)

In [10]:
_ = model.fit(data, target)

In [11]:
target_predicted = model.predict(data)

In [12]:
target[:10] == target_predicted[:10]

36958     True
21845     True
24778    False
43814     True
11235    False
844       True
22891     True
2655      True
33884     True
39145     True
Name: class, dtype: bool

In [13]:
accuracy = model.score(data, target)
model_name = model.__class__.__name__

print(f"The training accuracy using a {model_name} is {accuracy:.3f}")

The training accuracy using a KNeighborsClassifier is 0.827


In [14]:
#set up test data
target_test = test[target_name]
test = test.drop(columns = [target_name])

In [15]:
accuracy = model.score(test, target_test)
print(f"The test accuracy using a {model_name} is {accuracy:.3f}")

The test accuracy using a KNeighborsClassifier is 0.824
