In [1]:
import pandas as pd
from scipy import io
import numpy as np

In [2]:
arff_file = io.arff.loadarff('phpMawTba.arff')

In [3]:
adult_census = pd.DataFrame(arff_file[0])

# convert bytes columns to strings

str_df = adult_census.select_dtypes([object])
str_df = str_df.stack().str.decode('utf-8').unstack()

for col in str_df:
    adult_census[col] = str_df[col]

In [4]:
adult_census.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,?,103497.0,Some-college,10.0,Never-married,?,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [5]:
#select only numerical columns to fit model + target
adult_census_num = adult_census[['age', 'capital-gain','capital-loss','hours-per-week', 'class']]

In [6]:
adult_census_num.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class
0,25.0,0.0,0.0,40.0,<=50K
1,38.0,0.0,0.0,50.0,<=50K
2,28.0,0.0,0.0,40.0,>50K
3,44.0,7688.0,0.0,40.0,>50K
4,18.0,0.0,0.0,30.0,<=50K


In [7]:
#create training and test set, number chosen based on in class notebook
data = adult_census_num.sample(n = 39073)
outer = adult_census_num.merge(data, how = 'outer', indicator = True)
test = outer[(outer._merge=='left_only')].drop('_merge', axis = 1)

In [8]:
data.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week,class
23856,42.0,0.0,0.0,50.0,<=50K
37292,21.0,0.0,0.0,40.0,<=50K
35764,53.0,0.0,1902.0,40.0,>50K
36599,38.0,0.0,0.0,50.0,<=50K
39583,59.0,0.0,0.0,50.0,<=50K


In [9]:
target_name = "class"
target = data[target_name]
target

23856    <=50K
37292    <=50K
35764     >50K
36599    <=50K
39583    <=50K
         ...  
13903     >50K
44421    <=50K
9188     <=50K
33634    <=50K
32118    <=50K
Name: class, Length: 39073, dtype: object

In [10]:
data = data.drop(columns = [target_name])
data.head()

Unnamed: 0,age,capital-gain,capital-loss,hours-per-week
23856,42.0,0.0,0.0,50.0
37292,21.0,0.0,0.0,40.0
35764,53.0,0.0,1902.0,40.0
36599,38.0,0.0,0.0,50.0
39583,59.0,0.0,0.0,50.0


In [11]:
data.columns

Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week'], dtype='object')

In [12]:
print(
    f"The dataset contains {data.shape[0]} samples and "
    f"{data.shape[1]} features"
)

The dataset contains 39073 samples and 4 features


In [13]:
#fit K-nearest neighbors model

from sklearn.neighbors import KNeighborsClassifier

model = KNeighborsClassifier()
_ = model.fit(data, target)

In [14]:
target_predicted = model.predict(data)

In [15]:
target_predicted[:5]

array(['<=50K', '<=50K', '>50K', '>50K', '<=50K'], dtype=object)

In [16]:
#compare predicted to actual data
target[:5]

23856    <=50K
37292    <=50K
35764     >50K
36599    <=50K
39583    <=50K
Name: class, dtype: object

In [17]:
target[:5] == target_predicted[:5]

23856     True
37292     True
35764     True
36599    False
39583     True
Name: class, dtype: bool

In [18]:
print(
    "Number of correct predictions: "
    f"{(target[:5] == target_predicted[:5]).sum()} / 5"
)

Number of correct predictions: 4 / 5


In [19]:
(target == target_predicted).mean()

0.8136564891357203

In [20]:
#set up test data
target_test = test[target_name]
test = test.drop(columns = [target_name])

In [21]:
print(
    f"The test dataset contains {test.shape[0]} samples and "
    f"{test.shape[1]} features."
)

The test dataset contains 1128 samples and 4 features.


In [22]:
#use score method instead of manually computing the average success rate
accuracy = model.score(test, target_test)
model_name = model.__class__.__name__

print(f"The test accuracy using a {model_name} is {accuracy:.3f}")

The test accuracy using a KNeighborsClassifier is 0.840
