# Working with scikit-learn

In [1]:
import pandas as pd
from tabulate import tabulate

## Pre-processing

In [2]:
dataset = pd.read_csv('adult_data.csv', index_col=0)

to_drop = ['fnlwgt', 'education', 'capital-gain', 'capital-loss', 'native-country']
clean_dataset = dataset.drop(columns=to_drop)
clean_dataset['income'] = clean_dataset['income'].map({'<=50K.':0, '<=50K':0, '>50K':1, '>50K.':1})
assert clean_dataset.isna().sum()['income'] == 0

In [3]:
clean_dataset.isna().sum()

age                  0
workClass         2799
education-num        0
marital-status       0
occupation        2809
relationship         0
race                 0
sex                  0
hours-per-week       0
income               0
dtype: int64

In [4]:
clean_dataset['occupation'].value_counts()

Prof-specialty       6172
Craft-repair         6112
Exec-managerial      6086
Adm-clerical         5611
Sales                5504
Other-service        4923
Machine-op-inspct    3022
Transport-moving     2355
Handlers-cleaners    2072
Farming-fishing      1490
Tech-support         1446
Protective-serv       983
Priv-house-serv       242
Armed-Forces           15
Name: occupation, dtype: int64

In [5]:
clean_dataset['workClass'].value_counts()

Private             33906
Self-emp-not-inc     3862
Local-gov            3136
State-gov            1981
Self-emp-inc         1695
Federal-gov          1432
Without-pay            21
Never-worked           10
Name: workClass, dtype: int64

Hard to replace all missing values by most represented value. Prof and Craft nearly equally represented.

In [6]:
clean_dataset.fillna({'occupation': 'Prof-specialty', 'workClass': 'Private'}, inplace=True)
clean_dataset.isna().sum()

age               0
workClass         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
hours-per-week    0
income            0
dtype: int64

In [7]:
clean_dataset

Unnamed: 0,age,workClass,education-num,marital-status,occupation,relationship,race,sex,hours-per-week,income
0,36,State-gov,1,Never-married,Adm-clerical,Not-in-family,White,Male,74,0
1,54,Self-emp-not-inc,25,Married-civ-spouse,Exec-managerial,Husband,White,Male,39,0
2,26,Private,3,Divorced,Handlers-cleaners,Not-in-family,White,Male,71,0
3,71,Private,9,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,10,0
4,33,Private,20,Married-civ-spouse,Prof-specialty,Wife,Black,Female,75,0
...,...,...,...,...,...,...,...,...,...,...
16276,70,Private,24,Divorced,Prof-specialty,Not-in-family,White,Female,15,0
16277,89,Private,0,Widowed,Prof-specialty,Other-relative,Black,Male,63,0
16278,62,Private,4,Married-civ-spouse,Prof-specialty,Husband,White,Male,59,0
16279,81,Private,1,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,68,0


In [8]:
categorical = ['workClass', 'marital-status', 'occupation', 'relationship', 'race', 'sex']
try:
    clean_dataset = pd.get_dummies(clean_dataset, columns=categorical)
except KeyError:
    print('Dataset has already beeen converted to categorical')
clean_dataset.head()

Unnamed: 0,age,education-num,hours-per-week,income,workClass_Federal-gov,workClass_Local-gov,workClass_Never-worked,workClass_Private,workClass_Self-emp-inc,workClass_Self-emp-not-inc,...,relationship_Own-child,relationship_Unmarried,relationship_Wife,race_Amer-Indian-Eskimo,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male
0,36,1,74,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,1
1,54,25,39,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,1,0,1
2,26,3,71,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
3,71,9,10,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
4,33,20,75,0,0,0,0,1,0,0,...,0,0,1,0,0,1,0,0,1,0


Class imbalance

In [9]:
clean_dataset['income'].value_counts()

0    37155
1    11687
Name: income, dtype: int64

## Splitting the dataset

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix

In [11]:
input_columns = clean_dataset.drop(columns=['income'])
output_column = clean_dataset['income']
x_train, x_test, y_train, y_test = train_test_split(input_columns, output_column, test_size=0.3, random_state=42)

## Classification with K-Nearest Neighbour Classifier

In [12]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
model = KNeighborsClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [14]:
accuracy = accuracy_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'Accuracy : {accuracy:.2%}')
print(confusion_matrix(y_test, y_pred))
#pd.DataFrame(dict(tp=tp, tn=tn))

Accuracy : 72.74%
[[9764 1345]
 [2649  895]]


## Classification with K-Nearest Neighbour Classifier

In [15]:
from sklearn.tree import DecisionTreeClassifier

In [16]:
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [17]:
accuracy = accuracy_score(y_test, y_pred)
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print(f'Accuracy : {accuracy:.2%}')
print(confusion_matrix(y_test, y_pred))

Accuracy : 74.13%
[[9243 1866]
 [1924 1620]]
