# Binary Classification of Adult dateset 

In [1]:
# This is a binary classification of Adult dataset on target 'income' for 4 classifiers:
# Gradient Boosting
# Linear SVC
# Logistic Regression
# Random Forest

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

from sklearn import metrics, preprocessing, model_selection
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting, RandomForestClassifier as RandomForest
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

  from numpy.core.umath_tests import inner1d


In [6]:
# read data
filepath = "../data/adult_all.csv"
dataset = pd.read_csv(filepath, sep=r'\s*,\s*', na_values="*", engine='python', index_col=0)

In [7]:
dataset.head()

Unnamed: 0_level_0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
NodeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Non-US,<=50K


In [8]:
# Preprocessing
def number_encode_features(ds):
    result = ds.copy()
    encoders = {}
    for feature in result.columns:
        if result.dtypes[feature] == np.object:
            encoders[feature] = preprocessing.LabelEncoder()
            result[feature] = encoders[feature].fit_transform(result[feature].astype(str))
    return result, encoders

dataset_encoded, encoders = number_encode_features(dataset)

In [9]:
# Target will be 'income'
y = dataset_encoded['income']
X = dataset_encoded.drop('income', axis=1)

In [10]:
# Preprocessing with binary attributes
binary_data = pd.get_dummies(dataset)
binary_data.head()

Unnamed: 0_level_0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass_Federal-gov,workclass_Local-gov,workclass_Private,workclass_Self-emp-inc,workclass_Self-emp-not-inc,...,race_Asian-Pac-Islander,race_Black,race_Other,race_White,sex_Female,sex_Male,native-country_Non-US,native-country_United-States,income_<=50K,income_>50K
NodeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,39,13,2174,0,40,0,0,0,0,0,...,0,0,0,1,0,1,0,1,1,0
1,50,13,0,0,13,0,0,0,0,1,...,0,0,0,1,0,1,0,1,1,0
2,38,9,0,0,40,0,0,1,0,0,...,0,0,0,1,0,1,0,1,1,0
3,53,7,0,0,40,0,0,1,0,0,...,0,1,0,0,0,1,0,1,1,0
4,28,13,0,0,40,0,0,1,0,0,...,0,1,0,0,1,0,1,0,1,0


In [11]:
del binary_data['sex_Male'], binary_data['native-country_Non-US'], binary_data['income_<=50K']

In [12]:
y_bin = binary_data['income_>50K']
X_bin = binary_data.drop('income_>50K', axis=1)

## 1. Gradient Boosting

In [13]:
# Gradient Boosting
clf = GradientBoosting(random_state=0)

In [14]:
# cross validation scores on number encoded data
scores = model_selection.cross_val_score(clf, X, y, cv=10, scoring='f1')
print("F1 score (number encoded data): %0.2f (+/- %0.2f)" 
      % (scores.mean(), scores.std() * 2))

F1 score (number encoded data): 0.68 (+/- 0.02)


In [15]:
# cross validation scores on binary data
scores = model_selection.cross_val_score(clf, X_bin, y_bin, cv=10, scoring='f1')
print("F1 score: %0.2f (+/- %0.2f)" 
      % (scores.mean(), scores.std() * 2))

F1 score: 0.69 (+/- 0.02)


### Result

Obtained F1 score: 0.69

F1 score from the paper: 0.70

## 2. Linear SVC

In [16]:
# Linear SVC
clf = LinearSVC(random_state=0)

In [17]:
# cross validation scores for number encoded data
scores = model_selection.cross_val_score(clf, X, y, cv=10, scoring='f1')
print("F1 score: %0.2f (+/- %0.2f)" 
      % (scores.mean(), scores.std() * 2))

  'precision', 'predicted', average, warn_for)


F1 score: 0.23 (+/- 0.36)


In [18]:
# cross validation scores for binary data 
scores = model_selection.cross_val_score(clf, X_bin, y_bin, cv=10, scoring='f1')
print("F1 score: %0.2f (+/- %0.2f)" 
      % (scores.mean(), scores.std() * 2))

  'precision', 'predicted', average, warn_for)


F1 score: 0.44 (+/- 0.40)


### Result

Obtained F1 score: 0.77

F1 score from the paper: 0.66

## 3. Logistic Regression

In [19]:
clf = LogisticRegression(random_state=0)

In [20]:
# cross validation scores for number encoded data
scores = model_selection.cross_val_score(clf, X, y, cv=10, scoring='f1')
print("F1 score: %0.2f (+/- %0.2f)" 
      % (scores.mean(), scores.std() * 2))

F1 score: 0.55 (+/- 0.04)


In [21]:
# cross validation scores for binary data 
scores = model_selection.cross_val_score(clf, X_bin, y_bin, cv=10, scoring='f1')
print("F1 score: %0.2f (+/- %0.2f)" 
      % (scores.mean(), scores.std() * 2))

F1 score: 0.66 (+/- 0.02)


### Result

Obtained F1 score: 0.66

F1 score from the paper: 0.66

## 4. Random Forest

In [22]:
clf = RandomForest(random_state=0)

In [23]:
# cross validation scores for number encoded data
scores = model_selection.cross_val_score(clf, X, y, cv=10, scoring='f1')
print("F1 score: %0.2f (+/- %0.2f)" 
      % (scores.mean(), scores.std() * 2))

F1 score: 0.66 (+/- 0.03)


In [24]:
# cross validation scores for binary data 
scores = model_selection.cross_val_score(clf, X_bin, y_bin, cv=10, scoring='f1')
print("F1 score: %0.2f (+/- %0.2f)" 
      % (scores.mean(), scores.std() * 2))

F1 score: 0.66 (+/- 0.03)


### Result

Obtained F1 score: 0.66

F1 score from the paper: 0.67