# Multi-class Classification of Adult Dataset

In [1]:
# This notebook performs multi-class classification on the Adult dataset with target 'marital-status', using:
# Gradient Boosting
# Linear SVC
# Logistic Regression
# Random Forest

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np

from sklearn import metrics, preprocessing, model_selection
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting, RandomForestClassifier as RandomForest
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

  from numpy.core.umath_tests import inner1d


In [3]:
# Load the CSV into a DataFrame. The separator regex swallows any whitespace
# around the commas (which is why the python parsing engine is requested),
# "*" cells are read as missing values, and the first column becomes the index.
filepath = "../data/adult_all.csv"
dataset = pd.read_csv(
    filepath,
    index_col=0,
    engine='python',
    sep=r'\s*,\s*',
    na_values="*",
)

In [4]:
# Preview the first rows of the raw (not yet encoded) dataset.
dataset.head()

Unnamed: 0_level_0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
NodeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Non-US,<=50K


In [5]:
# Preprocessing
def number_encode_features(ds):
    """Label-encode every text column of a DataFrame.

    Parameters
    ----------
    ds : pandas.DataFrame
        Input frame. It is copied, never modified.

    Returns
    -------
    result : pandas.DataFrame
        Copy of ``ds`` in which each object/string-dtype column has been
        replaced by the integer codes produced by a ``LabelEncoder``.
        Columns of any other dtype are left untouched.
    encoders : dict
        Maps the name of each encoded column to the ``LabelEncoder`` fitted
        on it, so the codes can be mapped back to the original labels.
    """
    result = ds.copy()
    encoders = {}
    for feature in result.columns:
        dtype = result.dtypes[feature]
        # The ``np.object`` alias was removed from NumPy, so rely on pandas'
        # dtype predicates instead; they also recognise pandas' dedicated
        # string dtype in addition to plain ``object`` columns.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            encoders[feature] = preprocessing.LabelEncoder()
            # Casting to str first makes missing values an ordinary label
            # (they become the text "nan") instead of breaking the encoder.
            result[feature] = encoders[feature].fit_transform(result[feature].astype(str))
    return result, encoders

# Encode the raw dataset; `encoders` keeps the fitted encoder of every encoded column.
dataset_encoded, encoders = number_encode_features(dataset)

In [6]:
# Preview the first rows after encoding.
dataset_encoded.head()

Unnamed: 0_level_0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
NodeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,39,5,13,4,0,1,4,1,2174,0,40,1,0
1,50,4,13,2,3,0,4,1,0,0,13,1,0
2,38,2,9,0,5,1,4,1,0,0,40,1,0
3,53,2,7,2,5,0,2,1,0,0,40,1,0
4,28,2,13,2,9,5,2,0,0,0,40,0,0


In [7]:
# Separate the encoded frame into the feature matrix X (every column except
# 'marital-status') and the prediction target y ('marital-status').
X = dataset_encoded.drop(columns=['marital-status'])
y = dataset_encoded['marital-status']

In [8]:
# Scale
# Standardise every feature column and rebuild the DataFrame with the original
# column names AND the original row index, so X stays label-aligned with y
# (without `index=` the result would silently get a fresh 0..n-1 index).
# NOTE(review): the scaler is fitted on the full dataset before the
# cross-validation below, so fold statistics are shared across train/test
# splits — consider moving the scaler into a Pipeline with each classifier.
scaler = preprocessing.StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [9]:
def f1_micro(clf, X, y, cv=10):
    """Cross-validate ``clf`` and print its micro-averaged F1 score.

    Parameters
    ----------
    clf : estimator
        Passed unchanged to ``model_selection.cross_val_score``.
    X : feature matrix
    y : target values
    cv : optional
        Forwarded to ``cross_val_score`` as its ``cv`` argument.
        Defaults to 10, the value that was previously hard-coded.

    Returns
    -------
    None
        The result is only printed: the mean of the per-fold scores and,
        as the "+/-" figure, twice their standard deviation.
    """
    scores = model_selection.cross_val_score(clf, X, y, cv=cv, scoring='f1_micro')
    print("F1 score (number encoded data): %0.2f (+/- %0.2f)"
          % (scores.mean(), scores.std() * 2))

In [10]:
def f1_macro(clf, X, y, cv=10):
    """Cross-validate ``clf`` and print its macro-averaged F1 score.

    Parameters
    ----------
    clf : estimator
        Passed unchanged to ``model_selection.cross_val_score``.
    X : feature matrix
    y : target values
    cv : optional
        Forwarded to ``cross_val_score`` as its ``cv`` argument.
        Defaults to 10, the value that was previously hard-coded.

    Returns
    -------
    None
        The result is only printed: the mean of the per-fold scores and,
        as the "+/-" figure, twice their standard deviation.
    """
    scores = model_selection.cross_val_score(clf, X, y, cv=cv, scoring='f1_macro')
    print("F1 score (number encoded data): %0.2f (+/- %0.2f)"
          % (scores.mean(), scores.std() * 2))

## 1. Gradient Boosting

Score from the paper: 0.85

In [11]:
# Gradient Boosting classifier; only random_state is set (fixed to 0 so
# repeated runs use the same seed), every other hyperparameter is left at
# the library default.
clf = GradientBoosting(random_state=0)

In [12]:
# Print the cross-validated micro-averaged F1 score of the current `clf`.
f1_micro(clf, X, y)

F1 score (number encoded data): 0.85 (+/- 0.01)


## 2. Linear SVC

Score from the paper: 0.85

In [13]:
# Linear SVC classifier; only random_state is set (fixed to 0), every other
# hyperparameter is left at the library default. Rebinds the shared name `clf`.
clf = LinearSVC(random_state=0)

In [14]:
# Print the cross-validated micro-averaged F1 score of the current `clf`.
f1_micro(clf, X, y)

F1 score (number encoded data): 0.69 (+/- 0.02)


## 3. Logistic Regression

Score from the paper: 0.81

In [15]:
# Logistic Regression classifier; only random_state is set (fixed to 0),
# every other hyperparameter is left at the library default. The second line
# prints its cross-validated micro-averaged F1 score.
clf = LogisticRegression(random_state=0)
f1_micro(clf, X, y)

F1 score (number encoded data): 0.71 (+/- 0.02)


## 4. Random Forest

Score from the paper: 0.84

In [16]:
# Random Forest classifier; only random_state is set (fixed to 0), every
# other hyperparameter is left at the library default. The second line prints
# its cross-validated micro-averaged F1 score.
# NOTE(review): library defaults may differ between scikit-learn releases —
# consider pinning the hyperparameters (e.g. n_estimators) so the reported
# score stays reproducible; confirm against the installed version.
clf = RandomForest(random_state=0)
f1_micro(clf, X, y)

F1 score (number encoded data): 0.82 (+/- 0.01)
