In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import pickle

from sklearn import metrics, preprocessing, model_selection
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting, RandomForestClassifier as RandomForest
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

  from numpy.core.umath_tests import inner1d


In [2]:
# read data
# The separator regex matches a comma together with any whitespace around it, so
# padded fields are stripped while parsing; a regex separator requires the python
# parser engine. Cells equal to "*" are read as missing values, and the first CSV
# column is used as the row index.
filepath = "../data/adult_grouped.csv"
dataset = pd.read_csv(filepath, sep=r'\s*,\s*', na_values="*", engine='python', index_col=0)

In [3]:
# Preview the first rows of the frame as loaded from the CSV.
dataset.head()

Unnamed: 0_level_0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
NodeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,39,State-gov,3,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,3,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,2,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,2,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,3,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Non-US,<=50K


In [4]:
# Preprocessing
def number_encode_features(ds):
    """Label-encode every object-dtype column of ``ds``.

    Parameters
    ----------
    ds : pandas.DataFrame
        Input frame. It is copied, so the caller's frame is left untouched.

    Returns
    -------
    result : pandas.DataFrame
        Copy of ``ds`` in which each object-dtype column has been replaced by
        the integer codes produced by its encoder.
    encoders : dict
        Maps the name of each encoded column to its fitted ``LabelEncoder``.
    """
    result = ds.copy()
    encoders = {}
    for feature in result.columns:
        # `np.object` was a plain alias of the builtin `object`; it was deprecated
        # in NumPy 1.20 and removed in 1.24, so compare against the builtin.
        if result.dtypes[feature] == object:
            encoders[feature] = preprocessing.LabelEncoder()
            # Cast to str first so the encoder sees a single value type
            # (e.g. NaN becomes the string 'nan').
            result[feature] = encoders[feature].fit_transform(result[feature].astype(str))
    return result, encoders

# Keep the fitted encoders next to the encoded frame so the integer codes can be
# mapped back to their original labels.
dataset_encoded, encoders = number_encode_features(dataset)

In [5]:
# Preview the label-encoded frame: former string columns now hold integer codes.
dataset_encoded.head()

Unnamed: 0_level_0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
NodeID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,39,5,3,4,0,1,4,1,2174,0,40,1,0
1,50,4,3,2,3,0,4,1,0,0,13,1,0
2,38,2,2,0,5,1,4,1,0,0,40,1,0
3,53,2,2,2,5,0,2,1,0,0,40,1,0
4,28,2,3,2,9,5,2,0,0,0,40,0,0


In [6]:
# Target will be 'education-num'
# Features are all remaining columns of the label-encoded frame.
X = dataset_encoded.drop(columns=['education-num'])
y = dataset_encoded['education-num']

In [7]:
def f1_micro(clf, X, y):
    """Cross-validate ``clf`` on ``(X, y)`` and report its micro-averaged F1.

    Runs 10-fold cross-validation with ``scoring='f1_micro'``, prints the mean
    score together with twice the standard deviation over the folds, and
    returns the mean score.
    """
    fold_scores = model_selection.cross_val_score(clf, X, y, cv=10, scoring='f1_micro')
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2
    print("F1 score: %0.2f (+/- %0.2f)" % (mean_score, spread))
    return mean_score

In [8]:
# Mean cross-validated F1 per model name; pickled to disk at the end of the notebook.
scores = {}

## 1. Gradient Boosting

In [9]:
# Gradient Boosting
# score from paper: 0.66
# random_state is pinned; every other hyperparameter is left at the library default.
clf = GradientBoosting(random_state=0)

In [10]:
# Run the 10-fold cross-validation and record the mean micro-F1 for this model.
scores['Gradient Boosting'] = f1_micro(clf, X, y)

F1 score: 0.63 (+/- 0.02)


## 2. Random Forest

In [11]:
# Random Forest
# score from paper: 0.62
# random_state is pinned; every other hyperparameter is left at the library default.
# Rebinds the shared `clf` name and records the mean micro-F1 for this model.
clf = RandomForest(random_state=0)
scores['Random Forest'] = f1_micro(clf, X, y)

F1 score: 0.57 (+/- 0.02)


## 3. Linear SVC

In [12]:
# Linear SVC
# score from paper: 0.65
# random_state is pinned; every other hyperparameter is left at the library default.
clf = LinearSVC(random_state=0)

In [13]:
# NOTE(review): the returned mean is only displayed here, not stored in `scores`;
# the 'Linear SVC' entry is written later from the one-hot encoded run. Confirm
# that leaving out the label-encoded result is intentional.
f1_micro(clf, X, y)

F1 score: 0.40 (+/- 0.21)


0.40142370578849473

## 4. Logistic Regression

In [14]:
# Logistic Regression
# score from the paper: 0.60
# random_state is pinned; every other hyperparameter is left at the library default.
clf = LogisticRegression(random_state=0)

In [15]:
# Stored under a key that names the encoding, because the same model is scored
# again further down on one-hot encoded features.
scores['Logistic Regression number-encoded'] = f1_micro(clf, X, y)

F1 score: 0.52 (+/- 0.02)


### Binary features

In [16]:
# we can try with binary encoded features
# Target will be 'education-num'
# Start again from the raw (not label-encoded) frame so that the categorical
# columns are still available for one-hot encoding.
X = dataset.drop(columns=['education-num'])
y = dataset['education-num']

In [17]:
# One-hot encode the categorical columns (one indicator column per category value),
# then show the resulting shape.
X = pd.get_dummies(X)
X.shape

(30162, 49)

In [18]:
# List the generated column names to see which indicator columns exist.
X.columns

Index(['age', 'capital-gain', 'capital-loss', 'hours-per-week',
       'workclass_Federal-gov', 'workclass_Local-gov', 'workclass_Private',
       'workclass_Self-emp-inc', 'workclass_Self-emp-not-inc',
       'workclass_State-gov', 'workclass_Without-pay',
       'marital-status_Divorced', 'marital-status_Married-AF-spouse',
       'marital-status_Married-civ-spouse',
       'marital-status_Married-spouse-absent', 'marital-status_Never-married',
       'marital-status_Separated', 'marital-status_Widowed',
       'occupation_Adm-clerical', 'occupation_Armed-Forces',
       'occupation_Craft-repair', 'occupation_Exec-managerial',
       'occupation_Farming-fishing', 'occupation_Handlers-cleaners',
       'occupation_Machine-op-inspct', 'occupation_Other-service',
       'occupation_Priv-house-serv', 'occupation_Prof-specialty',
       'occupation_Protective-serv', 'occupation_Sales',
       'occupation_Tech-support', 'occupation_Transport-moving',
       'relationship_Husband', 'relatio

In [19]:
# Remove three indicator columns from the one-hot frame.
# NOTE(review): presumably each is the redundant half of a two-valued feature's
# dummy pair - confirm against the full column list.
X = X.drop(columns=['sex_Male', 'income_>50K', 'native-country_Non-US'])

In [20]:
# Linear SVC
# score from paper: 0.65
# Same configuration as the label-encoded run; only the feature encoding differs.
clf = LinearSVC(random_state=0)

In [21]:
# The one-hot encoded run is the one recorded under 'Linear SVC'.
scores['Linear SVC'] = f1_micro(clf, X, y)

F1 score: 0.43 (+/- 0.15)


In [22]:
# Logistic Regression
# score from the paper: 0.60
# Same configuration as the label-encoded run; only the feature encoding differs.
clf = LogisticRegression(random_state=0)

In [23]:
# Stored under its own key so it sits next to the number-encoded result.
scores['Logistic Regression binary'] = f1_micro(clf, X, y)

F1 score: 0.61 (+/- 0.02)


### Saving the results

In [24]:
filename = '../output/education-num/classification-res/adult_multiclass_full'

# Open, write and close in one step: the context manager closes the file even if
# pickling raises, and no open handle is left behind between cells.
with open(filename, 'wb') as outfile:
    pickle.dump(scores, outfile)