In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import pickle

from sklearn import metrics, preprocessing, model_selection
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting, RandomForestClassifier as RandomForest
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

  from numpy.core.umath_tests import inner1d


In [2]:
def read_anon_data(filename, base_dir="../output/education-num/"):
    """Load one anonymized CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``base_dir``.
    base_dir : str, optional
        Directory that holds the anonymized files. Defaults to the
        ``education-num`` output folder used throughout this notebook.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Whitespace around the comma separators is dropped
        and cells containing ``"*"`` are read as missing values (NaN).
    """
    import os  # local import: only needed here to build the path

    filepath = os.path.join(base_dir, filename)
    # The regex separator swallows blanks around each comma, which requires
    # the python parsing engine. index_col=False keeps pandas from using the
    # first column as the index.
    return pd.read_csv(
        filepath,
        sep=r'\s*,\s*',
        na_values="*",
        engine='python',
        index_col=False,
    )

In [3]:
# Value of k for this run; it selects both the input file name and the name
# of the pickled results file.
# NOTE(review): presumably the k-anonymity level of the anonymized data — confirm.
k = 100

In [4]:
# Load the anonymized dataset that matches the current k and preview its first rows.
dataset = read_anon_data("anonymized_emph_age_weights_k_" + str(k) + ".csv")
dataset.head()

Unnamed: 0,age,workclass,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,,3,,,,White,Male,[0 - 6849],[0 - 1974],[18 - 99],United-States,<=50K
1,39,,3,,,,White,Male,[0 - 6849],[0 - 1974],[18 - 99],United-States,<=50K
2,39,,3,,,,White,Male,[0 - 6849],[0 - 1974],[18 - 99],United-States,<=50K
3,39,,3,,,,White,Male,[0 - 6849],[0 - 1974],[18 - 99],United-States,<=50K
4,39,,2,,,,White,Male,[0 - 6849],[0 - 1974],[18 - 99],United-States,<=50K


In [5]:
# Preprocessing
def number_encode_features(ds):
    """Label-encode every text column of ``ds`` as integers.

    Parameters
    ----------
    ds : pandas.DataFrame
        Input frame; it is copied, not modified.

    Returns
    -------
    result : pandas.DataFrame
        Copy of ``ds`` in which each object/string column is replaced by the
        output of its own ``LabelEncoder``. Other columns are left unchanged.
    encoders : dict
        Maps each encoded column name to the ``LabelEncoder`` fitted on it.
    """
    result = ds.copy()
    encoders = {}
    for feature in result.columns:
        column = result[feature]
        # `np.object` was removed in NumPy 1.24, so the dtype is tested
        # through pandas instead. The string check also catches pandas'
        # dedicated string dtypes, which do not compare equal to `object`.
        is_text = (pd.api.types.is_object_dtype(column)
                   or pd.api.types.is_string_dtype(column))
        if is_text:
            encoders[feature] = preprocessing.LabelEncoder()
            # astype(str) makes every cell a string before fitting, so
            # missing values become an ordinary label of their own.
            result[feature] = encoders[feature].fit_transform(column.astype(str))
    return result, encoders

# Encode the whole dataset; the fitted encoders are kept next to the encoded frame.
dataset_encoded, encoders = number_encode_features(dataset)

In [6]:
# Target will be 'education-num'
# y is the target column; X is every remaining column of the encoded frame.
# NOTE(review): the preview of `dataset` shows an "Unnamed: 0" column; if it is a
# real column (a row counter), it stays in X as a feature — confirm that is intended.
y = dataset_encoded['education-num']
X = dataset_encoded.drop('education-num', axis=1)

In [8]:
# Scoring
def f1_micro(clf, X, y, cv=10):
    """Cross-validate ``clf`` and report its mean micro-averaged F1 score.

    Parameters
    ----------
    clf : estimator
        Classifier handed to ``model_selection.cross_val_score``.
    X, y
        Features and target, forwarded unchanged to ``cross_val_score``.
    cv : int or cross-validation splitter, optional
        Forwarded as the ``cv`` argument of ``cross_val_score``. Defaults to
        10, the value this notebook always used.

    Returns
    -------
    float
        Mean of the per-fold scores. The mean and twice the standard
        deviation across folds are also printed.
    """
    fold_scores = model_selection.cross_val_score(clf, X, y, cv=cv, scoring='f1_micro')
    # Report the mean together with a two-standard-deviation spread.
    print("F1 score: %0.2f (+/- %0.2f)"
          % (fold_scores.mean(), fold_scores.std() * 2))
    return fold_scores.mean()

In [9]:
# Mean F1 score per model name; filled in by the scoring cells and pickled at the end.
scores = {}

## 1. Gradient Boosting

In [10]:
# Gradient Boosting
# score from the paper:
# GradientBoosting is the import alias of sklearn's GradientBoostingClassifier;
# random_state pins the estimator's random seed.
clf = GradientBoosting(random_state=0)

In [None]:
# Cross-validated micro-F1 of the current clf on X and y; the mean is stored under the model's name.
scores['Gradient Boosting'] = f1_micro(clf, X, y)

## 2. Random Forest

In [None]:
# Random Forest
# score from the paper:
# RandomForest is the import alias of sklearn's RandomForestClassifier;
# random_state pins the estimator's random seed.
clf = RandomForest(random_state=0)

In [None]:
# Cross-validated micro-F1 of the current clf on X and y; the mean is stored under the model's name.
scores['Random Forest'] = f1_micro(clf, X, y)

## 3. Logistic Regression

In [None]:
# Logistic Regression
# score from the paper:
# random_state pins the estimator's random seed.
clf = LogisticRegression(random_state=0)

In [None]:
# Stored under a key that marks the feature encoding, because logistic regression
# is scored again on the binary features later in the notebook.
scores['Logistic Regression number-encoded'] = f1_micro(clf, X, y)

## 4. Linear SVC

In [None]:
# Linear SVC - binary attributes needed
# score from the paper:
# random_state pins the estimator's random seed.
clf = LinearSVC(random_state=0)

In [None]:
# Linear SVC on the current X and y. The score is printed and shown as the cell
# output but is NOT stored in `scores`; only the binary-feature run later in the
# notebook is recorded under 'Linear SVC'.
f1_micro(clf, X, y)

### Binary features

In [None]:
# Remove the number-encoded objects before X and y are rebuilt from the raw dataset.
del y, X, dataset_encoded, encoders

In [None]:
# we can try with binary encoded features
# Target will be 'education-num'
# Start again from the raw `dataset` (not the label-encoded copy) and preview the features.
y = dataset['education-num']
X = dataset.drop('education-num', axis=1)
X.head()

In [None]:
# Expand the features into indicator (dummy) columns with pandas, then show the
# resulting (rows, columns).
# NOTE(review): by default get_dummies leaves numeric columns untouched, so any
# column that was read as all-NaN would pass through as NaN — confirm none remain.
X = pd.get_dummies(X)
X.shape

In [None]:
# Logistic Regression
# score from the paper: 0.53, 0.47
# random_state pins the estimator's random seed.
clf = LogisticRegression(random_state=0)

In [None]:
# Cross-validated micro-F1 of logistic regression on the dummy-encoded X.
scores['Logistic Regression binary'] = f1_micro(clf, X, y)

In [None]:
# Linear SVC - binary attributes needed
# score from the paper: 0.62, 0.59
# random_state pins the estimator's random seed.
clf = LinearSVC(random_state=0)

In [None]:
# Cross-validated micro-F1 of Linear SVC on the dummy-encoded X; this is the only
# Linear SVC result that ends up in `scores`.
scores['Linear SVC'] = f1_micro(clf, X, y)

### Saving scores

In [None]:
# Pickle the collected scores to a results file named after the current k.
filename = '../output/education-num/classification-res/adult_multiclass_emph_age_k' + str(k)
# Open, write and close in a single cell: the context manager closes the file
# even if pickling raises, so no open handle is left behind when a later cell
# is skipped or fails.
with open(filename, 'wb') as outfile:
    pickle.dump(scores, outfile)