In [None]:
# Multiclass classification on the anonymized Adult dataset, predicting 'marital-status' with 4 classifiers:
# - Gradient Boosting
# - Linear SVC
# - Logistic Regression
# - Random Forest
# k = {3, 7, 11, 15, 19, 23, 27, 31, 35, 100}; the cells below load the k_15 file only.

In [2]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn import metrics, preprocessing, model_selection
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting, RandomForestClassifier as RandomForest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [3]:
def read_anon_data(filename, directory="../output/marital-status/"):
    """Load one anonymized dataset CSV into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file inside ``directory``.
    directory : str or path-like, optional
        Folder that holds the CSV. Defaults to the location that used to be
        hard-coded here, so existing one-argument calls read the same file.

    Returns
    -------
    pandas.DataFrame
        The parsed file; cells equal to ``"*"`` are read as missing values.
    """
    filepath = os.path.join(directory, filename)
    # The separator is a regex that swallows whitespace around each comma;
    # regex separators are only supported by the python parsing engine.
    return pd.read_csv(filepath, sep=r'\s*,\s*', na_values="*", engine='python', index_col=False)

In [4]:
# Load the equal-weights anonymized file for k = 15 (per the file name) and preview its first rows
dataset = read_anon_data("anonymized_equal_weights_k_15.csv")
dataset.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass,native-country,sex,race,relationship,occupation,income,marital-status
0,[22 - 55],[9 - 13],[0 - 2174],0,[37 - 47],State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K,Never-married
1,[22 - 55],[9 - 13],[0 - 2174],0,[37 - 47],State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K,Never-married
2,[22 - 55],[9 - 13],[0 - 2174],0,[37 - 47],State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K,Divorced
3,[22 - 55],[9 - 13],[0 - 2174],0,[37 - 47],State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K,Never-married
4,[22 - 55],[9 - 13],[0 - 2174],0,[37 - 47],State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K,Never-married


In [5]:
# Preprocessing
def number_encode_features(ds):
    """Label-encode every text column of ``ds`` as integer codes.

    Parameters
    ----------
    ds : pandas.DataFrame
        Input frame; it is copied, not modified.

    Returns
    -------
    result : pandas.DataFrame
        Copy of ``ds`` in which each object/string column is replaced by the
        integer codes produced by its own ``LabelEncoder``. Other columns are
        left untouched.
    encoders : dict
        Maps each encoded column name to the fitted ``LabelEncoder``.
    """
    result = ds.copy()
    encoders = {}
    for feature in result.columns:
        dtype = result.dtypes[feature]
        # The `np.object` alias was removed from NumPy (1.24), so the dtype is
        # tested with pandas' own predicates; these also recognise pandas'
        # dedicated string dtypes, not just generic object columns.
        if pd.api.types.is_object_dtype(dtype) or pd.api.types.is_string_dtype(dtype):
            encoder = preprocessing.LabelEncoder()
            # Values are cast to str first so the encoder sees a single type
            result[feature] = encoder.fit_transform(result[feature].astype(str))
            encoders[feature] = encoder
    return result, encoders

# Encode the loaded frame; `encoders` keeps the fitted LabelEncoder of every encoded column
dataset_encoded, encoders = number_encode_features(dataset)

In [6]:
# 'marital-status' is the label to predict; every remaining column is used as a feature
y = dataset_encoded['marital-status']
X = dataset_encoded.drop(columns='marital-status')

In [10]:
# Scoring
def f1_micro(clf, X, y):
    """Print the cross-validated micro-averaged F1 score of ``clf``.

    Runs ``cross_val_score`` with ``cv=10`` and ``scoring='f1_micro'`` on the
    given features ``X`` and labels ``y``, then prints the mean of the
    per-fold scores together with twice their standard deviation.
    Nothing is returned.
    """
    fold_scores = model_selection.cross_val_score(clf, X, y, cv=10, scoring='f1_micro')
    mean_score = fold_scores.mean()
    spread = fold_scores.std() * 2  # shown as "+/-": two standard deviations
    summary = "F1 score (number encoded data): %0.2f (+/- %0.2f)" % (mean_score, spread)
    print(summary)

In [11]:
# Gradient Boosting
# random_state is fixed so repeated runs use the same seed
clf = GradientBoosting(random_state=0)

In [12]:
# Cross-validated micro-F1 of the Gradient Boosting classifier currently bound to `clf`
f1_micro(clf, X, y)

F1 score (number encoded data): 0.78 (+/- 0.06)


In [13]:
# Linear SVC
# NOTE(review): the features are integer label codes and no scaling step is visible in this
# notebook; LinearSVC may converge poorly on such inputs — consider standardizing first (confirm).
clf = LinearSVC(random_state=0)

In [14]:
# Cross-validated micro-F1 of the Linear SVC classifier currently bound to `clf`
f1_micro(clf, X, y)

F1 score (number encoded data): 0.48 (+/- 0.27)


In [15]:
# Logistic Regression
clf = LogisticRegression(random_state=0)

In [16]:
# Cross-validated micro-F1 of the Logistic Regression classifier currently bound to `clf`
f1_micro(clf, X, y)

F1 score (number encoded data): 0.68 (+/- 0.05)


In [17]:
# Random Forest
clf = RandomForest(random_state=0)

In [18]:
# Cross-validated micro-F1 of the Random Forest classifier currently bound to `clf`
f1_micro(clf, X, y)

F1 score (number encoded data): 0.69 (+/- 0.09)
