In [1]:
# This notebook is only for storing the results for Linear SVC (that uses binary features which take up a lot of memory)
import pandas as pd
import matplotlib.pyplot as plt
import math
import numpy as np
import pickle
import os.path

from sklearn import metrics, preprocessing, model_selection
from sklearn.svm import LinearSVC

In [2]:
def read_anon_data(filename, directory="../output/marital-status/"):
    """Load one anonymized CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, relative to ``directory``.
    directory : str, optional
        Folder that holds the file. Defaults to the location that used to be
        hard-coded, so existing one-argument calls read the same file as before.

    Returns
    -------
    pandas.DataFrame
        The parsed file. Fields are split on commas with any surrounding
        whitespace removed, and a field consisting of ``*`` is read as missing.
        ``index_col=False`` keeps the first column as regular data instead of
        turning it into the index.
    """
    # os.path.join copes with a directory given with or without a trailing slash
    filepath = os.path.join(directory, filename)
    # a regular-expression separator is only supported by the python parsing engine
    return pd.read_csv(filepath, sep=r'\s*,\s*', na_values="*", engine='python', index_col=False)

In [3]:
# k is embedded in the name of the anonymized CSV that gets loaded and in the name of
# the pickled results file, so changing it switches both input and output.
# Presumably the k of k-anonymity -- TODO confirm against the anonymization step.
k = 3

In [4]:
# Assemble the name of the k-specific anonymized file, load it, and preview the first rows.
anon_csv_name = "anonymized_equal_weights_k_" + str(k) + ".csv"
dataset = read_anon_data(anon_csv_name)
dataset.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass,native-country,sex,race,relationship,occupation,income,marital-status
0,[39 - 42],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K,Never-married
1,[39 - 42],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K,Never-married
2,[39 - 42],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K,Divorced
3,[47 - 51],13,0,0,[13 - 30],Self-emp-not-inc,United-States,Male,White,Husband,Exec-managerial,<=50K,Married-civ-spouse
4,[47 - 51],13,0,0,[13 - 30],Self-emp-not-inc,United-States,Male,White,Husband,Exec-managerial,<=50K,Married-civ-spouse


In [5]:
# Scoring
def f1_micro(clf, X, y, cv=10):
    """Cross-validate ``clf`` and report its micro-averaged F1 score.

    Parameters
    ----------
    clf : estimator
        Passed unchanged to ``model_selection.cross_val_score``.
    X : feature matrix handed to ``cross_val_score``.
    y : target labels handed to ``cross_val_score``.
    cv : optional
        Cross-validation setting forwarded to ``cross_val_score``. Defaults to
        10, the value that was previously hard-coded.

    Returns
    -------
    The mean of the per-fold scores.

    Also prints the mean together with twice the standard deviation of the
    per-fold scores.
    """
    fold_scores = model_selection.cross_val_score(clf, X, y, cv=cv, scoring='f1_micro')
    mean_score = fold_scores.mean()
    # the "+/-" figure is two standard deviations of the fold scores
    print("F1 score: %0.2f (+/- %0.2f)"
          % (mean_score, fold_scores.std() * 2))
    return mean_score

In [6]:
# Linear SVC - binary attributes needed
# (the features are converted with pd.get_dummies further down, before this classifier
# is scored). random_state is pinned to 0 so every run uses the same seed.
clf = LinearSVC(random_state=0)

In [7]:
# we can try with binary encoded features
# Target will be 'marital-status'
y = dataset['marital-status']
# 'Unnamed: 0' is loaded as an ordinary column (read_anon_data uses index_col=False) and
# in the preview it simply counts 0, 1, 2, ... -- it looks like a saved row index, not an
# attribute of the person. Leaving it in would hand the classifier a row identifier as a
# numeric feature, so it is dropped together with the target.
# errors='ignore' keeps the cell working for files that do not carry that column.
X = dataset.drop(columns=['marital-status', 'Unnamed: 0'], errors='ignore')
X.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass,native-country,sex,race,relationship,occupation,income
0,[39 - 42],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K
1,[39 - 42],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K
2,[39 - 42],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,<=50K
3,[47 - 51],13,0,0,[13 - 30],Self-emp-not-inc,United-States,Male,White,Husband,Exec-managerial,<=50K
4,[47 - 51],13,0,0,[13 - 30],Self-emp-not-inc,United-States,Male,White,Husband,Exec-managerial,<=50K


In [8]:
# Convert the non-numeric columns into dummy/indicator columns so the classifier
# receives numeric input only; X is rebound to the encoded frame.
X = pd.get_dummies(X)
# bare last expression: show the encoded frame's (rows, columns)
X.shape

(30162, 2732)

In [9]:
# Cross-validated micro-F1 of the Linear SVC on the encoded features; f1_micro prints
# the mean with its spread and returns the mean, which is kept for pickling below.
score = f1_micro(clf, X, y)

F1 score: 0.80 (+/- 0.03)


In [10]:
# unpickle
# Load the scores already saved for this k, or start with an empty dict when no
# results file exists yet.
# NOTE: pickle.load can execute arbitrary code -- only read result files that this
# project wrote itself.
file_path = '../output/marital-status/classification-res/adult_multiclass_k' + str(k)
if os.path.exists(file_path):
    # the context manager closes the file even when unpickling raises
    with open(file_path, 'rb') as infile:
        scores = pickle.load(infile)
else:
    scores = {}

In [11]:
# add LinearSVC score
# (stored under the 'Linear SVC' key; a value already saved under that key is replaced)
scores['Linear SVC'] = score

In [12]:
# pickle
# Write the updated scores dict back to the per-k results file.
filename = '../output/marital-status/classification-res/adult_multiclass_k' + str(k)
# make sure the results folder exists so the write does not fail on a fresh checkout
os.makedirs(os.path.dirname(filename), exist_ok=True)
# the context manager flushes and closes the file even when pickling raises
with open(filename, 'wb') as outfile:
    pickle.dump(scores, outfile)