In [2]:
# Binary classification of the anonymized Adult datasets (target: 'income') with four classifiers:
# Gradient Boosting
# Linear SVC
# Logistic Regression
# Random Forest
# Anonymization parameter k takes the values {3, 7, 11, 15, 19}

In [3]:
import math
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Silence DeprecationWarnings before scikit-learn is imported.
warnings.filterwarnings("ignore", category=DeprecationWarning)

from sklearn import metrics, preprocessing, model_selection
from sklearn.ensemble import GradientBoostingClassifier as GradientBoosting, RandomForestClassifier as RandomForest
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

  from numpy.core.umath_tests import inner1d


In [4]:
def read_anon_data(filename, base_dir="../original_algorithm/output/"):
    """Load one anonymized CSV file into a DataFrame.

    Parameters
    ----------
    filename : str
        Name of the CSV file, relative to ``base_dir``.
    base_dir : str, optional
        Directory that holds the anonymized output files. The default is the
        location that used to be hard-coded in this function.

    Returns
    -------
    pandas.DataFrame
        The parsed table. Cells equal to ``"*"`` are read as missing values.
    """
    filepath = os.path.join(base_dir, filename)
    # The separator is a regex that swallows whitespace around each comma;
    # regex separators require the python parsing engine.
    return pd.read_csv(filepath, sep=r'\s*,\s*', na_values="*", engine='python', index_col=False)

In [4]:
# Load the k=3 anonymized file and preview its first rows.
dataset = read_anon_data("anonymized_equal_weights_k_3.csv")
dataset.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass,native-country,sex,race,relationship,occupation,marital-status,income
0,[39 - 44],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,Never-married,<=50K
1,[39 - 44],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,Never-married,<=50K
2,[39 - 44],13,[0 - 2174],0,40,State-gov,United-States,Male,White,Not-in-family,Adm-clerical,Never-married,<=50K
3,[50 - 56],13,0,0,[13 - 25],Self-emp-not-inc,United-States,Male,White,Husband,Exec-managerial,Married-civ-spouse,<=50K
4,[50 - 56],13,0,0,[13 - 25],Self-emp-not-inc,United-States,Male,White,Husband,Exec-managerial,Married-civ-spouse,<=50K


In [5]:
# Preprocessing
def number_encode_features(ds):
    """Return a copy of ``ds`` with every object-dtype column label-encoded.

    Each object column is cast to ``str`` and replaced by the integer codes
    of a freshly fitted ``LabelEncoder``. Columns of any other dtype are left
    as they are.

    Parameters
    ----------
    ds : pandas.DataFrame
        Input frame; it is copied, not modified.

    Returns
    -------
    result : pandas.DataFrame
        Encoded copy of ``ds``.
    encoders : dict
        Maps each encoded column name to the ``LabelEncoder`` fitted on it.
    """
    result = ds.copy()
    encoders = {}
    for feature in result.columns:
        # Compare against the builtin ``object``: the ``np.object`` alias was
        # deprecated in NumPy 1.20 and removed in 1.24.
        if result.dtypes[feature] == object:
            encoder = preprocessing.LabelEncoder()
            result[feature] = encoder.fit_transform(result[feature].astype(str))
            encoders[feature] = encoder
    return result, encoders

# Label-encode the object columns of the loaded frame; the fitted encoders are kept per column.
dataset_encoded, encoders = number_encode_features(dataset)

In [6]:
# Separate the predictors from the 'income' label.
# NOTE(review): dataset.head() lists an 'Unnamed: 0' column; if it is a real
# column it stays in X as a feature — confirm that is intended.
X = dataset_encoded.drop(labels='income', axis='columns')
y = dataset_encoded['income']

## k = 3
### Gradient Boosting

Expected f1 score: 0.62

In [7]:
# Gradient Boosting classifier; random_state is pinned to 0.
clf = GradientBoosting(random_state=0)

In [8]:
# 10-fold cross-validated F1 on the number-encoded data
scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, scoring='f1', cv=10)
f1_mean = scores.mean()
f1_spread = 2 * scores.std()
print("F1 score (number encoded data): %0.2f (+/- %0.2f)" % (f1_mean, f1_spread))

F1 score (number encoded data): 0.63 (+/- 0.03)


### Linear SVC

Expected F1 score: 0.59

In [9]:
# Linear SVC; random_state is pinned to 0.
# NOTE(review): the recorded run below scores 0.30 against an expected 0.59 and
# emits precision warnings. The inputs are unscaled label codes, so feature
# scaling may be worth trying — confirm.
clf = LinearSVC(random_state=0)

In [10]:
# 10-fold cross-validated F1 on the number-encoded data
scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, scoring='f1', cv=10)
f1_mean = scores.mean()
f1_spread = 2 * scores.std()
print("F1 score: %0.2f (+/- %0.2f)" % (f1_mean, f1_spread))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


F1 score: 0.30 (+/- 0.40)


### Logistic Regression

Expected F1 score: 0.59

In [11]:
# Logistic Regression; random_state is pinned to 0.
# NOTE(review): the recorded run below scores 0.34 against an expected 0.59;
# the inputs are unscaled label codes — confirm whether that explains the gap.
clf = LogisticRegression(random_state=0)

In [12]:
# 10-fold cross-validated F1 on the number-encoded data
scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, scoring='f1', cv=10)
f1_mean = scores.mean()
f1_spread = 2 * scores.std()
print("F1 score: %0.2f (+/- %0.2f)" % (f1_mean, f1_spread))

F1 score: 0.34 (+/- 0.04)


### Random Forest

Expected F1 score: 0.56

In [13]:
# Random Forest classifier; random_state is pinned to 0.
clf = RandomForest(random_state=0)

In [14]:
# 10-fold cross-validated F1 on the number-encoded data
scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, scoring='f1', cv=10)
f1_mean = scores.mean()
f1_spread = 2 * scores.std()
print("F1 score: %0.2f (+/- %0.2f)" % (f1_mean, f1_spread))

F1 score: 0.58 (+/- 0.06)


## k = 7

In [7]:
# Load the k=7 anonymized file and preview its first rows.
dataset = read_anon_data("anonymized_equal_weights_k_7.csv")
dataset.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,workclass,native-country,sex,race,relationship,occupation,marital-status,income
0,[25 - 43],[10 - 13],[0 - 2174],0,[25 - 60],,United-States,Male,White,Not-in-family,Other,Never-married,<=50K
1,[25 - 43],[10 - 13],[0 - 2174],0,[25 - 60],,United-States,Male,White,Not-in-family,Other,Never-married,<=50K
2,[25 - 43],[10 - 13],[0 - 2174],0,[25 - 60],,United-States,Male,White,Not-in-family,Other,Never-married,<=50K
3,[25 - 43],[10 - 13],[0 - 2174],0,[25 - 60],,United-States,Male,White,Not-in-family,Other,Never-married,<=50K
4,[25 - 43],[10 - 13],[0 - 2174],0,[25 - 60],,United-States,Male,White,Not-in-family,Other,Never-married,<=50K


In [8]:
# Preprocessing
def number_encode_features(ds):
    """Return a copy of ``ds`` with every object-dtype column label-encoded.

    Each object column is cast to ``str`` and replaced by the integer codes
    of a freshly fitted ``LabelEncoder``. Columns of any other dtype are left
    as they are.

    Parameters
    ----------
    ds : pandas.DataFrame
        Input frame; it is copied, not modified.

    Returns
    -------
    result : pandas.DataFrame
        Encoded copy of ``ds``.
    encoders : dict
        Maps each encoded column name to the ``LabelEncoder`` fitted on it.
    """
    result = ds.copy()
    encoders = {}
    for feature in result.columns:
        # Compare against the builtin ``object``: the ``np.object`` alias was
        # deprecated in NumPy 1.20 and removed in 1.24.
        if result.dtypes[feature] == object:
            encoder = preprocessing.LabelEncoder()
            result[feature] = encoder.fit_transform(result[feature].astype(str))
            encoders[feature] = encoder
    return result, encoders

# Label-encode the object columns of the loaded frame; the fitted encoders are kept per column.
dataset_encoded, encoders = number_encode_features(dataset)

In [9]:
# Separate the predictors from the 'income' label.
# NOTE(review): dataset.head() lists an 'Unnamed: 0' column; if it is a real
# column it stays in X as a feature — confirm that is intended.
X = dataset_encoded.drop(labels='income', axis='columns')
y = dataset_encoded['income']

### Gradient Boosting
Expected f1 score: 0.57

In [10]:
# Gradient Boosting classifier; random_state is pinned to 0.
clf = GradientBoosting(random_state=0)

In [11]:
# 10-fold cross-validated F1 on the number-encoded data
scores = model_selection.cross_val_score(estimator=clf, X=X, y=y, scoring='f1', cv=10)
f1_mean = scores.mean()
f1_spread = 2 * scores.std()
print("F1 score (number encoded data): %0.2f (+/- %0.2f)" % (f1_mean, f1_spread))

F1 score (number encoded data): 0.44 (+/- 0.29)
