In [4]:
import csv
import pickle
import re
from sklearn.svm import LinearSVC

In [5]:
# Read the raw training and test splits into lists of rows.
# skipinitialspace=True makes the reader ignore the whitespace that follows each delimiter.
with open('data/adult.data') as train_file:
    train_data = [row for row in csv.reader(train_file, skipinitialspace=True)]
with open('data/adult.test') as test_file:
    test_data = [row for row in csv.reader(test_file, skipinitialspace=True)]

In [6]:
def parse(data):
    """Split raw rows into feature rows and binary labels.

    Rows with fewer than two fields (e.g. empty rows) are skipped. For every
    remaining row the last field is the label and the preceding fields are the
    features. A label of '>50K' or '>50K.' maps to 1; any other label maps to 0.

    The input is traversed exactly once, so one-shot iterables such as a
    ``csv.reader`` or a generator are accepted as well as lists.

    Args:
        data: iterable of rows, each row a sequence of strings.

    Returns:
        A tuple ``(X, y)`` where ``X`` is a list of feature rows and ``y`` is
        the list of 0/1 labels, aligned index by index with ``X``.
    """
    positive_labels = ('>50K', '>50K.')
    X, y = [], []
    for row in data:
        if len(row) <= 1:
            # Nothing to split into features + label.
            continue
        X.append(row[:-1])
        y.append(1 if row[-1] in positive_labels else 0)
    return X, y

In [11]:
class CategoricalEncoder:
    """Ordinal encoder for row-oriented data with mixed column types.

    Columns listed in ``categorical_features`` are replaced by the position of
    their value in the sorted list of values seen during ``fit``. Every other
    column is passed through ``non_categorical_converter`` (``int``).
    """

    def __init__(self, categorical_features):
        """Store the column indices that should be treated as categorical.

        Args:
            categorical_features: collection of zero-based column indices.
        """
        self.non_categorical_converter = int
        self.features = categorical_features

    def fit(self, items):
        """Learn the set of values of every categorical column.

        Args:
            items: iterable of rows (indexable sequences). It is materialized
                once, so a generator can be passed safely.

        Returns:
            ``self``, to allow chaining.
        """
        # Materialize first: the rows are scanned once per categorical column.
        items = list(items)
        # feature_map: column index -> sorted list of the distinct values seen.
        self.feature_map = {
            c: sorted(set(item[c] for item in items)) for c in self.features
        }
        return self

    def transform(self, items):
        """Encode rows using the categories learned by ``fit``.

        Args:
            items: iterable of rows with the same column layout used in ``fit``.

        Returns:
            A list of new rows (lists); categorical columns hold the integer
            position of the value in ``feature_map``, other columns hold the
            result of ``non_categorical_converter``.

        Raises:
            ValueError: if a categorical column contains a value that is not
                present in ``feature_map``.
        """
        categorical = set(self.features)
        # Build value -> position tables once per call instead of scanning the
        # category list for every single cell.
        positions = {}
        for column, values in self.feature_map.items():
            table = {}
            for position, value in enumerate(values):
                # setdefault keeps the first position, matching list.index.
                table.setdefault(value, position)
            positions[column] = table

        convert = self.non_categorical_converter
        output = []
        for item in items:
            new_item = []
            for i, value in enumerate(item):
                if i in categorical:
                    table = positions[i]
                    if value not in table:
                        raise ValueError(
                            'unseen category {!r} in column {}'.format(value, i)
                        )
                    new_item.append(table[value])
                else:
                    new_item.append(convert(value))
            output.append(new_item)
        return output

In [8]:
# Zero-based positions of the columns handed to CategoricalEncoder.
# The names follow the order of the `labels` list defined in this notebook.
categorical_features = [
    1,   # workclass
    3,   # education
    5,   # marital-status
    6,   # occupation
    7,   # relationship
    8,   # race
    9,   # sex
    13,  # native-country
]

In [13]:
# Split each raw split into feature rows and labels.
train_rows, train_y = parse(train_data)
test_rows, test_y = parse(test_data)

# Learn the category vocabularies from the training rows only, then apply the
# same encoding to both splits.
encoder = CategoricalEncoder(categorical_features=categorical_features).fit(train_rows)
train_X = encoder.transform(train_rows)
test_X = encoder.transform(test_rows)

In [14]:
# Train a linear SVM with default hyperparameters on the encoded training rows.
# NOTE(review): no random_state is passed (the repr below shows random_state=None),
# so the fitted weights may differ between runs — confirm whether a seed is wanted.
clf = LinearSVC()
clf.fit(train_X, train_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [49]:
def convert_labels(labels, feature_map):
    """Expand column labels into one label per category value.

    A label whose position is not a key of ``feature_map`` is kept unchanged.
    A label whose position is a key is replaced by one entry per value in
    ``feature_map[position]``, formatted as ``<label>__<value>`` where every
    character of the value outside ``[0-9a-zA-Z_]`` is replaced by ``_``.

    Args:
        labels: sequence of column names.
        feature_map: mapping from column position to its category values.

    Returns:
        A flat list of labels in the original column order.
    """
    expanded = []
    for position, name in enumerate(labels):
        if position in feature_map:
            for category in feature_map[position]:
                safe_category = re.sub('[^0-9a-zA-Z_]', '_', category)
                expanded.append('{}__{}'.format(name, safe_category))
        else:
            expanded.append(name)
    return expanded

In [25]:
# Column names, one per line, in the column order of the feature rows.
raw_labels = '''age
workclass
fnlwgt
education
education-num
marital-status
occupation
relationship
race
sex
capital-gain
capital-loss
hours-per-week
native-country'''
# Hyphens become underscores in the final names.
labels = [name.replace('-', '_') for name in raw_labels.splitlines()]

In [20]:
# Sanity check: one label per column of the encoded feature rows.
assert len(train_X[0]) == len(labels)

In [21]:
# Evaluate the fitted classifier on the encoded test split; the value is shown as the cell output.
clf.score(test_X, test_y)

0.78213868926970087

In [22]:
# Show the learned weights and the intercept of the fitted linear model.
print(clf.coef_)
print(clf.intercept_)

[[ -1.13092721e-03  -8.54246856e-04  -3.76566920e-06  -2.26396669e-03
   -8.44719676e-04  -1.47930310e-03  -1.03686808e-03  -1.41270445e-03
   -7.37885479e-04   1.23471751e-04   9.93503667e-05   3.51233836e-04
   -1.99181116e-03  -9.47669986e-03]]
[-0.00026464]


In [26]:
# Save converted data
# Feature files get a header row of column labels followed by one row per sample.
# Label files hold all labels of a split in a single CSV row.
# newline='' is required by the csv module when writing: without it, platforms
# that translate '\n' to '\r\n' would emit '\r\r\n' and produce blank rows.
with open('data/adult.data.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(labels)
    writer.writerows(train_X)
with open('data/adult.data.labels.csv', 'w', newline='') as f:
    csv.writer(f).writerow(train_y)
with open('data/adult.test.csv', 'w', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(labels)
    writer.writerows(test_X)
with open('data/adult.test.labels.csv', 'w', newline='') as f:
    csv.writer(f).writerow(test_y)

In [24]:
# Save classifier
# Serialize the fitted classifier with pickle; the file is opened in binary mode.
# NOTE(review): nothing here creates the target directory — presumably it must already exist.
with open('classifiers/examples/adult_svm.pickle', 'wb') as f:
    pickle.dump(clf, f)