In [85]:
import csv
from sklearn.svm import LinearSVC
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline

In [29]:
with open('data/adult.data') as f:
    train_data = list(csv.reader(f, skipinitialspace=True))
with open('data/adult.test') as f:
    test_data = list(csv.reader(f, skipinitialspace=True))

In [119]:
def parse(data):
    X = [m[:-1] for m in data if len(m) > 1]
    y = [1 if m[-1] == '>50K' or m[-1] == '>50K.' else 0 for m in data if len(m) > 1]
    assert(len(X) == len(y))
    return X, y

In [59]:
def convert_categorical(items, features, other_converter=int):
    feature_map = {c: sorted(list(set(item[c] for item in items))) for c in features}
    output = []
    for item in items:
        new_item = []
        for i in range(len(item)):
            if i in features:
                new_item.append(feature_map[i].index(item[i]))
            else:
                new_item.append(other_converter(item[i]))
        output.append(new_item)
    return output

In [44]:
categorical_features = [1, 3, 5, 6, 7, 8, 9, 13]

In [122]:
train_X, train_y = parse(train_data)
test_X, test_y = parse(test_data)

converted = convert_categorical(train_X + test_X, categorical_features)
train_X = converted[:len(train_X)]
test_X = converted[len(train_X):]

encoder = OneHotEncoder(categorical_features=categorical_features)
train_X = encoder.fit_transform(train_X)
test_X = encoder.transform(test_X)

In [123]:
clf = LinearSVC()
clf.fit(train_X, train_y)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [124]:
clf.score(test_X, test_y)

0.79067624838769113

In [128]:
print(clf.coef_)
print(clf.intercept_)

[[ -9.10930116e-05   2.17169032e-05   6.36478905e-06  -7.64528951e-08
   -3.38837324e-04   4.03661565e-05  -2.74332888e-05  -8.08005821e-06
   -1.12244049e-06  -3.90058220e-05  -5.36121301e-05  -2.34280166e-05
   -3.02644752e-06  -4.40023556e-06  -2.71793990e-05  -1.53867370e-05
   -1.38777075e-05  -1.41025090e-05   1.03171180e-04   2.68085088e-05
   -2.24926096e-04   7.88442848e-05  -8.84732324e-07   3.74743478e-05
   -2.24663216e-04  -1.82376976e-04   3.01254504e-06   4.57758013e-04
   -1.79659305e-05  -5.81182192e-04  -3.59192540e-05  -4.15209327e-05
   -9.11694645e-05  -9.46553384e-05  -1.20076228e-07  -4.44705247e-05
    1.40250063e-04  -4.44563867e-05  -6.25146561e-05  -3.67396424e-05
   -2.03138331e-04  -6.37499781e-06   7.89141468e-05   1.55207500e-05
   -1.71392007e-05  -4.31264110e-06  -2.77884270e-05   4.10263662e-04
   -3.80685224e-04  -3.82984798e-05  -2.76557190e-04  -1.66855410e-04
    5.39379129e-05  -6.36022031e-05  -5.02131369e-06  -4.62569568e-05
   -3.45875478e-06  

In [125]:
# Save converted data
with open('data/adult.data.csv', 'w') as f:
    csv.writer(f).writerows(train_X.toarray())
with open('data/adult.data.labels.csv', 'w') as f:
    csv.writer(f).writerow(train_y)
with open('data/adult.test.csv', 'w') as f:
    csv.writer(f).writerows(test_X.toarray())
with open('data/adult.test.labels.csv', 'w') as f:
    csv.writer(f).writerow(test_y)