In [1]:
import numpy as np
import pandas as pd

In [2]:
# Where we had our Scala program dump all the training data.
csv_file_name = "../data/all-candidates-4-10-14-10.csv"

# The names of the columns in the CSV file. Should update if more features become available.
feature_count = 10
col_names = ['entity_id', 'mention', 'mention_start', 'mention_end', 'mention_length',
             'featureStartFlag']
feature_names = ['f%d' % f for f in range(feature_count)]
col_names += feature_names
col_names += ['featureEndFlag', 'relevant']

#train_data = pd.read_csv(csv_file_name, true_values=['true'], false_values=['false'],
#                        names=col_names)

In [3]:
X_raw = []
y_raw = []

with open(csv_file_name, 'r') as f:
    bad_lines = 0
    for line_number, line in enumerate(f.readlines()):
        if bad_lines > 10:
            print("Too many bad lines. The parsing code is bugged or the "
                  "CSV is badly formatted. Get yo shit together fam.")
            break
            
        parts = line[:-1].split(",")
        if len(parts) != 18:
            bad_lines += 1
            print("Skipping bad line")
            continue
        
        # Example CSV line, as of May 10.
        # 364646, lumet familt, 7, 19, 12, featureStart, [feature_count features], featureEnd, true
        try:
            # Not used at the moment.
            # meta = parts[:5]
            features = parts[6:6 + feature_count]
            label = parts[-1]
            features_np = np.array([float(f) for f in features])
            X_raw.append(features_np)
            y_raw.append(1 if label == 'true' else 0)
        except ValueError as e:
            print("Could not parse data line {0}: {1}.".format(line_number, line))
            print(e)
            bad_lines += 1
            
    X_raw = np.array(X_raw)
    y_raw = np.array(y_raw)
        
    if bad_lines > 0:
        print("Bad lines: {0}".format(bad_lines))
    else:
        print("Import was successful.")
        
    print("Feature shape: {0}".format(X_raw.shape))
    print("Label shape: {0}".format(y_raw.shape))
    
# Hacky imputation.
X_raw[np.isnan(X_raw)] = 0.0
# TODO(andrei): Maybe set this to a very large constant.
X_raw[np.isinf(X_raw)] = 0.0

Import was successful.
Feature shape: (43548, 10)
Label shape: (43548,)


In [5]:
# old_opts = np.get_printoptions()
# np.set_printoptions(threshold=np.nan)

print("Raw feature max vals: {0}".format(np.max(X_raw, axis=0)))
print("Raw feature min vals: {0}".format(np.min(X_raw, axis=0)))
print("Raw feature mean vals: {0}".format(np.mean(X_raw, axis=0)))
print("Raw feature stds: {0}".format(np.std(X_raw, axis=0)))

print("NaNs: {0}".format(np.sum(np.isnan(X_raw), axis=0)))
print("Infinities: {0}".format(np.sum(np.isinf(X_raw), axis=0)))

# np.set_printoptions(**old_opts)

Raw feature max vals: [  3.39000000e+08   1.00000000e+02   2.00000000e+01   2.00000000e+01
   1.57500000e+01   0.00000000e+00   2.35000000e+01   1.00000000e+00
   2.50000000e+01   4.20000000e+01]
Raw feature min vals: [ 24.   1.   0.   0.   0.   0.   0.   0.  13.  42.]
Raw feature mean vals: [  2.26494924e+07   9.34776568e+01   4.07127626e+00   3.98202806e+00
   1.40936438e-02   0.00000000e+00   6.49731911e+00   1.58556306e-01
   2.30570708e+01   4.20000000e+01]
Raw feature stds: [  4.93288879e+07   2.24788304e+01   2.10829459e+00   2.17877619e+00
   1.74351238e-01   0.00000000e+00   1.69935778e+00   2.20900887e-01
   2.74772698e+00   0.00000000e+00]
NaNs: [0 0 0 0 0 0 0 0 0 0]
Infinities: [0 0 0 0 0 0 0 0 0 0]


In [16]:
import sklearn
from sklearn import preprocessing

ranges = np.max(X_raw, axis=0) - np.min(X_raw, axis=0)
# Avoid divisions by zero
ranges[ranges == 0] = 1.0
X = (X_raw - np.mean(X_raw, axis=0)) / ranges
X = preprocessing.scale(X)
print(np.mean(X, axis=1))
print(np.std(X, axis=1))

# No scaling needed for y.
y = y_raw

X, y = sklearn.utils.shuffle(X, y)

[-0.77989972 -0.77989972 -0.17124558 ...,  0.58752018  0.24041326
 -0.05640984]
[ 1.93966831  1.93966831  0.37219353 ...,  1.16820207  0.68798874
  0.4059836 ]


In [21]:
pos_count = np.sum(y == 1)
neg_count = np.sum(y == 0)
print("We have {0} positive labels.".format(pos_count))
print("We have {0} negative labels.".format(neg_count))

We have 455 positive labels.
We have 43093 negative labels.


## Train/validation split with slicing wizardry

In [46]:
# Use a much smaller ratio of negative to positive samples in the validation
# set, for more accurate validation results.
VALIDATION_NEG_TO_POS_RATIO = 1

pos_count_valid = int(pos_count * 0.15)
neg_count_valid = pos_count_valid * VALIDATION_NEG_TO_POS_RATIO

# Indexes of positive rows to use for validation.
# This witchcraft isolates the indexes of the first 'pos_count_valid' rows
# with positive labels in the training data.
y_pos_ind = (y == 1)
y_pos_counts = np.cumsum(y_pos_ind)
y_pos_lim = np.where(y_pos_counts == (pos_count_valid + 1))[0][0]
y_pos_ind[y_pos_lim:] = False

# This bit does the same but for the first 'neg_count_valid' rows with negative
# labels.
y_neg_ind = (y == 0)
y_neg_counts = np.cumsum(y_neg_ind)
y_neg_lim = np.where(y_neg_counts == (neg_count_valid + 1))[0][0]
y_neg_ind[y_neg_lim:] = False

# Make sure that there's no overlap, which would signify that we messed something
# up with the slicing/indexing.
assert np.sum(y_pos_ind & y_neg_ind) == 0

X_valid = X[y_pos_ind | y_neg_ind]
y_valid = y[y_pos_ind | y_neg_ind]

X_train = X[~(y_pos_ind | y_neg_ind)]
y_train = y[~(y_pos_ind | y_neg_ind)]

# Just some manual extra checks.
# TODO(andrei): Label better or remove.
print(np.sum(y_pos_ind))
print(np.sum(y_neg_ind))
print("Training:")
print(X_train.shape)
print(y_train.shape)
print("Validation:")
print(X_valid.shape)
print(y_valid.shape)
print(pos_count_valid, neg_count_valid, pos_count_valid + neg_count_valid)

68
68
Training:
(43412, 10)
(43412,)
Validation:
(136, 10)
(136,)
68 68 136


## Linear SVC

In [63]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR


# The "class_weight='balanced'" param tells the SVM to give adaptive weights
# to the labels from each class, in order to account for imbalanced data.

scores = cross_val_score(SGDClassifier(class_weight='balanced'), X_train, y_train, cv=10)
# scores = cross_val_score(LinearSVC(class_weight='balanced'), X_train, y_train, cv=5)
print(scores)
print(np.mean(scores))

[ 0.70336251  0.68217411  0.45508982  0.7245509   0.75886688  0.69569224
  0.77931352  0.60668203  0.87373272  0.67119816]
0.695066288152


In [132]:
lin_clf = SGDClassifier(class_weight='balanced', alpha=0.5)
# lin_clf = LinearSVC() -> validation accuracy 0.5 !!
lin_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.5, average=False, class_weight='balanced', epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [165]:
score = lin_clf.score(X_valid, y_valid)
y_valid_predicted = lin_clf.predict(X_valid)

print("Validation data shape: {0}".format(X_valid.shape))
print("Validation data neg:pos ratio: {0}".format(VALIDATION_NEG_TO_POS_RATIO))
print("Validation accuracy: {0}".format(score))

print(confusion_matrix(y_valid, y_valid_predicted))

Validation data shape: (136, 10)
Validation data neg:pos ratio: 1
Validation accuracy: 0.7794117647058824
[[62  6]
 [24 44]]


### Grid search for Linear SVM (SGDClassifier)

In [91]:
from operator import itemgetter

def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("[{0}] Mean validation score: {1:.3f} (std: {2:.3f}): {3}".format(
              i,
              score.mean_validation_score,
              np.std(score.cv_validation_scores),
              score.parameters))

In [155]:
from sklearn.grid_search import GridSearchCV

pg_lin = {
    'alpha': [0.0005, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75],
    'loss': ['hinge', 'log'],
    'penalty': ['l1', 'l2', 'elasticnet'],
#     'n_iter': [5, 10, 25]
}

# Simple cross-validation doesn't work well out of the box, as our data is very imbalanced.
score_on_validation = lambda est, xx_ignore, yy_ignore: est.score(X_valid, y_valid)
gs_lin = GridSearchCV(lin_clf, pg_lin, cv=None, scoring=score_on_validation)

In [156]:
gs_lin.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.5, average=False, class_weight='balanced', epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2', 'elasticnet'], 'loss': ['hinge', 'log'], 'alpha': [0.0005, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function <lambda> at 0x112497510>, verbose=0)

In [157]:
report(gs_lin.grid_scores_, n_top=250)

[0] Mean validation score: 0.779 (std: 0.000): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.1}
[1] Mean validation score: 0.779 (std: 0.000): {'penalty': 'l1', 'loss': 'log', 'alpha': 0.1}
[2] Mean validation score: 0.777 (std: 0.003): {'penalty': 'l1', 'loss': 'hinge', 'alpha': 0.05}
[3] Mean validation score: 0.777 (std: 0.003): {'penalty': 'l2', 'loss': 'hinge', 'alpha': 0.5}
[4] Mean validation score: 0.775 (std: 0.003): {'penalty': 'l1', 'loss': 'hinge', 'alpha': 0.1}
[5] Mean validation score: 0.772 (std: 0.006): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.5}
[6] Mean validation score: 0.772 (std: 0.006): {'penalty': 'l2', 'loss': 'hinge', 'alpha': 0.05}
[7] Mean validation score: 0.772 (std: 0.006): {'penalty': 'elasticnet', 'loss': 'log', 'alpha': 0.1}
[8] Mean validation score: 0.770 (std: 0.007): {'penalty': 'l2', 'loss': 'log', 'alpha': 0.01}
[9] Mean validation score: 0.767 (std: 0.003): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.05}
[10] Mean 

## Nonlinear SVC

In [158]:
nonlinear_svm = SVC(kernel='rbf', class_weight='balanced', C=0.1)
nonlinear_svm_scores = cross_val_score(nonlinear_svm, X_train, y_train, cv=3)
print(nonlinear_svm_scores)
print(np.mean(nonlinear_svm_scores))

[ 0.73822412  0.72889554  0.74798434  0.72736697  0.72920986]
0.734336164025


In [159]:
nonlin_clf = nonlinear_svm
nonlin_clf.fit(X_train, y_train)

SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [164]:
score = nonlin_clf.score(X_valid, y_valid)
y_valid_predicted = nonlin_clf.predict(X_valid)

print("Validation data shape: {0}".format(X_valid.shape))
print("Validation data neg:pos ratio: {0}".format(VALIDATION_NEG_TO_POS_RATIO))
print("Validation accuracy: {0}".format(score))

print(confusion_matrix(y_valid, y_valid_predicted))

Validation data shape: (136, 10)
Validation data neg:pos ratio: 1
Validation accuracy: 0.7720588235294118
[[52 16]
 [15 53]]


### Grid search for kernelized SVM

In [167]:
pg_nonlin = {
    'C': [0.0005, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75],
}

gs_nonlin = GridSearchCV(nonlin_clf, pg_nonlin, cv=None, scoring=score_on_validation)

In [168]:
gs_nonlin.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVC(C=0.1, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.0005, 0.001, 0.01, 0.05, 0.1, 0.5, 0.75]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function <lambda> at 0x112497510>, verbose=0)

Top results from nonlinear SVM grid search (rbf kernel, default settings, checked on 15% validation data set, biased for 1:1 pos/neg labels.

```
[0] Mean validation score: 0.779 (std: 0.000): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.1}
[1] Mean validation score: 0.779 (std: 0.000): {'penalty': 'l1', 'loss': 'log', 'alpha': 0.1}
[2] Mean validation score: 0.777 (std: 0.003): {'penalty': 'l1', 'loss': 'hinge', 'alpha': 0.05}
[3] Mean validation score: 0.777 (std: 0.003): {'penalty': 'l2', 'loss': 'hinge', 'alpha': 0.5}
[4] Mean validation score: 0.775 (std: 0.003): {'penalty': 'l1', 'loss': 'hinge', 'alpha': 0.1}
[5] Mean validation score: 0.772 (std: 0.006): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.5}
[6] Mean validation score: 0.772 (std: 0.006): {'penalty': 'l2', 'loss': 'hinge', 'alpha': 0.05}
[7] Mean validation score: 0.772 (std: 0.006): {'penalty': 'elasticnet', 'loss': 'log', 'alpha': 0.1}
[8] Mean validation score: 0.770 (std: 0.007): {'penalty': 'l2', 'loss': 'log', 'alpha': 0.01}
[9] Mean validation score: 0.767 (std: 0.003): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.05}
[10] Mean validation score: 0.765 (std: 0.006): {'penalty': 'elasticnet', 'loss': 'log', 'alpha': 0.05}
```

In [170]:
report(gs_lin.grid_scores_, n_top=250)

[0] Mean validation score: 0.779 (std: 0.000): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.1}
[1] Mean validation score: 0.779 (std: 0.000): {'penalty': 'l1', 'loss': 'log', 'alpha': 0.1}
[2] Mean validation score: 0.777 (std: 0.003): {'penalty': 'l1', 'loss': 'hinge', 'alpha': 0.05}
[3] Mean validation score: 0.777 (std: 0.003): {'penalty': 'l2', 'loss': 'hinge', 'alpha': 0.5}
[4] Mean validation score: 0.775 (std: 0.003): {'penalty': 'l1', 'loss': 'hinge', 'alpha': 0.1}
[5] Mean validation score: 0.772 (std: 0.006): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.5}
[6] Mean validation score: 0.772 (std: 0.006): {'penalty': 'l2', 'loss': 'hinge', 'alpha': 0.05}
[7] Mean validation score: 0.772 (std: 0.006): {'penalty': 'elasticnet', 'loss': 'log', 'alpha': 0.1}
[8] Mean validation score: 0.770 (std: 0.007): {'penalty': 'l2', 'loss': 'log', 'alpha': 0.01}
[9] Mean validation score: 0.767 (std: 0.003): {'penalty': 'elasticnet', 'loss': 'hinge', 'alpha': 0.05}
[10] Mean 

## Old metrics code

In [None]:
def train_metrics(clf, X, y):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(average_precision_score(y, y_pred, average='micro'))
    print(average_precision_score(y, y_pred, average='macro'))
    print(average_precision_score(y, y_pred, average='weighted'))
#     print("F1:        {0:.4f}".format(f1_score(y, y_pred, pos_label='true\n')))
#     print("Precision: {0:.4f}".format(precision_score(y, y_pred)))
#     print("Accuracy:  {0:.4f}".format(accuracy_score(y, y_pred)))
    print("Confusion matrix:")
    print(confusion_matrix(y, y_pred, labels=None))

In [None]:
# pred_y = clf.predict(X)
#clf = SGDClassifier(class_weight='balanced')
train_metrics(LinearSVC(class_weight='balanced'), X, y_raw)

In [None]:
train_metrics(SVC(kernel='rbf', class_weight='balanced'), X, y_raw)