In [52]:
from __future__ import print_function
import numpy as np

In [53]:
# Fix the seed of NumPy's global RNG so reruns of the notebook are repeatable.
# NOTE(review): this only helps the scikit-learn calls below if they fall back
# to the global NumPy RNG when no random_state is given — confirm.
np.random.seed(0xF00BA2)


In [54]:
# CSV file into which our Scala program dumped all the training data.
# The four "../" hops go: back to python, to main, to src, to root.
#
# Earlier dumps, kept for reference:
# Andrei: data only from Gerdaq-train-a
# csv_file_name = "../../../../data/all-candidates-4-10-14-10.csv"
# Andrei: data from both Gerdaq-train-a and b, but with fewer features
# csv_file_name = "../../../../data/all-candidates-5-14-13-25.csv"

# Currently used. Andrei: data from both A and B, and with more features!
# Feature groups (presumably CSV column numbers — TODO confirm):
#   Entity: 1; 3-10; 15-19; 23-24;
#   Mention-entity: 26-29
csv_file_name = "../../../../data/all-candidates-5-17-14-5.csv"

In [55]:
# Load our data loading and scaling utilities into the notebook namespace.
# NOTE(review): presumably this is what defines `load_training_data` and
# `rescale`, which later cells call without importing them — confirm.
%run '../data_util.py'

In [56]:
# Number of feature columns requested from the loader; it matches the second
# dimension of the "Feature shape" the loader reports.
FEATURE_COUNT = 20
# X_raw: unscaled feature matrix, y_raw: labels, both in file order.
X_raw, y_raw = load_training_data(csv_file_name, FEATURE_COUNT)

Import was successful.
Feature shape: (82127, 20)
Label shape: (82127,)


In [57]:
# Column-wise (per-feature) summary statistics of the unscaled data.
for stat_template, stat_fn in (
    ("Raw feature max vals: {0}", np.max),
    ("Raw feature min vals: {0}", np.min),
    ("Raw feature mean vals: {0}", np.mean),
    ("Raw feature stds: {0}", np.std),
):
    print(stat_template.format(stat_fn(X_raw, axis=0)))

print()
print("Unwanted value checks:")
# Per-feature count of entries that are NaN / infinite.
print("NaNs: {0}".format(np.isnan(X_raw).sum(axis=0)))
print("Infinities: {0}".format(np.isinf(X_raw).sum(axis=0)))

Raw feature max vals: [  3.39000000e+08   1.00000000e+02   2.53333333e+01   2.53333333e+01
   1.00000000e+04   0.00000000e+00   3.10000000e+01   1.00000000e+00
   2.50000000e+01   1.00000000e+00   1.00000000e+00   1.00000000e+00
   1.00000000e+00   1.00000000e+00   1.05000000e+01   1.70000000e+01
   2.50000000e+01   7.90000000e+01   1.00000000e+00   1.00000000e+00]
Raw feature min vals: [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.]
Raw feature mean vals: [  1.81894359e+07   9.40710241e+01   4.04707035e+00   3.95638653e+00
   1.02438591e+01   0.00000000e+00   6.60782541e+00   1.57812291e-01
   2.30326494e+01   6.39798172e-03   8.79772465e-01   4.53163184e-04
   6.01287434e-01   4.15894032e-02   7.11704056e-02   6.43480451e+00
   5.18667289e+00   1.69747586e+01   9.78713518e-03   4.07028997e-02]
Raw feature stds: [  4.15479096e+07   2.14905149e+01   2.04727469e+00   2.10104523e+00
   3.19649394e+02   0.00000000e+00   1.81093658e+00   2.22818689e-01
  

In [58]:
# Import the submodule explicitly: a bare `import sklearn` only exposes
# `sklearn.utils` if the package happens to import it internally.
import sklearn.utils

# Rescale with the project helper; only the first two of its five return
# values (features and labels) are needed here.
X_res, y_res, _, _, _ = rescale(X_raw, y_raw)
# Shuffle features and labels together so rows stay aligned. The validation
# split further down takes the *first* N rows of each class, so this shuffle is
# what makes that selection random.
X, y = sklearn.utils.shuffle(X_res, y_res)

[ 0.74871574 -0.12804783  0.07147374 ..., -0.09966897  0.09620482
 -0.11730227]
[ 3.67884292  2.1972016   1.71475578 ...,  1.14650541  0.9152049   1.1598036 ]


In [59]:
# Class balance of the shuffled labels (1 = positive, 0 = negative).
pos_count = (y == 1).sum()
neg_count = (y == 0).sum()
for count_template, label_count in (
    ("We have {0} positive labels.", pos_count),
    ("We have {0} negative labels.", neg_count),
):
    print(count_template.format(label_count))

We have 887 positive labels.
We have 81240 negative labels.


## Train/validation split with slicing wizardry

In [60]:
# Use a much smaller ratio of negative to positive samples in the validation
# set, for more accurate validation results.
VALIDATION_NEG_TO_POS_RATIO = 1
# Share of all positive rows that is moved into the validation set.
VALIDATION_POS_FRACTION = 0.15

pos_count_valid = int(pos_count * VALIDATION_POS_FRACTION)
neg_count_valid = pos_count_valid * VALIDATION_NEG_TO_POS_RATIO


def _first_n_with_label(labels, label, n):
    """Return a boolean mask marking the first `n` rows of `labels` equal to `label`.

    Args:
        labels: 1-D NumPy array of labels.
        label: The label value to select.
        n: Maximum number of matching rows to select, counted from the start.

    Returns:
        Boolean array with the same length as `labels`. If fewer than `n` rows
        match, all matching rows are selected.
    """
    matches = (labels == label)
    # cumsum gives every matching row its 1-based rank among the matches, so
    # "rank <= n" keeps exactly the first n of them. Unlike looking up the
    # index of match number n + 1, this also works when no such match exists.
    return matches & (np.cumsum(matches) <= n)


# Masks of the rows to use for validation: the first 'pos_count_valid' rows
# with positive labels and the first 'neg_count_valid' rows with negative
# labels (the data was shuffled beforehand).
y_pos_ind = _first_n_with_label(y, 1, pos_count_valid)
y_neg_ind = _first_n_with_label(y, 0, neg_count_valid)

# Make sure that there's no overlap, which would signify that we messed something
# up with the slicing/indexing.
assert np.sum(y_pos_ind & y_neg_ind) == 0
# Make sure each class really contributed as many rows as requested.
assert np.sum(y_pos_ind) == pos_count_valid
assert np.sum(y_neg_ind) == neg_count_valid

valid_mask = y_pos_ind | y_neg_ind

X_valid = X[valid_mask]
y_valid = y[valid_mask]

# Everything that was not picked for validation is training data.
X_train = X[~valid_mask]
y_train = y[~valid_mask]

# Just some manual extra checks.
# TODO(andrei): Label better or remove.
print(np.sum(y_pos_ind))
print(np.sum(y_neg_ind))
print("Training:")
print(X_train.shape)
print(y_train.shape)
print("Validation:")
print(X_valid.shape)
print(y_valid.shape)
print(pos_count_valid, neg_count_valid, pos_count_valid + neg_count_valid)

133
133
Training:
(81861, 20)
(81861,)
Validation:
(266, 20)
(266,)
133 133 266


## Linear SVC

In [61]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR


# The "class_weight='balanced'" param tells the SVM to give adaptive weights
# to the labels from each class, in order to account for imbalanced data.
# NOTE(review): no `scoring` is passed, so this presumably reports plain
# accuracy, which is hard to interpret with only about 1% positive rows — confirm.
cv_estimator = SGDClassifier(class_weight='balanced')
scores = cross_val_score(cv_estimator, X_train, y_train, cv=10)
# scores = cross_val_score(LinearSVC(class_weight='balanced'), X_train, y_train, cv=5)

# Per-fold scores first, then their mean.
print(scores)
print(scores.mean())

[ 0.90081837  0.89129107  0.77036766  0.82911934  0.93696555  0.795871
  0.78487662  0.78155162  0.78729383  0.75442883]
0.823258388442


Sample output when using data from train-A only (for comparison):
```
[ 0.59212345  0.69622294  0.93390143  0.69415016  0.73952096  0.71020502
  0.73324119  0.67396313  0.69746544  0.78225806]
0.725305177786
```

In [62]:
# Linear classifier trained with SGD on the hinge loss; class_weight='balanced'
# is used to account for the heavily imbalanced training labels.
lin_clf = SGDClassifier(class_weight='balanced', alpha=0.001, loss='hinge')
# Alternative that was tried: lin_clf = LinearSVC() -> validation accuracy 0.5 !!
lin_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.001, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [63]:
# Evaluate the fitted linear classifier on the held-out validation rows.
y_valid_predicted = lin_clf.predict(X_valid)
score = lin_clf.score(X_valid, y_valid)

validation_summary = [
    "Validation data shape: {0}".format(X_valid.shape),
    "Validation data neg:pos ratio: {0}".format(VALIDATION_NEG_TO_POS_RATIO),
    "Validation accuracy: {0}".format(score),
]
print("\n".join(validation_summary))

# scikit-learn convention: rows are true classes, columns predicted classes.
print(confusion_matrix(y_valid, y_valid_predicted))

Validation data shape: (266, 20)
Validation data neg:pos ratio: 1
Validation accuracy: 0.8421052631578947
[[115  18]
 [ 24 109]]


Sample from train-A only:

```
Validation data shape: (136, 10)
Validation data neg:pos ratio: 1
Validation accuracy: 0.7941176470588235
[[65  3]
 [25 43]]
```

### Grid search for Linear SVM (SGDClassifier)

In [64]:
from operator import itemgetter

def report(grid_scores, n_top=3):
    """Print the `n_top` best grid-search results, best first.

    Args:
        grid_scores: Sequence of score records (e.g. `GridSearchCV.grid_scores_`).
            Each record must expose `parameters`, `mean_validation_score` and
            `cv_validation_scores`, and hold the mean score at position 1.
        n_top: How many of the highest-scoring records to print.
    """
    line_template = "[{0}] Mean validation score: {1:.3f} (std: {2:.3f}): {3}"
    # Position 1 of each record is its mean validation score.
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, entry in enumerate(ranked[:n_top]):
        fold_spread = np.std(entry.cv_validation_scores)
        print(line_template.format(rank,
                                   entry.mean_validation_score,
                                   fold_spread,
                                   entry.parameters))

In [68]:
from sklearn.grid_search import GridSearchCV
from sklearn.externals.joblib import delayed

# Hyper-parameter grid searched for the SGD-trained linear models.
pg_lin = dict(
    alpha=[0.0005, 0.00075, 0.001, 0.005, 0.01, 0.015, 0.025, 0.05, 0.1, 0.5],
    loss=['hinge', 'log'],
    # loss=['hinge'],
    penalty=['l1', 'l2', 'elasticnet'],
    # The default iteration count is 5.
    # n_iter=[25],
)

# Simple cross-validation doesn't work well out of the box, as our data is very imbalanced.
# score_on_validation = lambda est, xx_ignore, yy_ignore: est.score(X_valid, y_valid)
def score_on_validation(est, xx_ignore, yy_ignore):
    """Scorer callable that rates `est` on the fixed validation set.

    The data handed in by the caller (`xx_ignore`, `yy_ignore`) is deliberately
    unused; the notebook-level `X_valid` / `y_valid` are scored instead.

    Returns:
        Whatever `est.score(X_valid, y_valid)` returns.
    """
    validation_score = est.score(X_valid, y_valid)
    return validation_score

# Grid search over pg_lin, starting from the lin_clf settings. cv=None keeps
# GridSearchCV's default splitting, but the custom scorer ignores the held-out
# fold and always scores on (X_valid, y_valid). n_jobs=-1 runs the fits in
# parallel.
gs_lin = GridSearchCV(lin_clf, pg_lin, cv=None, scoring=score_on_validation, n_jobs=-1)

In [69]:
# Run the search. The search was built with refit=True (see its repr), which
# is what lets gs_lin.predict() be called directly afterwards.
gs_lin.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SGDClassifier(alpha=0.001, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'alpha': [0.0005, 0.00075, 0.001, 0.005, 0.01, 0.015, 0.025, 0.05, 0.1, 0.5], 'loss': ['hinge', 'log'], 'penalty': ['l1', 'l2', 'elasticnet']},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function score_on_validation at 0x10fbd48c8>, verbose=0)

In [70]:
# n_top=250 exceeds the 60 grid points (10 alphas x 2 losses x 3 penalties),
# so every parameter combination gets listed.
report(gs_lin.grid_scores_, n_top=250)

[0] Mean validation score: 0.866 (std: 0.006): {'alpha': 0.0005, 'loss': 'hinge', 'penalty': 'l1'}
[1] Mean validation score: 0.855 (std: 0.012): {'alpha': 0.00075, 'loss': 'log', 'penalty': 'l1'}
[2] Mean validation score: 0.851 (std: 0.012): {'alpha': 0.005, 'loss': 'hinge', 'penalty': 'l2'}
[3] Mean validation score: 0.850 (std: 0.006): {'alpha': 0.001, 'loss': 'log', 'penalty': 'elasticnet'}
[4] Mean validation score: 0.846 (std: 0.015): {'alpha': 0.01, 'loss': 'hinge', 'penalty': 'l1'}
[5] Mean validation score: 0.846 (std: 0.000): {'alpha': 0.01, 'loss': 'log', 'penalty': 'elasticnet'}
[6] Mean validation score: 0.845 (std: 0.006): {'alpha': 0.015, 'loss': 'log', 'penalty': 'l1'}
[7] Mean validation score: 0.845 (std: 0.013): {'alpha': 0.005, 'loss': 'log', 'penalty': 'elasticnet'}
[8] Mean validation score: 0.845 (std: 0.012): {'alpha': 0.015, 'loss': 'log', 'penalty': 'l2'}
[9] Mean validation score: 0.843 (std: 0.015): {'alpha': 0.005, 'loss': 'log', 'penalty': 'l2'}
[10] Mean

Sample from train-A only:
    
```
[0] Mean validation score: 0.806 (std: 0.003): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.025}
[1] Mean validation score: 0.806 (std: 0.009): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.025}
[2] Mean validation score: 0.794 (std: 0.032): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.001}
[3] Mean validation score: 0.794 (std: 0.006): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.075}
[4] Mean validation score: 0.794 (std: 0.000): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.1}
[5] Mean validation score: 0.792 (std: 0.014): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.075}
[6] Mean validation score: 0.789 (std: 0.017): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.05}
[7] Mean validation score: 0.789 (std: 0.014): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.1}
[8] Mean validation score: 0.789 (std: 0.003): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.1}
[9] Mean validation score: 0.787 (std: 0.045): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.001}
[10] Mean validation score: 0.787 (std: 0.016): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.1}
[11] Mean validation score: 0.787 (std: 0.012): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.075}
[12] Mean validation score: 0.787 (std: 0.010): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.075}
[13] Mean validation score: 0.787 (std: 0.006): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.5}
[14] Mean validation score: 0.784 (std: 0.015): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.1}
[15] Mean validation score: 0.784 (std: 0.017): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.5}
[16] Mean validation score: 0.784 (std: 0.019): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.1}
[17] Mean validation score: 0.782 (std: 0.015): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.5}
[18] Mean validation score: 0.782 (std: 0.014): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.05}
[19] Mean validation score: 0.782 (std: 0.012): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.01}
[20] Mean validation score: 0.782 (std: 0.009): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.01}
[21] Mean validation score: 0.779 (std: 0.022): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.025}
[22] Mean validation score: 0.779 (std: 0.016): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.05}
[23] Mean validation score: 0.779 (std: 0.010): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.05}
[24] Mean validation score: 0.777 (std: 0.018): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.025}
[25] Mean validation score: 0.777 (std: 0.024): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.075}
[26] Mean validation score: 0.777 (std: 0.017): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.05}
[27] Mean validation score: 0.775 (std: 0.018): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.01}
[28] Mean validation score: 0.775 (std: 0.009): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.01}
[29] Mean validation score: 0.767 (std: 0.033): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.025}
[30] Mean validation score: 0.767 (std: 0.049): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.075}
[31] Mean validation score: 0.762 (std: 0.021): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.025}
[32] Mean validation score: 0.757 (std: 0.022): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.01}
[33] Mean validation score: 0.755 (std: 0.003): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.01}
[34] Mean validation score: 0.750 (std: 0.037): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.001}
[35] Mean validation score: 0.728 (std: 0.094): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.0005}
[36] Mean validation score: 0.716 (std: 0.050): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.001}
[37] Mean validation score: 0.708 (std: 0.072): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.001}
[38] Mean validation score: 0.708 (std: 0.062): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.0005}
[39] Mean validation score: 0.701 (std: 0.106): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.0005}
[40] Mean validation score: 0.686 (std: 0.139): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.05}
[41] Mean validation score: 0.686 (std: 0.132): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.5}
[42] Mean validation score: 0.672 (std: 0.121): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.0005}
[43] Mean validation score: 0.652 (std: 0.117): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.0005}
[44] Mean validation score: 0.652 (std: 0.100): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.0005}
[45] Mean validation score: 0.583 (std: 0.191): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.001}
[46] Mean validation score: 0.500 (std: 0.000): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.5}
[47] Mean validation score: 0.500 (std: 0.000): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.5}
```

In [17]:
# Confusion matrix of the grid search's predictions on the *training* data,
# i.e. on rows the model has already seen.
y_pred_lin = gs_lin.predict(X_train)
print(confusion_matrix(y_train, y_pred_lin))

[[70902 10205]
 [  147   607]]


Old training-set confusion matrix, for comparison:

```
[[63080 18027]
 [  245   509]]
```

## Nonlinear SVC

In [None]:
# RBF-kernel SVM; class_weight='balanced' is used to account for the
# imbalanced labels.
nonlinear_svm = SVC(kernel='rbf', class_weight='balanced', C=1)
nonlinear_svm_scores = cross_val_score(
    estimator=nonlinear_svm,
    X=X_train,
    y=y_train,
    cv=3,
)
# Per-fold scores first, then their mean.
print(nonlinear_svm_scores)
print(nonlinear_svm_scores.mean())

In [None]:
# `nonlin_clf` is just another name for the `nonlinear_svm` object, so fitting
# it here also fits `nonlinear_svm`.
nonlin_clf = nonlinear_svm
nonlin_clf.fit(X_train, y_train)

In [None]:
# Evaluate the fitted kernel SVM on the held-out validation rows.
y_valid_predicted = nonlin_clf.predict(X_valid)
score = nonlin_clf.score(X_valid, y_valid)

validation_summary = [
    "Validation data shape: {0}".format(X_valid.shape),
    "Validation data neg:pos ratio: {0}".format(VALIDATION_NEG_TO_POS_RATIO),
    "Validation accuracy: {0}".format(score),
]
print("\n".join(validation_summary))

# scikit-learn convention: rows are true classes, columns predicted classes.
print(confusion_matrix(y_valid, y_valid_predicted))

Old SVM nonlin numbers:
    
```
Validation data shape: (266, 10)
Validation data neg:pos ratio: 1
Validation accuracy: 0.8045112781954887
[[107  26]
 [ 26 107]]
```

In [26]:
# Multiple C confusion matrices.

from sklearn.externals.joblib import Parallel, delayed

def quick_eval_svc(C):
    """Fit an RBF-kernel SVC with penalty `C` and describe how it performs.

    The model is trained on the notebook-level (X_train, y_train).

    Args:
        C: Value passed as the `C` parameter of the SVC.

    Returns:
        A printable string: a header naming `C`, the confusion matrix on the
        validation data and the confusion matrix on the training data.
    """
    model = SVC(kernel='rbf', class_weight='balanced', C=C)
    model.fit(X_train, y_train)

    valid_matrix = confusion_matrix(y_valid, model.predict(X_valid))
    train_matrix = confusion_matrix(y_train, model.predict(X_train))

    # Built as text, not printed, so that parallel workers can hand their
    # reports back to the caller.
    pieces = [
        "\nC = {0:10.8f}\n--------\n\n".format(C),
        "Validation data confusion matrix:\n",
        str(valid_matrix),
        "\n",
        "Training data confusion matrix (prone to overfitting):\n",
        str(train_matrix),
        "\n",
    ]
    return "".join(pieces)

def confusion_search(Cs=None, n_jobs=6):
    """Run `quick_eval_svc` for several values of C in parallel.

    Args:
        Cs: Iterable of C values to evaluate. Defaults to the original
            hand-picked list ranging from 1e-5 to 5.
        n_jobs: Number of parallel joblib workers. Defaults to 6, the value
            that used to be hard-coded.

    Returns:
        List with one report string per C value, as produced by
        `quick_eval_svc`.
    """
    if Cs is None:
        Cs = [0.00001, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]
    return Parallel(n_jobs=n_jobs)(delayed(quick_eval_svc)(C) for C in Cs)

In [27]:
# Fits one RBF SVC per C value (in parallel); `results` is a list holding one
# preformatted report string per C.
results = confusion_search()

In [28]:
# Each entry is already a formatted, multi-line report, so just print it.
for r in results:
    print(r)


C = 0.00001000
--------

Validation data confusion matrix:
[[133   0]
 [133   0]]
Training data confusion matrix (prone to overfitting):
[[81107     0]
 [  754     0]]


C = 0.00010000
--------

Validation data confusion matrix:
[[105  28]
 [ 25 108]]
Training data confusion matrix (prone to overfitting):
[[67386 13721]
 [  171   583]]


C = 0.00100000
--------

Validation data confusion matrix:
[[111  22]
 [ 18 115]]
Training data confusion matrix (prone to overfitting):
[[68205 12902]
 [  130   624]]


C = 0.00500000
--------

Validation data confusion matrix:
[[120  13]
 [ 16 117]]
Training data confusion matrix (prone to overfitting):
[[70683 10424]
 [  110   644]]


C = 0.01000000
--------

Validation data confusion matrix:
[[120  13]
 [ 16 117]]
Training data confusion matrix (prone to overfitting):
[[70969 10138]
 [  105   649]]


C = 0.05000000
--------

Validation data confusion matrix:
[[120  13]
 [ 15 118]]
Training data confusion matrix (prone to overfitting):
[[72372  873

### Grid search for kernelized SVM (expensive to compute)

In [None]:
# Hyper-parameter grid for the kernelized SVM: only the penalty C is searched.
pg_nonlin = dict(
    C=[0.0005, 0.001, 0.01, 0.1, 1],
)

# Same set-up as the linear grid search: candidates are rated by
# score_on_validation, i.e. on the fixed validation set and not on the CV
# fold. No n_jobs is given here, unlike for gs_lin.
gs_nonlin = GridSearchCV(nonlin_clf, pg_nonlin, cv=None, scoring=score_on_validation, verbose=2)

In [None]:
# Expensive: fits kernel SVMs on the training set for every C in pg_nonlin.
gs_nonlin.fit(X_train, y_train)

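Top results from the nonlinear SVM grid search (RBF kernel, default settings), scored on the validation set (15% of the positive rows, balanced to a 1:1 pos/neg label ratio). The C values listed differ from the grid defined above, so these numbers appear to come from an earlier run.
Using 10 features.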


```
[0] Mean validation score: 0.824 (std: 0.010): {'C': 0.0005}
[1] Mean validation score: 0.811 (std: 0.015): {'C': 0.001}
[2] Mean validation score: 0.782 (std: 0.015): {'C': 0.01}
[3] Mean validation score: 0.748 (std: 0.007): {'C': 0.1}
[4] Mean validation score: 0.733 (std: 0.003): {'C': 0.05}
[5] Mean validation score: 0.694 (std: 0.012): {'C': 0.5}
[6] Mean validation score: 0.691 (std: 0.010): {'C': 0.75}
```

Top results from the nonlinear SVM grid search (RBF kernel, default settings, ...), for the C grid defined above.
Using 20 features.

```
[0] Mean validation score: 0.860 (std: 0.008): {'C': 0.01}
[1] Mean validation score: 0.848 (std: 0.010): {'C': 0.1}
[2] Mean validation score: 0.840 (std: 0.004): {'C': 0.001}
[3] Mean validation score: 0.836 (std: 0.005): {'C': 0.0005}
[4] Mean validation score: 0.796 (std: 0.004): {'C': 1}
```

In [None]:
# n_top=250 is far more than the 5 C values in pg_nonlin, so all results are listed.
report(gs_nonlin.grid_scores_, n_top=250)

In [None]:
# Confusion matrix of the grid search's predictions on the *training* data,
# i.e. on rows the model has already seen.
y_pred_nonlin = gs_nonlin.predict(X_train)
print(confusion_matrix(y_train, y_pred_nonlin))

## Old metrics code

In [None]:
def train_metrics(clf, X, y):
    """Fit `clf` on (X, y) and print metrics computed on that same data.

    Prints the average precision score under the 'micro', 'macro' and
    'weighted' averaging modes, followed by the confusion matrix. Everything is
    measured on the data the classifier was just trained on.

    Args:
        clf: Classifier object providing `fit` and `predict`; it is fitted in place.
        X: Feature matrix.
        y: Labels belonging to the rows of `X`.
    """
    clf.fit(X, y)
    y_pred = clf.predict(X)

    # NOTE(review): average_precision_score is fed hard predictions here, not
    # decision scores or probabilities — confirm that this is intended.
    for averaging in ('micro', 'macro', 'weighted'):
        print(average_precision_score(y, y_pred, average=averaging))

    # Metrics that were tried earlier and switched off:
    # print("F1:        {0:.4f}".format(f1_score(y, y_pred, pos_label='true\n')))
    # print("Precision: {0:.4f}".format(precision_score(y, y_pred)))
    # print("Accuracy:  {0:.4f}".format(accuracy_score(y, y_pred)))
    print("Confusion matrix:")
    print(confusion_matrix(y, y_pred, labels=None))

In [None]:
# pred_y = clf.predict(X)
#clf = SGDClassifier(class_weight='balanced')
# Use `y`, not `y_raw`: X was shuffled together with y, while y_raw is still in
# file order, so pairing X with y_raw would misalign features and labels.
train_metrics(LinearSVC(class_weight='balanced'), X, y)

In [None]:
# Use `y`, not `y_raw`: X was shuffled together with y, while y_raw is still in
# file order, so pairing X with y_raw would misalign features and labels.
train_metrics(SVC(kernel='rbf', class_weight='balanced'), X, y)