Also uses roughly 1500 additional queries from the Yahoo Webscope dataset.

In [1]:
from __future__ import print_function
import numpy as np

In [2]:
# Seed NumPy's global random number generator so that repeated runs of this
# notebook draw the same random numbers.
np.random.seed(0xF00BA2)

In [3]:
# Where we had our Scala program dump all the training data.
# The relative path climbs four levels (back to python, to main, to src, to root).

# Andrei: data from both A and B, and with more features!
# Earlier dumps, kept for reference:
# csv_file_name = "../../../../data/all-candidates-5-18-16-26.csv"
# csv_file_name = "../../../../data/all-candidates-5-18-18-12.csv"
# csv_file_name = "../../../../data/all-candidates-5-19-16-12.csv"
# csv_file_name = "../../../../data/all-candidates-5-19-16-20.csv.firstpart.bak.clean"
# The dump that is actually loaded below:
csv_file_name = "../../../../data/all-candidates-05-19-23-59.clean.csv"

In [4]:
# Load our data loading and scaling utilities.
# NOTE(review): 'load_training_data' and 'rescale', which later cells call, are
# presumably defined by this script -- confirm against data_util.py.
%run '../data_util.py'

In [5]:
# Feature count handed to the loader; presumably the number of feature columns
# in the CSV dump -- TODO confirm against load_training_data.
FEATURE_COUNT = 24

# Read the raw (not yet rescaled) features and labels from the CSV dump.
X_raw, y_raw = load_training_data(csv_file_name, FEATURE_COUNT)

Import was successful.
Feature shape: (192713, 24)
Label shape: (192713,)


In [6]:
# Per-feature (column-wise) summary statistics of the raw data.
column_summaries = (
    ("Raw feature max vals: {0}", np.max),
    ("Raw feature min vals: {0}", np.min),
    ("Raw feature mean vals: {0}", np.mean),
    ("Raw feature stds: {0}", np.std),
)
for template, reducer in column_summaries:
    print(template.format(reducer(X_raw, axis=0)))

print()
print("Unwanted value checks:")
# Per-feature count of entries that would break training.
unwanted_checks = (
    ("NaNs: {0}", np.isnan),
    ("Infinities: {0}", np.isinf),
)
for template, detector in unwanted_checks:
    print(template.format(np.sum(detector(X_raw), axis=0)))

Raw feature max vals: [  4.29496730e+09   1.00000000e+02   2.53333333e+01   2.53333333e+01
   1.00000000e+04   3.66000000e+02   3.10000000e+01   1.00000000e+00
   2.50000000e+01   1.00000000e+00   1.00000000e+00   1.00000000e+00
   1.00000000e+00   1.00000000e+00   5.00000000e-01   5.00000000e-01
   5.00000000e-01   1.70000000e+01   1.93333333e+01   6.80000000e+01
   2.50000000e+01   9.60000000e+01   1.00000000e+00   1.00000000e+00]
Raw feature min vals: [ 0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.]
Raw feature mean vals: [  3.81320309e+07   9.30489017e+01   4.00768855e+00   3.88328597e+00
   4.37843321e+00   1.16572395e+02   6.79603290e+00   1.62278636e-01
   2.28476208e+01   1.80171458e-01   2.26127933e-01   5.25687806e-01
   5.74170699e-01   5.51275519e-01   2.24478976e-01   2.54912585e-01
   2.34994994e-01   1.81204442e+00   2.50582644e+00   1.41830247e+01
   4.91165581e+00   1.56329256e+01   1.81625458e-02   7.02458354e-02]
Raw 

In [7]:
import sklearn
# Rescale the raw data. 'rescale' returns five values; only the first two
# (the rescaled features and labels) are used in this notebook.
X_res, y_res, _, _, _ = rescale(X_raw, y_raw)
# Shuffle features and labels together so that rows stay aligned.
# NOTE(review): no random_state is passed, so this presumably draws from the
# global NumPy RNG seeded earlier -- confirm.
X, y = sklearn.utils.shuffle(X_res, y_res)

[ 0.35576272 -0.12292028 -0.49747385 ...,  0.20090491  0.14791047
  0.08640878]
[ 2.52282403  1.54292582  0.47264302 ...,  0.63322274  0.59135574
  0.52146836]


In [8]:
# Tally the samples of each class; both counts are reused by the split below.
pos_count, neg_count = np.sum(y == 1), np.sum(y == 0)
for message, label_total in (
        ("We have {0} positive labels.", pos_count),
        ("We have {0} negative labels.", neg_count)):
    print(message.format(label_total))

We have 2796 positive labels.
We have 189917 negative labels.


## Train/validation split with slicing wizardry

In [16]:
# The validation set uses a far smaller negative:positive ratio than the full
# data (here one negative per positive), for more accurate validation results.
VALIDATION_NEG_TO_POS_RATIO = 1

pos_count_valid = int(pos_count * 0.25)
neg_count_valid = pos_count_valid * VALIDATION_NEG_TO_POS_RATIO

# Mask of the first 'pos_count_valid' positive rows. The running count of
# positives first equals 'pos_count_valid + 1' at the row where the mask has
# to be cut off, so everything from that row onwards is cleared.
y_pos_ind = (y == 1)
y_pos_counts = np.cumsum(y_pos_ind)
y_pos_lim = np.flatnonzero(y_pos_counts == (pos_count_valid + 1))[0]
y_pos_ind[y_pos_lim:] = False

# Identical construction for the first 'neg_count_valid' negative rows.
y_neg_ind = (y == 0)
y_neg_counts = np.cumsum(y_neg_ind)
y_neg_lim = np.flatnonzero(y_neg_counts == (neg_count_valid + 1))[0]
y_neg_ind[y_neg_lim:] = False

# The two masks must be disjoint; any overlap would mean the slicing or
# indexing above went wrong.
assert not np.any(y_pos_ind & y_neg_ind)

# Rows picked by either mask form the validation set; the rest is training data.
valid_mask = y_pos_ind | y_neg_ind
train_mask = ~valid_mask

X_valid, y_valid = X[valid_mask], y[valid_mask]
X_train, y_train = X[train_mask], y[train_mask]

# Just some manual extra checks.
# TODO(andrei): Label better or remove.
print(np.sum(y_pos_ind), np.sum(y_neg_ind), sep="\n")
print("Training:", X_train.shape, y_train.shape, sep="\n")
print("Validation:", X_valid.shape, y_valid.shape, sep="\n")
print(pos_count_valid, neg_count_valid, pos_count_valid + neg_count_valid)

699
699
Training:
(191315, 24)
(191315,)
Validation:
(1398, 24)
(1398,)
699 699 1398


### Paranoid version

In [15]:
# Row indexes of each class; only their shapes are printed, as a sanity check.
pos_ids = np.flatnonzero(y == 1)
neg_ids = np.flatnonzero(y == 0)
print("Pos/neg shapes (all data):", pos_ids.shape, neg_ids.shape, sep="\n")

# Partition the data in the old-fashioned way: walk over the rows once and copy
# each row into the validation bucket of its class until that bucket is full,
# then into the training bucket.
pos_count_train = pos_count - pos_count_valid
neg_count_train = neg_count - neg_count_valid
features = X.shape[1]

# Pre-allocated destination buffers, one per (class, split) combination.
x_pos_valid = np.zeros((pos_count_valid, features))
x_neg_valid = np.zeros((neg_count_valid, features))
x_pos_train = np.zeros((pos_count_train, features))
x_neg_train = np.zeros((neg_count_train, features))

y_pos_valid = np.zeros(pos_count_valid)
y_neg_valid = np.zeros(neg_count_valid)
y_pos_train = np.zeros(pos_count_train)
y_neg_train = np.zeros(neg_count_train)

# Number of positive / negative rows seen so far.
pos_idx = 0
neg_idx = 0
for idx, label in enumerate(y):
    if label == 1:
        # Positive label
        if pos_idx < pos_count_valid:
            # Still adding to validation dataset
            x_pos_valid[pos_idx] = X[idx]
            y_pos_valid[pos_idx] = label
        else:
            # Done with validation. Now adding to training dataset.
            train_slot = pos_idx - pos_count_valid
            x_pos_train[train_slot] = X[idx]
            y_pos_train[train_slot] = label
        pos_idx += 1
    elif label == 0:
        # Negative label, handled exactly like the positive case.
        if neg_idx < neg_count_valid:
            x_neg_valid[neg_idx] = X[idx]
            y_neg_valid[neg_idx] = label
        else:
            train_slot = neg_idx - neg_count_valid
            x_neg_train[train_slot] = X[idx]
            y_neg_train[train_slot] = label
        neg_idx += 1
    else:
        raise ValueError("Invalid y value: {}".format(label))

print("Positive: train/valid")
print(pos_count_train, pos_count_valid)
print("Negative: train/valid")
print(neg_count_train, neg_count_valid)
# Shape and last row of each validation bucket, for eyeballing.
for bucket in (x_neg_valid, x_pos_valid):
    print(bucket.shape)
    print(bucket[-1, :])

# Uncomment these to actually use the results of this cell later on!
# X_valid = np.append(x_neg_valid, x_pos_valid, axis=0)
# X_train = np.append(x_neg_train, x_pos_train, axis=0)
# y_valid = np.append(y_neg_valid, y_pos_valid, axis=0)
# y_train = np.append(y_neg_train, y_pos_train, axis=0)
# X_valid, y_valid = sklearn.utils.shuffle(X_valid, y_valid)
# X_train, y_train = sklearn.utils.shuffle(X_train, y_train)

Pos/neg shapes (all data):
(2796,)
(189917,)
Positive: train/valid
2097 699
Negative: train/valid
189218 699
(699, 24)
[-0.15152073  0.29853626  0.96039764  0.99351307 -0.02097635 -1.24239408
 -0.06298682 -0.52779043  0.59527895  0.52599533  0.3304225   1.06634672
  0.95088522  1.01910707  1.27854255  1.12022742  1.24202679  1.42882086
  0.95924082 -0.11436485  0.17637816 -0.07776715 -0.15219806 -0.37567905]
(699, 24)
[-0.14549162  0.29853626 -1.44986216 -1.35331569 -0.02097635 -0.33556308
  0.04216498  0.33546819  0.12474773 -0.45813112 -0.03236828  1.06634672
  0.95088522  1.01910707  1.27854255  1.12022742  1.24202679 -0.69743579
 -0.48382437 -0.775485   -1.63615701 -1.67506852  7.89895919 -0.18428249]


## Linear SVC

In [17]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR

# The "class_weight='balanced'" param tells the classifier to give adaptive
# weights to the labels from each class, in order to account for imbalanced data.

vanilla_clf = SGDClassifier(class_weight='balanced', n_iter=5)
print("Will score and then fit the following linear classifier on the training data.")
print(vanilla_clf)
print("Shape of X_train: {}".format(X_train.shape))
print("Shape of X_valid: {}".format(X_valid.shape))
# Cross-validate on the (imbalanced) training split only.
scores = cross_val_score(vanilla_clf, X_train, y_train, cv=10)

print("Naive CV scores (misleading due to data imbalance):")
print(scores)
print(np.mean(scores))

# Fit on the whole training split before looking at the validation data.
vanilla_clf.fit(X_train, y_train)

print("On validation data")
# Restrict the validation data to the rows with a positive label.
y_valid_pos_mask = (y_valid == 1)
X_valid_pos = X_valid[y_valid_pos_mask]
print(X_valid_pos.shape)

# We are only validating positive labels.
vanilla_y_pos_predict = vanilla_clf.predict(X_valid_pos)
vanilla_tp = np.sum(vanilla_y_pos_predict == 1)
print("True positives")
print(vanilla_tp)
print("False negatives")
print(np.sum(vanilla_y_pos_predict == 0))

# Fraction of validation positives that were predicted positive. Taking the
# mean of the boolean hits always yields a float, whereas
# 'vanilla_tp / vanilla_y_pos_predict.shape[0]' is truncating integer division
# on Python 2 (this notebook only enables print_function from __future__, not
# division) and would report 0 there.
vanilla_pos_acc = np.mean(vanilla_y_pos_predict == 1)
print("Accuracy on positive data: {0:6.4f}".format(vanilla_pos_acc))

print("Confusion matrix as an extra sanity check:")
print(confusion_matrix(y_valid, vanilla_clf.predict(X_valid)))


Will score and then fit the following linear classifier on the training data.
SGDClassifier(alpha=0.0001, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)
Shape of X_train: (191315, 24)
Shape of X_valid: (1398, 24)
Naive CV scores (misleading due to data imbalance):
[ 0.9160046   0.89452227  0.87727368  0.89211792  0.83336818  0.96780263
  0.9034079   0.93466102  0.76821746  0.86842656]
0.885580221352
On validation data
(699, 24)
True positives
660
False negatives
39
Accuracy on positive data: 0.9442
Confusion matrix as an extra sanity check:
[[595 104]
 [ 39 660]]


Sample for train-A:
```
[ 0.59212345  0.69622294  0.93390143  0.69415016  0.73952096  0.71020502
  0.73324119  0.67396313  0.69746544  0.78225806]
0.725305177786
```

In [18]:
# Linear classifier with hinge loss and balanced class weights, trained via SGD
# on the training split.
lin_clf = SGDClassifier(class_weight='balanced', alpha=0.001, loss='hinge')
# lin_clf = LinearSVC() -> validation accuracy 0.5 !!
lin_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.001, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False)

In [19]:
# Evaluate the fitted linear classifier on the held-out validation split.
y_valid_predicted = lin_clf.predict(X_valid)
score = lin_clf.score(X_valid, y_valid)

validation_summary = (
    ("Validation data shape: {0}", X_valid.shape),
    ("Validation data neg:pos ratio: {0}", VALIDATION_NEG_TO_POS_RATIO),
    ("Validation accuracy: {0}", score),
)
for template, value in validation_summary:
    print(template.format(value))

print(confusion_matrix(y_valid, y_valid_predicted))

Validation data shape: (1398, 24)
Validation data neg:pos ratio: 1
Validation accuracy: 0.9213161659513591
[[647  52]
 [ 58 641]]


Sample from train-AB with fewer features (20 only) and old WAT style, salsa-auth:
```
Validation data shape: (266, 20)
Validation data neg:pos ratio: 1
Validation accuracy: 0.8421052631578947
[[115  18]
 [ 24 109]]
```

Sample from train-A only:

```
Validation data shape: (136, 10)
Validation data neg:pos ratio: 1
Validation accuracy: 0.7941176470588235
[[65  3]
 [25 43]]
```

### Grid search for Linear SVM (SGDClassifier)

In [23]:
from operator import itemgetter

def report(grid_scores, n_top=3):
    """Print the best entries of a grid search, one line per entry.

    Entries are ranked in descending order of their element at index 1
    (presumably the mean validation score -- the same value that is printed
    through the ``mean_validation_score`` attribute).

    Args:
        grid_scores: Iterable of score records. Each record must be indexable
            and expose ``mean_validation_score``, ``cv_validation_scores`` and
            ``parameters`` attributes.
        n_top: Maximum number of entries to print.
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    line_template = "[{0}] Mean validation score: {1:.3f} (std: {2:.3f}): {3}"
    for rank, entry in enumerate(ranked[:n_top]):
        fold_spread = np.std(entry.cv_validation_scores)
        print(line_template.format(rank,
                                   entry.mean_validation_score,
                                   fold_spread,
                                   entry.parameters))

In [20]:
from sklearn.grid_search import GridSearchCV
from sklearn.externals.joblib import delayed

# Hyper-parameter grid explored by the grid search over the linear classifier.
pg_lin = dict(
    alpha=[0.0005, 0.00075, 0.00100, 0.00500, 0.010, 0.015, 0.025, 0.050, 0.5, 1.0],
    loss=['hinge', 'log'],
    penalty=['l1', 'l2', 'elasticnet'],
    # The default iteration count is 5.
    n_iter=[5, 25],
)

# Simple cross-validation doesn't work well out of the box, as our data is very
# imbalanced, so candidates are scored on the validation split instead.
# (A lambda with the same body was tried previously; the named function is kept.)
def score_on_validation(est, xx_ignore, yy_ignore):
    """Score an estimator on the notebook's validation split.

    Args:
        est: Object exposing a ``score(X, y)`` method.
        xx_ignore: Ignored; the global ``X_valid`` is used instead.
        yy_ignore: Ignored; the global ``y_valid`` is used instead.

    Returns:
        Whatever ``est.score(X_valid, y_valid)`` returns.
    """
    validation_score = est.score(X_valid, y_valid)
    return validation_score

# The CV isn't really necessary, nor does it make sense (scoring ignores the
# folds and always uses the validation split), but there's no real way to
# avoid it, so the fold count is kept at 2.
gs_lin = GridSearchCV(
    estimator=lin_clf,
    param_grid=pg_lin,
    scoring=score_on_validation,
    cv=2,
    n_jobs=-1,
)

In [21]:
# Run the grid search for the linear classifier on the training split.
gs_lin.fit(X_train, y_train)

GridSearchCV(cv=2, error_score='raise',
       estimator=SGDClassifier(alpha=0.001, average=False, class_weight='balanced',
       epsilon=0.1, eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', n_iter=5, n_jobs=1,
       penalty='l2', power_t=0.5, random_state=None, shuffle=True,
       verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'loss': ['hinge', 'log'], 'alpha': [0.0005, 0.00075, 0.001, 0.005, 0.01, 0.015, 0.025, 0.05, 0.5, 1.0], 'penalty': ['l1', 'l2', 'elasticnet'], 'n_iter': [5, 25]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function score_on_validation at 0x1132ddb70>, verbose=0)

In [24]:
# Print up to 50 of the top-ranked parameter combinations from the linear grid search.
report(gs_lin.grid_scores_, n_top=50)

[0] Mean validation score: 0.928 (std: 0.003): {'loss': 'log', 'alpha': 0.001, 'penalty': 'l1', 'n_iter': 5}
[1] Mean validation score: 0.928 (std: 0.001): {'loss': 'hinge', 'alpha': 0.005, 'penalty': 'l2', 'n_iter': 25}
[2] Mean validation score: 0.927 (std: 0.003): {'loss': 'hinge', 'alpha': 0.005, 'penalty': 'elasticnet', 'n_iter': 5}
[3] Mean validation score: 0.927 (std: 0.001): {'loss': 'log', 'alpha': 0.01, 'penalty': 'elasticnet', 'n_iter': 5}
[4] Mean validation score: 0.927 (std: 0.000): {'loss': 'log', 'alpha': 0.001, 'penalty': 'elasticnet', 'n_iter': 25}
[5] Mean validation score: 0.926 (std: 0.001): {'loss': 'hinge', 'alpha': 0.015, 'penalty': 'l2', 'n_iter': 25}
[6] Mean validation score: 0.926 (std: 0.003): {'loss': 'hinge', 'alpha': 0.001, 'penalty': 'elasticnet', 'n_iter': 25}
[7] Mean validation score: 0.926 (std: 0.004): {'loss': 'hinge', 'alpha': 0.00075, 'penalty': 'l1', 'n_iter': 25}
[8] Mean validation score: 0.926 (std: 0.000): {'loss': 'log', 'alpha': 0.005, '

Sample from train-A only:
    
```
[0] Mean validation score: 0.806 (std: 0.003): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.025}
[1] Mean validation score: 0.806 (std: 0.009): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.025}
[2] Mean validation score: 0.794 (std: 0.032): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.001}
[3] Mean validation score: 0.794 (std: 0.006): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.075}
[4] Mean validation score: 0.794 (std: 0.000): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.1}
[5] Mean validation score: 0.792 (std: 0.014): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.075}
[6] Mean validation score: 0.789 (std: 0.017): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.05}
[7] Mean validation score: 0.789 (std: 0.014): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.1}
[8] Mean validation score: 0.789 (std: 0.003): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.1}
[9] Mean validation score: 0.787 (std: 0.045): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.001}
[10] Mean validation score: 0.787 (std: 0.016): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.1}
[11] Mean validation score: 0.787 (std: 0.012): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.075}
[12] Mean validation score: 0.787 (std: 0.010): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.075}
[13] Mean validation score: 0.787 (std: 0.006): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.5}
[14] Mean validation score: 0.784 (std: 0.015): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.1}
[15] Mean validation score: 0.784 (std: 0.017): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.5}
[16] Mean validation score: 0.784 (std: 0.019): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.1}
[17] Mean validation score: 0.782 (std: 0.015): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.5}
[18] Mean validation score: 0.782 (std: 0.014): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.05}
[19] Mean validation score: 0.782 (std: 0.012): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.01}
[20] Mean validation score: 0.782 (std: 0.009): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.01}
[21] Mean validation score: 0.779 (std: 0.022): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.025}
[22] Mean validation score: 0.779 (std: 0.016): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.05}
[23] Mean validation score: 0.779 (std: 0.010): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.05}
[24] Mean validation score: 0.777 (std: 0.018): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.025}
[25] Mean validation score: 0.777 (std: 0.024): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.075}
[26] Mean validation score: 0.777 (std: 0.017): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.05}
[27] Mean validation score: 0.775 (std: 0.018): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.01}
[28] Mean validation score: 0.775 (std: 0.009): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.01}
[29] Mean validation score: 0.767 (std: 0.033): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.025}
[30] Mean validation score: 0.767 (std: 0.049): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.075}
[31] Mean validation score: 0.762 (std: 0.021): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.025}
[32] Mean validation score: 0.757 (std: 0.022): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.01}
[33] Mean validation score: 0.755 (std: 0.003): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.01}
[34] Mean validation score: 0.750 (std: 0.037): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.001}
[35] Mean validation score: 0.728 (std: 0.094): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.0005}
[36] Mean validation score: 0.716 (std: 0.050): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.001}
[37] Mean validation score: 0.708 (std: 0.072): {'loss': 'log', 'penalty': 'l2', 'alpha': 0.001}
[38] Mean validation score: 0.708 (std: 0.062): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.0005}
[39] Mean validation score: 0.701 (std: 0.106): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.0005}
[40] Mean validation score: 0.686 (std: 0.139): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.05}
[41] Mean validation score: 0.686 (std: 0.132): {'loss': 'hinge', 'penalty': 'elasticnet', 'alpha': 0.5}
[42] Mean validation score: 0.672 (std: 0.121): {'loss': 'log', 'penalty': 'elasticnet', 'alpha': 0.0005}
[43] Mean validation score: 0.652 (std: 0.117): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.0005}
[44] Mean validation score: 0.652 (std: 0.100): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.0005}
[45] Mean validation score: 0.583 (std: 0.191): {'loss': 'hinge', 'penalty': 'l2', 'alpha': 0.001}
[46] Mean validation score: 0.500 (std: 0.000): {'loss': 'hinge', 'penalty': 'l1', 'alpha': 0.5}
[47] Mean validation score: 0.500 (std: 0.000): {'loss': 'log', 'penalty': 'l1', 'alpha': 0.5}
```

In [None]:
# Confusion matrix of the grid search's predictions on the training data
# itself (so not an estimate of performance on unseen data).
y_pred_lin = gs_lin.predict(X_train)
print(confusion_matrix(y_train, y_pred_lin))

On slightly less data:
```
[[110587   4643]
 [   155   1024]]
```

Old:

```
[[70902 10205]
 [  147   607]]
```

Older: 

```
[[63080 18027]
 [  245   509]]
```

## Nonlinear SVC

In [None]:
# RBF-kernel SVM with balanced class weights, cross-validated (3 folds) on the
# training split. Prints the per-fold scores followed by their mean.
nonlinear_svm = SVC(C=1, kernel='rbf', class_weight='balanced')
nonlinear_svm_scores = cross_val_score(nonlinear_svm, X_train, y_train, cv=3)
print(nonlinear_svm_scores, np.mean(nonlinear_svm_scores), sep="\n")

In [None]:
# 'nonlin_clf' is another name for the SVC configured above (same object, not
# a copy); fit it on the whole training split.
nonlin_clf = nonlinear_svm
nonlin_clf.fit(X_train, y_train)

In [None]:
# Evaluate the fitted kernelized SVM on the held-out validation split.
y_valid_predicted = nonlin_clf.predict(X_valid)
score = nonlin_clf.score(X_valid, y_valid)

validation_summary = (
    ("Validation data shape: {0}", X_valid.shape),
    ("Validation data neg:pos ratio: {0}", VALIDATION_NEG_TO_POS_RATIO),
    ("Validation accuracy: {0}", score),
)
for template, value in validation_summary:
    print(template.format(value))

print(confusion_matrix(y_valid, y_valid_predicted))

Old SVM nonlin numbers:
    
```
Validation data shape: (266, 10)
Validation data neg:pos ratio: 1
Validation accuracy: 0.8045112781954887
[[107  26]
 [ 26 107]]
```

In [None]:
# Multiple C confusion matrices.

from sklearn.externals.joblib import Parallel, delayed

def quick_eval_svc(C):
    """Fit an RBF-kernel SVC for one value of ``C`` and return a text report.

    The classifier uses balanced class weights and is fit on the notebook's
    global ``X_train`` / ``y_train``. The report holds the confusion matrix on
    the validation split followed by the one on the training split.

    Args:
        C: Value passed as the ``C`` parameter of ``SVC``; also printed in the
            report header with eight decimals.

    Returns:
        The report as a single string.
    """
    header = "\nC = {0:10.8f}\n--------\n\n".format(C)

    model = SVC(kernel='rbf', class_weight='balanced', C=C)
    model.fit(X_train, y_train)
    valid_predictions = model.predict(X_valid)
    train_predictions = model.predict(X_train)

    valid_matrix = str(confusion_matrix(y_valid, valid_predictions))
    train_matrix = str(confusion_matrix(y_train, train_predictions))

    sections = [
        header,
        "Validation data confusion matrix:\n",
        valid_matrix,
        "\n",
        "Training data confusion matrix (prone to overfitting):\n",
        train_matrix,
        "\n",
    ]
    return "".join(sections)

def confusion_search(Cs=None, n_jobs=6):
    """Evaluate ``quick_eval_svc`` for several values of C in parallel.

    Args:
        Cs: Iterable of C values to evaluate. Defaults to the grid this
            notebook has always used (1e-5 up to 5).
        n_jobs: Number of parallel workers handed to ``Parallel``. Defaults to
            6, the previously hard-coded value.

    Returns:
        List with one text report per C value, in the order of ``Cs``.
    """
    if Cs is None:
        Cs = [0.00001, 0.0001, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.6, 0.7, 0.8, 0.9, 1, 2, 5]
    res = Parallel(n_jobs=n_jobs)(delayed(quick_eval_svc)(C) for C in Cs)
    return res

In [None]:
# Evaluate every candidate C (see confusion_search) and keep the text reports.
results = confusion_search()

In [None]:
# Print the per-C reports one after another.
for r in results:
    print(r)

Old confusion matrices (before positive label bug fix):

```
C = 0.00001000
--------

Validation data confusion matrix:
[[133   0]
 [133   0]]
Training data confusion matrix (prone to overfitting):
[[81107     0]
 [  754     0]]


C = 0.00010000
--------

Validation data confusion matrix:
[[105  28]
 [ 25 108]]
Training data confusion matrix (prone to overfitting):
[[67386 13721]
 [  171   583]]


C = 0.00100000
--------

Validation data confusion matrix:
[[111  22]
 [ 18 115]]
Training data confusion matrix (prone to overfitting):
[[68205 12902]
 [  130   624]]


C = 0.00500000
--------

Validation data confusion matrix:
[[120  13]
 [ 16 117]]
Training data confusion matrix (prone to overfitting):
[[70683 10424]
 [  110   644]]


C = 0.01000000
--------

Validation data confusion matrix:
[[120  13]
 [ 16 117]]
Training data confusion matrix (prone to overfitting):
[[70969 10138]
 [  105   649]]


C = 0.05000000
--------

Validation data confusion matrix:
[[120  13]
 [ 15 118]]
Training data confusion matrix (prone to overfitting):
[[72372  8735]
 [   82   672]]


C = 0.10000000
--------

Validation data confusion matrix:
[[121  12]
 [ 18 115]]
Training data confusion matrix (prone to overfitting):
[[72817  8290]
 [   67   687]]


C = 0.50000000
--------

Validation data confusion matrix:
[[123  10]
 [ 25 108]]
Training data confusion matrix (prone to overfitting):
[[74080  7027]
 [   40   714]]


C = 0.60000000
--------

Validation data confusion matrix:
[[124   9]
 [ 29 104]]
Training data confusion matrix (prone to overfitting):
[[74291  6816]
 [   38   716]]


C = 0.70000000
--------

Validation data confusion matrix:
[[123  10]
 [ 30 103]]
Training data confusion matrix (prone to overfitting):
[[74413  6694]
 [   34   720]]


C = 0.80000000
--------

Validation data confusion matrix:
[[124   9]
 [ 30 103]]
Training data confusion matrix (prone to overfitting):
[[74562  6545]
 [   33   721]]


C = 0.90000000
--------

Validation data confusion matrix:
[[124   9]
 [ 30 103]]
Training data confusion matrix (prone to overfitting):
[[74678  6429]
 [   32   722]]


C = 1.00000000
--------

Validation data confusion matrix:
[[125   8]
 [ 30 103]]
Training data confusion matrix (prone to overfitting):
[[74798  6309]
 [   32   722]]


C = 2.00000000
--------

Validation data confusion matrix:
[[126   7]
 [ 38  95]]
Training data confusion matrix (prone to overfitting):
[[75630  5477]
 [   24   730]]


C = 5.00000000
--------

Validation data confusion matrix:
[[128   5]
 [ 48  85]]
Training data confusion matrix (prone to overfitting):
[[76552  4555]
 [   13   741]]
```

### Grid search for kernelized SVM (expensive to compute)

In [None]:
# Candidate values of the SVC penalty parameter C.
pg_nonlin = dict(
    C=[0.001, 0.01, 0.075, 0.100, 0.125, 0.5, 0.75, 0.9, 1, 1.2],
)

# Grid search over the kernelized SVM; like the linear search, every candidate
# is scored with score_on_validation.
gs_nonlin = GridSearchCV(
    estimator=nonlin_clf,
    param_grid=pg_nonlin,
    scoring=score_on_validation,
    cv=2,
    n_jobs=4,
    verbose=2,
)

In [None]:
# Run the grid search for the kernelized SVM on the training split
# (expensive, per the section heading).
gs_nonlin.fit(X_train, y_train)

Top results from nonlinear SVM grid search (rbf kernel, default settings, checked on 15% validation data set, biased for 1:1 pos/neg labels).
Using 10 features.


```
[0] Mean validation score: 0.824 (std: 0.010): {'C': 0.0005}
[1] Mean validation score: 0.811 (std: 0.015): {'C': 0.001}
[2] Mean validation score: 0.782 (std: 0.015): {'C': 0.01}
[3] Mean validation score: 0.748 (std: 0.007): {'C': 0.1}
[4] Mean validation score: 0.733 (std: 0.003): {'C': 0.05}
[5] Mean validation score: 0.694 (std: 0.012): {'C': 0.5}
[6] Mean validation score: 0.691 (std: 0.010): {'C': 0.75}
```

Top results from nonlinear SVM grid search (rbf kernel, default settings, ...).
Using 20 features.

```
[0] Mean validation score: 0.860 (std: 0.008): {'C': 0.01}
[1] Mean validation score: 0.848 (std: 0.010): {'C': 0.1}
[2] Mean validation score: 0.840 (std: 0.004): {'C': 0.001}
[3] Mean validation score: 0.836 (std: 0.005): {'C': 0.0005}
[4] Mean validation score: 0.796 (std: 0.004): {'C': 1}
```

In [None]:
# Print the ranked results of the kernelized grid search; n_top=250 is far
# above the number of C candidates, so every combination is shown.
report(gs_nonlin.grid_scores_, n_top=250)

In [None]:
# Confusion matrix of the grid search's predictions on the training data.
y_pred_nonlin = gs_nonlin.predict(X_train)
print(confusion_matrix(y_train, y_pred_nonlin))

# Same, on the held-out validation split.
y_pred_valid_nonlin = gs_nonlin.predict(X_valid)
print(confusion_matrix(y_valid, y_pred_valid_nonlin))

## Old metrics code

In [None]:
def train_metrics(clf, X, y):
    """Fit ``clf`` on ``(X, y)`` and print metrics computed on that same data.

    Prints the average precision score under the 'micro', 'macro' and
    'weighted' averaging modes, followed by the confusion matrix. Since the
    classifier is evaluated on the data it was just fit on, these are
    training-set numbers.

    NOTE(review): the average precision is computed from hard predictions
    (``clf.predict``) rather than decision scores -- confirm this is intended.

    Args:
        clf: Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fit in
            place.
        X: Features.
        y: Labels.
    """
    clf.fit(X, y)
    y_pred = clf.predict(X)
    for averaging in ('micro', 'macro', 'weighted'):
        print(average_precision_score(y, y_pred, average=averaging))
    # Metrics that are currently disabled:
    #     print("F1:        {0:.4f}".format(f1_score(y, y_pred, pos_label='true\n')))
    #     print("Precision: {0:.4f}".format(precision_score(y, y_pred)))
    #     print("Accuracy:  {0:.4f}".format(accuracy_score(y, y_pred)))
    print("Confusion matrix:")
    print(confusion_matrix(y, y_pred, labels=None))

In [None]:
# Training-set metrics for a class-balanced LinearSVC fit on the full data.
# The labels must be 'y', not 'y_raw': 'X' was shuffled together with 'y', so
# the unshuffled 'y_raw' no longer lines up with the rows of 'X'.
train_metrics(LinearSVC(class_weight='balanced'), X, y)

In [None]:
# Training-set metrics for a class-balanced RBF-kernel SVC fit on the full data.
# The labels must be 'y', not 'y_raw': 'X' was shuffled together with 'y', so
# the unshuffled 'y_raw' no longer lines up with the rows of 'X'.
train_metrics(SVC(kernel='rbf', class_weight='balanced'), X, y)