In [1]:
import numpy as np
import pandas as pd

In [2]:
# Where we had our Scala program dump all the training data.
csv_file_name = "../data/all-candidates-4-10-14-10.csv"

# The names of the columns in the CSV file. Should update if more features become available.
feature_count = 10
col_names = ['entity_id', 'mention', 'mention_start', 'mention_end', 'mention_length',
             'featureStartFlag']
feature_names = ['f%d' % f for f in range(feature_count)]
col_names += feature_names
col_names += ['featureEndFlag', 'relevant']

#train_data = pd.read_csv(csv_file_name, true_values=['true'], false_values=['false'],
#                        names=col_names)

In [40]:
X_raw = []
y_raw = []

with open(csv_file_name, 'r') as f:
    bad_lines = 0
    for line_number, line in enumerate(f.readlines()):
        if bad_lines > 10:
            print("Too many bad lines. The parsing code is bugged or the "
                  "CSV is badly formatted. Get yo shit together fam.")
            break
            
        parts = line[:-1].split(",")
        if len(parts) != 18:
            bad_lines += 1
            print("Skipping bad line")
            continue
        
        # Example CSV line, as of May 10.
        # 364646, lumet familt, 7, 19, 12, featureStart, [feature_count features], featureEnd, true
        try:
            # Not used at the moment.
            # meta = parts[:5]
            features = parts[6:6 + feature_count]
            label = parts[-1]
            features_np = np.array([float(f) for f in features])
            X_raw.append(features_np)
            y_raw.append(1 if label == 'true' else 0)
        except ValueError as e:
            print("Could not parse data line {0}: {1}.".format(line_number, line))
            print(e)
            bad_lines += 1
            
    X_raw = np.array(X_raw)
    y_raw = np.array(y_raw)
        
    if bad_lines > 0:
        print("Bad lines: {0}".format(bad_lines))
    else:
        print("Import was successful.")
        
    print("Feature shape: {0}".format(X_raw.shape))
    print("Label shape: {0}".format(y_raw.shape))
    
# Hacky imputation.
X_raw[np.isnan(X_raw)] = 0.0
# TODO(andrei): Maybe set this to a very large constant.
X_raw[np.isinf(X_raw)] = 0.0

Import was successful.
Feature shape: (43548, 10)
Label shape: (43548,)


In [41]:
# old_opts = np.get_printoptions()
# np.set_printoptions(threshold=np.nan)

print("Raw feature max vals: {0}".format(np.max(X_raw, axis=0)))
print("Raw feature min vals: {0}".format(np.min(X_raw, axis=0)))
print("Raw feature mean vals: {0}".format(np.mean(X_raw, axis=0)))
print("Raw feature stds: {0}".format(np.std(X_raw, axis=0)))

print("NaNs: {0}".format(np.sum(np.isnan(X_raw), axis=0)))
print("Infinities: {0}".format(np.sum(np.isinf(X_raw), axis=0)))

# np.set_printoptions(**old_opts)

Raw feature max vals: [  3.39000000e+08   1.00000000e+02   2.00000000e+01   2.00000000e+01
   1.57500000e+01   0.00000000e+00   2.35000000e+01   1.00000000e+00
   2.50000000e+01   4.20000000e+01]
Raw feature min vals: [ 24.   1.   0.   0.   0.   0.   0.   0.  13.  42.]
Raw feature mean vals: [  2.26494924e+07   9.34776568e+01   4.07127626e+00   3.98202806e+00
   1.40936438e-02   0.00000000e+00   6.49731911e+00   1.58556306e-01
   2.30570708e+01   4.20000000e+01]
Raw feature stds: [  4.93288879e+07   2.24788304e+01   2.10829459e+00   2.17877619e+00
   1.74351238e-01   0.00000000e+00   1.69935778e+00   2.20900887e-01
   2.74772698e+00   0.00000000e+00]
NaNs: [0 0 0 0 0 0 0 0 0 0]
Infinities: [0 0 0 0 0 0 0 0 0 0]


In [42]:
from sklearn import preprocessing

X = preprocessing.scale(X_raw)
print(np.mean(X, axis=1))
print(np.std(X, axis=1))

[-0.77989972 -0.77989972 -0.17124558 ...,  0.58752018  0.24041326
 -0.05640984]
[ 1.93966831  1.93966831  0.37219353 ...,  1.16820207  0.68798874
  0.4059836 ]




In [43]:
from sklearn.cross_validation import cross_val_score
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import *
from sklearn.svm import SVC, SVR, LinearSVC, LinearSVR


# The "class_weight='balanced'" param tells the SVM to give adaptive weights
# to the labels from each class, in order to account for imbalanced data.

# scores = cross_val_score(SGDClassifier(class_weight='balanced'), X, y_raw, cv=10)
scores = cross_val_score(LinearSVC(class_weight='balanced'), X, y_raw, cv=5)
print(scores)
print(np.mean(scores))

[ 0.85384615  0.81595867  0.8326062   0.8196119   0.81846366]
0.828097315168


In [8]:
nonlinear_svm = SVC(kernel='rbf', class_weight='balanced')
nonlinear_svm_scores = cross_val_score(nonlinear_svm, X, y_raw, cv=5)
print(nonlinear_svm_scores)
print(np.mean(nonlinear_svm_scores))

[ 0.78392652  0.73765786  0.76337543  0.71156275  0.76036284]
0.751377082103


In [44]:
def train_metrics(clf, X, y):
    clf.fit(X, y)
    y_pred = clf.predict(X)
    print(average_precision_score(y, y_pred, average='micro'))
    print(average_precision_score(y, y_pred, average='macro'))
    print(average_precision_score(y, y_pred, average='weighted'))
#     print("F1:        {0:.4f}".format(f1_score(y, y_pred, pos_label='true\n')))
#     print("Precision: {0:.4f}".format(precision_score(y, y_pred)))
#     print("Accuracy:  {0:.4f}".format(accuracy_score(y, y_pred)))
    print("Confusion matrix:")
    print(confusion_matrix(y, y_pred, labels=None))

In [45]:
# pred_y = clf.predict(X)
#clf = SGDClassifier(class_weight='balanced')
train_metrics(LinearSVC(class_weight='balanced'), X, y_raw)

0.317132474404
0.317132474404
0.317132474404
Confusion matrix:
[[35988  7105]
 [  185   270]]


In [46]:
train_metrics(SVC(kernel='rbf', class_weight='balanced'), X, y_raw)

0.44347221463
0.44347221463
0.44347221463
Confusion matrix:
[[32370 10723]
 [   68   387]]


In [28]:
"{0:.2f}".format(1.555)

'1.55'