In [710]:
import numpy as np
import pandas
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from collections import Counter

In [711]:
hashtags_data = pandas.read_csv("../data_collection/hashtags_data.csv", )

In [712]:
# build feature extractor
feature_extraction = TfidfVectorizer(min_df=0.1, sublinear_tf=True)
feature_extraction.fit(hashtags_data["hashtags"].values)
X = feature_extraction.transform(hashtags_data["hashtags"].values)

In [713]:
y= hashtags_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [714]:
# train classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [715]:
# predict and evaluate predictions
predictions = clf.predict(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions)))

ROC-AUC yields 0.9875


In [716]:
print(predictions, y_test, [x - y for x, y in zip(predictions, y_test)])

[1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 0 1 0 1 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0
 1 1 1 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0] [1 0 1 0 0 1 0 1 0 0 1 0 0 1 1 1 0 1 0 1 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 1 0
 1 1 1 1 1 1 1 0 0 1 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 0 0 1 0] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Comment after first prediction

The results above are great, we managed to accuratemy predict the political view for several twitter accounts. We will now look at parameter tuning, to increase our prediction score even more.
To do so, we will use a tool provided by Scikit learn, called GridSearchCV. Given a range of possibilities for our parameters, GridSearchCV will test every combination itself, returning the best parameters for our problem.

In [717]:
vect = TfidfVectorizer()
X = vect.fit_transform(hashtags_data["hashtags"].values)

params = {"svc__C": [.01, .1, 1, 10, 100],
          "svc__kernel": ['rbf', 'poly', 'linear'],
          "tfidf__min_df": [.01, .03, .05, .01, .02],
          "tfidf__sublinear_tf": [True, False]}

clf = Pipeline([("tfidf", TfidfVectorizer()),
                ("svc", SVC())])

estimator = GridSearchCV(clf, params, verbose=3, n_jobs=2)
estimator.fit(hashtags_data["hashtags"].values, hashtags_data["label"].values)

print(estimator.best_estimator_)
print(estimator.best_score_)

Fitting 3 folds for each of 150 candidates, totalling 450 fits
[CV] svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV]  svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV]  svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV] svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=0.01, svc_

[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    4.9s


[CV]  svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.4s
[CV] svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.4s
[CV] svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.4s
[CV] svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=0.01, svc__kernel=poly, tfidf__min_df=0.

[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.03, tfidf__sublinear_tf=False, score=0.508929 -   0.4s
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=False 
[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=True, score=0.508929 -   0.4s
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=False 
[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=False, score=0.508929 -   0.4s
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=False 
[CV]  svc__C=0.01, svc__kernel

[CV] svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=True 
[CV]  svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=True, score=0.508929 -   0.4s
[CV] svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=False 
[CV]  svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=True, score=0.508929 -   0.4s
[CV] svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=False 
[CV]  svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.4s
[CV] svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=False 
[CV]  svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.5s
[CV]  svc__C=0.1, svc__kernel=rbf, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.4s
[CV] svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublin

[Parallel(n_jobs=2)]: Done 124 tasks      | elapsed:   23.3s


[CV]  svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=False 
[CV]  svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=False 
[CV]  svc__C=0.1, svc__kernel=poly, tfidf__min_df=0.03, tfidf

[CV] svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.955357 -   0.4s
[CV]  svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.937500 -   0.4s
[CV] svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.964286 -   0.3s
[CV]  svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.875000 -   0.3s
[CV] svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV] svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.866071 -   0.3s
[CV] svc__C=0.1, svc__kernel=linear, tfi

[CV]  svc__C=1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.4s
[CV] svc__C=1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.508929 -   0.4s
[CV] svc__C=1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=poly, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV]  svc__C=1, svc__kernel=poly, tfidf__min_df=0.03, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=1, svc__kernel=poly, tfidf__min_df=0.03, tf

[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.05, tfidf__sublinear_tf=False, score=0.982143 -   0.2s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.964286 -   0.3s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.991071 -   0.3s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.919643 -   0.2s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.991071 -   0.3s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf_

[Parallel(n_jobs=2)]: Done 284 tasks      | elapsed:   49.7s


[CV]  svc__C=10, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=True, score=0.892857 -   0.3s
[CV]  svc__C=10, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False, score=0.910714 -   0.3s
[CV] svc__C=10, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False 
[CV] svc__C=10, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False 
[CV]  svc__C=10, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False, score=0.875000 -   0.3s
[CV]  svc__C=10, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False, score=0.919643 -   0.3s
[CV] svc__C=10, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=10, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=10, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV]  svc__C=10, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=10, svc__kernel=rbf, tfidf__min_df=0.01, tf

[CV]  svc__C=10, svc__kernel=poly, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.964286 -   0.3s
[CV]  svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.991071 -   0.3s
[CV] svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV]  svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.991071 -   0.3s
[CV]  svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False, score=0.919643 -   0.3s
[CV] svc__C=10, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=False 
[CV] svc__C=10, svc__kernel=linear, tfidf__min_df=0.0

[CV]  svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=True, score=0.982143 -   0.2s
[CV] svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=True 
[CV]  svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=True, score=0.973214 -   0.2s
[CV] svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False 
[CV]  svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=True, score=0.991071 -   0.2s
[CV] svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False 
[CV]  svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False, score=0.937500 -   0.2s
[CV] svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False 
[CV]  svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinear_tf=False, score=0.919643 -   0.2s
[CV] svc__C=100, svc__kernel=rbf, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=100, svc__kernel=rbf, tfidf__min_df=0.05, tfidf__sublinea

[CV]  svc__C=100, svc__kernel=poly, tfidf__min_df=0.02, tfidf__sublinear_tf=True, score=0.508929 -   0.3s
[CV] svc__C=100, svc__kernel=poly, tfidf__min_df=0.02, tfidf__sublinear_tf=False 
[CV]  svc__C=100, svc__kernel=poly, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=100, svc__kernel=poly, tfidf__min_df=0.02, tfidf__sublinear_tf=False 
[CV]  svc__C=100, svc__kernel=poly, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=100, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=100, svc__kernel=poly, tfidf__min_df=0.02, tfidf__sublinear_tf=False, score=0.508929 -   0.3s
[CV] svc__C=100, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=100, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.964286 -   0.4s
[CV]  svc__C=100, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.991071 -   0.3s
[CV] svc__C=100, svc__kerne

[Parallel(n_jobs=2)]: Done 450 out of 450 | elapsed:  1.3min finished


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.03,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
0.9880952380952381


In [718]:
print(estimator.best_estimator_.steps)

[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.03,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]


In [753]:
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.05, sublinear_tf=True)), ('svc', SVC(C=1, kernel='linear'))])

In [754]:
scores = cross_val_score(clf, hashtags_data["hashtags"].values, hashtags_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.988095238095


### Quick test with user mentions

In our data extraction we also kept the user mentions in the tweets. Let's have a quick try at using these to predict the political view, using the same techniques as hashtags. We will mostly copy the code that was already written, since the process is the same.

In [735]:
mentions_data = pandas.read_csv("../data_collection/mentions_data.csv", )
mentions_data.head()

Unnamed: 0,label,mentions
0,0,stephsmithfl danp_att arringtond3 hispaniccauc...
1,0,girlscouts bradybuzz repjackyrosen averywgardi...
2,0,secretarysonny famu_1887 repallawsonjr lrobins...
3,0,repespaillat repjohnlewis mmviverito cunydream...
4,0,officialcbc ducksunlimited agriculturede shalo...


In [736]:
# build feature extractor
feature_extraction = TfidfVectorizer(min_df=0.1, sublinear_tf=True)
feature_extraction.fit(mentions_data["mentions"].values)
X = feature_extraction.transform(mentions_data["mentions"].values)

In [737]:
y= mentions_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [749]:
# train classifier
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.05, sublinear_tf=True)), ('svm', SVC(C=1, kernel='linear'))])

In [750]:
scores = cross_val_score(clf, mentions_data["mentions"].values, mentions_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.988147914033


In [742]:
vect = TfidfVectorizer()
X = vect.fit_transform(mentions_data["mentions"].values)

params = {"svc__C": [.01, .1, 1, 10, 100],
          "svc__kernel": ['linear'],
          "tfidf__min_df": [.01, .03, .05, .01, .02],
          "tfidf__sublinear_tf": [True]}

clf = Pipeline([("tfidf", TfidfVectorizer()),
                ("svc", SVC())])

estimator = GridSearchCV(clf, params, verbose=3, n_jobs=2)
estimator.fit(mentions_data["mentions"].values, mentions_data["label"].values)

print(estimator.best_estimator_)
print(estimator.best_score_)

Fitting 3 folds for each of 25 candidates, totalling 75 fits
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.504425 -   0.5s
[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.504425 -   0.5s
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.03, tfidf__sublinear_tf=True, score=0.504425 -   0.4s
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.508929 -   0.5s
[CV] svc__C=0.01, svc__kernel=linear, tfidf__min_df=0.03, tfidf__sublinear_tf=True 

[Parallel(n_jobs=2)]: Done  28 tasks      | elapsed:    6.6s


[CV]  svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.02, tfidf__sublinear_tf=True, score=0.938053 -   0.5s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=0.1, svc__kernel=linear, tfidf__min_df=0.02, tfidf__sublinear_tf=True, score=0.955357 -   0.5s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=1.000000 -   0.5s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf__sublinear_tf=True, score=0.982301 -   0.6s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.03, tfidf__sublinear_tf=True, score=0.973451 -   0.4s
[CV] svc__C=1, svc__kernel=linear, tfidf__min_df=0.03, tfidf__sublinear_tf=True 
[CV]  svc__C=1, svc__kernel=linear, tfidf__min_df=0.01, tfidf

[CV] svc__C=100, svc__kernel=linear, tfidf__min_df=0.02, tfidf__sublinear_tf=True 
[CV]  svc__C=100, svc__kernel=linear, tfidf__min_df=0.02, tfidf__sublinear_tf=True, score=0.991150 -   0.5s
[CV] svc__C=100, svc__kernel=linear, tfidf__min_df=0.02, tfidf__sublinear_tf=True 
[CV]  svc__C=100, svc__kernel=linear, tfidf__min_df=0.02, tfidf__sublinear_tf=True, score=1.000000 -   0.5s
[CV]  svc__C=100, svc__kernel=linear, tfidf__min_df=0.02, tfidf__sublinear_tf=True, score=1.000000 -   0.4s


[Parallel(n_jobs=2)]: Done  75 out of  75 | elapsed:   18.1s finished


Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
0.9970414201183432


In [743]:
print(estimator.best_estimator_.steps)

[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('svc', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]


In [745]:
# train classifier
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.02, sublinear_tf=True)), ('svm', SVC(C=10, kernel='linear'))])

In [746]:
scores = cross_val_score(clf, mentions_data["mentions"].values, mentions_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.997050147493
