In [83]:
import numpy as np
import pandas
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from collections import Counter

In [84]:
hashtags_data = pandas.read_csv("../data_collection/hashtags_data.csv", )

In [85]:
# build feature extractor
feature_extraction = TfidfVectorizer(min_df=0.1, sublinear_tf=True)
feature_extraction.fit(hashtags_data["hashtags"].values)
X = feature_extraction.transform(hashtags_data["hashtags"].values)

In [86]:
y= hashtags_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [87]:
# train classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [88]:
# predict and evaluate predictions
predictions = clf.predict(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions)))

ROC-AUC yields 0.986486486486


In [89]:
print(predictions, y_test, [x - y for x, y in zip(predictions, y_test)])

[0 0 1 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 0 0 0 1 1 0 1 0 0
 0 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 0 1 0 1 0] [0 0 1 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 1 1 0 1 0 0
 0 0 1 1 1 1 0 0 1 0 1 1 1 1 1 0 1 0 1 1 0 1 1 0 0 1 0 1 0 1 0] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Comment after first prediction

The results above are great, we managed to accuratemy predict the political view for several twitter accounts. We will now look at parameter tuning, to increase our prediction score even more.
To do so, we will use a tool provided by Scikit learn, called GridSearchCV. Given a range of possibilities for our parameters, GridSearchCV will test every combination itself, returning the best parameters for our problem.

In [90]:
vect = TfidfVectorizer()
X = vect.fit_transform(hashtags_data["hashtags"].values)

params = {"svc__C": [.01, .1, 1, 10, 100],
          "svc__kernel": ['rbf', 'poly', 'linear'],
          "tfidf__min_df": [.01, .03, .05, .01, .02],
          "tfidf__sublinear_tf": [True, False]}

clf = Pipeline([("tfidf", TfidfVectorizer()),
                ("svc", SVC())])

estimator = GridSearchCV(clf, params, verbose=0, n_jobs=2)
estimator.fit(hashtags_data["hashtags"].values, hashtags_data["label"].values)

print(estimator.best_estimator_)
print(estimator.best_score_)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.03,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
0.9880952380952381


In [91]:
print(estimator.best_estimator_.steps)

[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.03,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]


In [92]:
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.05, sublinear_tf=True)), ('svc', SVC(C=1, kernel='linear'))])

In [93]:
scores = cross_val_score(clf, hashtags_data["hashtags"].values, hashtags_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.988095238095


### Quick test with user mentions

In our data extraction we also kept the user mentions in the tweets. Let's have a quick try at using these to predict the political view, using the same techniques as hashtags. We will mostly copy the code that was already written, since the process is the same.

In [94]:
mentions_data = pandas.read_csv("../data_collection/mentions_data.csv", )
mentions_data.head()

Unnamed: 0,label,mentions
0,0,stephsmithfl danp_att arringtond3 hispaniccauc...
1,0,girlscouts bradybuzz repjackyrosen averywgardi...
2,0,secretarysonny famu_1887 repallawsonjr lrobins...
3,0,repespaillat repjohnlewis mmviverito cunydream...
4,0,officialcbc ducksunlimited agriculturede shalo...


In [95]:
# build feature extractor
feature_extraction = TfidfVectorizer(min_df=0.1, sublinear_tf=True)
feature_extraction.fit(mentions_data["mentions"].values)
X = feature_extraction.transform(mentions_data["mentions"].values)

In [52]:
y= mentions_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [53]:
# train classifier
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.05, sublinear_tf=True)), ('svm', SVC(C=1, kernel='linear'))])

In [54]:
scores = cross_val_score(clf, mentions_data["mentions"].values, mentions_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.988147914033


In [55]:
vect = TfidfVectorizer()
X = vect.fit_transform(mentions_data["mentions"].values)

params = {"svc__C": [.01, .1, 1, 10, 100],
          "svc__kernel": ['linear'],
          "tfidf__min_df": [.01, .03, .05, .01, .02],
          "tfidf__sublinear_tf": [True]}

clf = Pipeline([("tfidf", TfidfVectorizer()),
                ("svc", SVC())])

estimator = GridSearchCV(clf, params, verbose=0, n_jobs=2)
estimator.fit(mentions_data["mentions"].values, mentions_data["label"].values)

print(estimator.best_estimator_)
print(estimator.best_score_)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
0.9970414201183432


In [56]:
print(estimator.best_estimator_.steps)

[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('svc', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]


In [57]:
# train classifier
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.02, sublinear_tf=True)), ('svm', SVC(C=10, kernel='linear'))])

In [58]:
scores = cross_val_score(clf, mentions_data["mentions"].values, mentions_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.997050147493


### Quick test with urls

In our data extraction we also kept the urls in the tweets. Let's have a quick try at using these to predict the political view, using the same techniques as hashtags. We will mostly copy the code that was already written, since the process is the same.

In [59]:
urls_data = pandas.read_csv("../data_collection/urls_data.csv", )
urls_data.head()

Unnamed: 0,label,urls
0,0,http://www.businessinsider.com/ http://www.tam...
1,0,https://lasvegassun.com/ http://www.mcclatchyd...
2,0,https://usat.ly/ https://medium.com/ https://m...
3,0,http://thehill.com/ https://www.healthcare.gov...
4,0,http://thndr.me/ http://www.healthcare.gov/ ht...


In [60]:
# build feature extractor
feature_extraction = TfidfVectorizer(min_df=0.1, sublinear_tf=True)
feature_extraction.fit(urls_data["urls"].values)
X = feature_extraction.transform(urls_data["urls"].values)

In [61]:
y= urls_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [62]:
# train classifier
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.05, sublinear_tf=True)), ('svm', SVC(C=1, kernel='linear'))])

In [63]:
scores = cross_val_score(clf, urls_data["urls"].values, urls_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.961572903498


In [64]:
vect = TfidfVectorizer()
X = vect.fit_transform(urls_data["urls"].values)

params = {"svc__C": [.01, .1, 1, 10, 100],
          "svc__kernel": ['linear'],
          "tfidf__min_df": [.01, .03, .05, .01, .02],
          "tfidf__sublinear_tf": [True]}

clf = Pipeline([("tfidf", TfidfVectorizer()),
                ("svc", SVC())])

estimator = GridSearchCV(clf, params, verbose=0, n_jobs=2)
estimator.fit(urls_data["urls"].values, urls_data["label"].values)

print(estimator.best_estimator_)
print(estimator.best_score_)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
0.9704142011834319


In [65]:
 print(estimator.best_estimator_.steps)

[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('svc', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]


## Test on republican supporter accounts

I have gathered accounts from trump supporters. Let's see if our models manages to predict that these accounts are indeed republican accounts. The exercise is much harder than what we have done before, since these new accounts don't necessarily talk about politics a lot.

### First with mentions

In [119]:
mentions_test_data = pandas.read_csv("../data_collection/mentions_test_data.csv", )

In [120]:
clf_mentions = Pipeline([('vect', TfidfVectorizer(min_df=0.02, sublinear_tf=True)), ('svm', SVC(C=10, kernel='linear'))])
clf_mentions.fit(mentions_data["mentions"].values, mentions_data["label"].values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [121]:
prediction = clf_mentions.predict(mentions_test_data['mentions'].values)
print(prediction)
print('Our algorithm accuracy on mentions for', len(prediction), 'accounts :', sum(prediction)/len(prediction))

[1 1 1 1 0 1 0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1]
Our algorithm accuracy on mentions for 31 accounts : 0.709677419355


### Now with hashtags

In [122]:
hashtags_test_data = pandas.read_csv("../data_collection/hashtags_test_data.csv", )

In [123]:
clf_hashtags = Pipeline([('vect', TfidfVectorizer(min_df=0.05, sublinear_tf=True)), ('svc', SVC(C=1, kernel='linear'))])
clf_hashtags.fit(hashtags_data["hashtags"].values, hashtags_data["label"].values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.05,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [124]:
prediction = clf_hashtags.predict(hashtags_test_data['hashtags'].values)
print(prediction)
print('Our algorithm accuracy on mentions for', len(prediction), 'accounts :', sum(prediction)/len(prediction))

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1]
Our algorithm accuracy on mentions for 30 accounts : 0.966666666667


### Let's finish with urls

In [128]:
urls_test_data = pandas.read_csv("../data_collection/urls_test_data.csv", )

In [129]:
clf_urls = Pipeline([('vect', TfidfVectorizer(min_df=0.01, sublinear_tf=True)), ('svc', SVC(C=10, kernel='linear'))])
clf_urls.fit(urls_data["urls"].values, urls_data["label"].values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [131]:
prediction = clf_hashtags.predict(urls_test_data['urls'].values)
print(prediction)
print('Our algorithm accuracy on mentions for', len(prediction), 'accounts :', sum(prediction)/len(prediction))

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1]
Our algorithm accuracy on mentions for 31 accounts : 0.967741935484
