In [38]:
import numpy as np
import pandas
from sklearn.grid_search import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
from collections import Counter

In [39]:
hashtags_data = pandas.read_csv("../data_collection/hashtags_data.csv", )

In [75]:
# build feature extractor
feature_extraction = TfidfVectorizer(min_df=0.1, sublinear_tf=True)
feature_extraction.fit(hashtags_data["hashtags"].values)
X = feature_extraction.transform(hashtags_data["hashtags"].values)

In [76]:
y= hashtags_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [77]:
# train classifier
clf = SVC(kernel='linear')
clf.fit(X_train, y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [78]:
# predict and evaluate predictions
predictions = clf.predict(X_test)
print('ROC-AUC yields ' + str(roc_auc_score(y_test, predictions)))

ROC-AUC yields 1.0


In [79]:
print(predictions, y_test, [x - y for x, y in zip(predictions, y_test)])

[0 0 0 1 0 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 0 1 0 1 0 0 1
 1 0 1 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1] [0 0 0 1 0 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 1 0 1 1 1 1 0 1 1 0 1 0 1 0 0 1
 1 0 1 0 1 0 1 1 1 1 0 0 1 0 0 1 1 1 0 1 1 1 1 1 1 1 0 0 0 1 1] [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


### Comment after first prediction

The results above are great, we managed to accuratemy predict the political view for several twitter accounts. We will now look at parameter tuning, to increase our prediction score even more.
To do so, we will use a tool provided by Scikit learn, called GridSearchCV. Given a range of possibilities for our parameters, GridSearchCV will test every combination itself, returning the best parameters for our problem.

In [96]:
vect = TfidfVectorizer()
X = vect.fit_transform(hashtags_data["hashtags"].values)

params = {"svc__C": [.01, .1, 1, 10, 100],
          "svc__kernel": ['rbf', 'poly', 'linear'],
          "tfidf__min_df": [.01, .03, .05, .1, .2],
          "tfidf__sublinear_tf": [True, False]}

clf = Pipeline([("tfidf", TfidfVectorizer()),
                ("svc", SVC())])

estimator = GridSearchCV(clf, params, verbose=0, n_jobs=2)
estimator.fit(hashtags_data["hashtags"].values, hashtags_data["label"].values)

print(estimator.best_estimator_)
print(estimator.best_score_)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
0.9910714285714286


In [97]:
print(estimator.best_estimator_.steps)

[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]


In [98]:
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.01, sublinear_tf=True)), ('svc', SVC(C=1, kernel='linear'))])

In [99]:
scores = cross_val_score(clf, hashtags_data["hashtags"].values, hashtags_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.982142857143


### Quick test with user mentions

In our data extraction we also kept the user mentions in the tweets. Let's have a quick try at using these to predict the political view, using the same techniques as hashtags. We will mostly copy the code that was already written, since the process is the same.

In [83]:
mentions_data = pandas.read_csv("../data_collection/mentions_data.csv", )
mentions_data.head()

Unnamed: 0,label,mentions
0,0,stephsmithfl danp_att arringtond3 hispaniccauc...
1,0,girlscouts bradybuzz repjackyrosen averywgardi...
2,0,secretarysonny famu_1887 repallawsonjr lrobins...
3,0,repespaillat repjohnlewis mmviverito cunydream...
4,0,officialcbc ducksunlimited agriculturede shalo...


In [84]:
# build feature extractor
feature_extraction = TfidfVectorizer(min_df=0.1, sublinear_tf=True)
feature_extraction.fit(mentions_data["mentions"].values)
X = feature_extraction.transform(mentions_data["mentions"].values)

In [85]:
y= mentions_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [86]:
# train classifier
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.05, sublinear_tf=True)), ('svm', SVC(C=1, kernel='linear'))])

In [87]:
scores = cross_val_score(clf, mentions_data["mentions"].values, mentions_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.988147914033


In [88]:
vect = TfidfVectorizer()
X = vect.fit_transform(mentions_data["mentions"].values)

params = {"svc__C": [.01, .1, 1, 10, 100],
          "svc__kernel": ['linear'],
          "tfidf__min_df": [.01, .03, .05, .1, .2],
          "tfidf__sublinear_tf": [True]}

clf = Pipeline([("tfidf", TfidfVectorizer()),
                ("svc", SVC())])

estimator = GridSearchCV(clf, params, verbose=0, n_jobs=2)
estimator.fit(mentions_data["mentions"].values, mentions_data["label"].values)

print(estimator.best_estimator_)
print(estimator.best_score_)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
0.9881656804733728


In [89]:
print(estimator.best_estimator_.steps)

[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('svc', SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]


In [90]:
# train classifier
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.02, sublinear_tf=True)), ('svm', SVC(C=10, kernel='linear'))])

In [91]:
scores = cross_val_score(clf, mentions_data["mentions"].values, mentions_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.997050147493


### Quick test with urls

In our data extraction we also kept the urls in the tweets. Let's have a quick try at using these to predict the political view, using the same techniques as hashtags. We will mostly copy the code that was already written, since the process is the same.

In [21]:
urls_data = pandas.read_csv("../data_collection/urls_data.csv", )
urls_data.head()

Unnamed: 0,label,urls
0,0,http://www.businessinsider.com/ http://www.tam...
1,0,https://lasvegassun.com/ http://www.mcclatchyd...
2,0,https://usat.ly/ https://medium.com/ https://m...
3,0,http://thehill.com/ https://www.healthcare.gov...
4,0,http://thndr.me/ http://www.healthcare.gov/ ht...


In [22]:
# build feature extractor
feature_extraction = TfidfVectorizer(min_df=0.1, sublinear_tf=True)
feature_extraction.fit(urls_data["urls"].values)
X = feature_extraction.transform(urls_data["urls"].values)

In [23]:
y= urls_data["label"].values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [24]:
# train classifier
clf = Pipeline([('vect', TfidfVectorizer(min_df=0.05, sublinear_tf=True)), ('svm', SVC(C=1, kernel='linear'))])

In [25]:
scores = cross_val_score(clf, urls_data["urls"].values, urls_data["label"].values, cv=3)
print ('Our algorithm\'s final score is:', sum(scores)/len(scores))

Our algorithm's final score is: 0.961572903498


In [26]:
vect = TfidfVectorizer()
X = vect.fit_transform(urls_data["urls"].values)

params = {"svc__C": [.01, .1, 1, 10, 100],
          "svc__kernel": ['linear'],
          "tfidf__min_df": [.01, .03, .05, .01, .02],
          "tfidf__sublinear_tf": [True]}

clf = Pipeline([("tfidf", TfidfVectorizer()),
                ("svc", SVC())])

estimator = GridSearchCV(clf, params, verbose=0, n_jobs=2)
estimator.fit(urls_data["urls"].values, urls_data["label"].values)

print(estimator.best_estimator_)
print(estimator.best_score_)

Pipeline(memory=None,
     steps=[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])
0.9704142011834319


In [27]:
 print(estimator.best_estimator_.steps)

[('tfidf', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)), ('svc', SVC(C=10, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))]


## Test on supporter accounts

I have gathered accounts from trump supporters. Let's see if our models manages to predict that these accounts are indeed republican accounts. The exercise is much harder than what we have done before, since these new accounts don't necessarily talk about politics a lot.

### First with mentions

In [141]:
mentions_test_data = pandas.read_csv("../data_collection/mentions_test_data.csv", )

In [142]:
clf_mentions = Pipeline([('vect', TfidfVectorizer(min_df=0.02, sublinear_tf=True)), ('svm', SVC(C=10, kernel='linear'))])
clf_mentions.fit(mentions_data["mentions"].values, mentions_data["label"].values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.02,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [143]:
prediction_mentions = clf_mentions.predict(mentions_test_data['mentions'].values)
print('ROC-AUC yields ' + str(roc_auc_score(mentions_test_data['label'], prediction_mentions)))

ROC-AUC yields 0.82420212766


### Now with hashtags

In [160]:
hashtags_test_data = pandas.read_csv("../data_collection/hashtags_test_data.csv", )
hashtags_test_data = hashtags_test_data[hashtags_test_data['hashtags'].notnull()]
print('Nombre d\'échantillons tests :', hashtags_test_data.shape[0])
hashtags_test_data.head()

Nombre d'échantillons tests : 87


Unnamed: 0,label,hashtags
0,1,fakenews
1,1,reallyfakenews fbi dirtyfbi miga hypocrite pre...
2,1,starwars tgif jobs gooddayny mustread bingewor...
3,1,mueller specialcounsel flynn foxnews foxnews b...
4,1,readabookday deepestbluest gamerecognizegame s...


In [161]:
clf_hashtags = Pipeline([('vect', TfidfVectorizer(min_df=0.01, sublinear_tf=True)), ('svc', SVC(C=1, kernel='linear'))])
clf_hashtags.fit(hashtags_data["hashtags"].values, hashtags_data["label"].values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [162]:
prediction_hashtags = clf_hashtags.predict(hashtags_test_data['hashtags'].values)
print('ROC-AUC yields ' + str(roc_auc_score(hashtags_test_data['label'], prediction_hashtags)))

ROC-AUC yields 0.816223404255


### Let's finish with urls

In [173]:
urls_test_data = pandas.read_csv("../data_collection/urls_test_data.csv", )
urls_test_data = urls_test_data[urls_test_data['urls'].notnull()]
print('Nombre d\'échantillons tests :', urls_test_data.shape[0])
urls_test_data.head()

Nombre d'échantillons tests : 87


Unnamed: 0,label,urls
0,1,http://thehill.com/ http://thehill.com/ https:...
1,1,https://nypost.com/ https://www.investors.com/...
2,1,https://www.21cf.com/ http://press.foxbusiness...
3,1,https://www.washingtontimes.com/ http://thehil...
4,1,http://tinyurl.com/ http://www.nydailynews.com...


In [174]:
clf_urls = Pipeline([('vect', TfidfVectorizer(min_df=0.01, sublinear_tf=True)), ('svc', SVC(C=10, kernel='linear'))])
clf_urls.fit(urls_data["urls"].values, urls_data["label"].values)

Pipeline(memory=None,
     steps=[('vect', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=0.01,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,...,
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [175]:
prediction_urls = clf_hashtags.predict(urls_test_data['urls'].values)
print('ROC-AUC yields ' + str(roc_auc_score(urls_test_data['label'], prediction_urls)))

ROC-AUC yields 0.626861702128


In [176]:
print(prediction_urls, prediction_mentions, prediction_hashtags)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 1 1 1 1 1 1 0 0 1 0 1 0 1 1 1 0 0 1 0 1 0 1
 1 1 1 1 0 1 1 1 1 1 1 1 1] [1 1 1 1 0 1 0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 1 1 1 0 1 1 1 1 1 1 0 1 1 1 1 1
 1 1 0 1 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 1 0 0 0] [1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0
 0 1 0 0 0 1 1 0 1 1 0 1 0]


In [177]:
voting_predictions = [1 if a + b + c > 1 else 0 for a, b, c in zip(prediction_urls, prediction_mentions, prediction_hashtags)]
print(len(voting_predictions), len(prediction_urls), len(prediction_mentions), len(prediction_hashtags))

87 87 87 87


In [187]:
print([a-b for a,b in zip(prediction_mentions,urls_test_data['label'].values)], sum([a-b for a,b in zip(prediction_mentions,urls_test_data['label'].values)]))
print([a-b for a,b in zip(prediction_hashtags,urls_test_data['label'].values)], sum([a-b for a,b in zip(prediction_hashtags,urls_test_data['label'].values)]))
print([a-b for a,b in zip(prediction_urls,urls_test_data['label'].values)], sum([a-b for a,b in zip(prediction_urls,urls_test_data['label'].values)]))
print([a + b + c - 3*d for a, b, c, d in zip(prediction_urls, prediction_mentions, prediction_hashtags, urls_test_data['label'].values)]
)
print([a-b for a,b in zip(voting_predictions,urls_test_data['label'].values)])
print(len(urls_test_data['label']))
print('ROC-AUC yields ' + str(roc_auc_score(urls_test_data['label'], voting_predictions)))

[0, 0, 0, 0, -1, 0, -1, 0, -1, -1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0] -10
[0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0] 11
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1] 28
[0, 0, 0, 0, -1, 0, -1, 0, -2, -1, 0, 0, -1, 0, 0, -1, -1, -1, 0, 0, 0, -1, 0, 0, -1, 0, 0, -1, 0, 0, 0, -1, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0, -1, -1, 0, 0, 3, 2, 1, 0, 0, 1, 1, 1, 2, 2, 1