# Tweets Ratio Prediction

This notebook is a preliminary analysis of tweets by US politicians. We will build a machine learning pipeline that determines the likely author of a tweet.

In [1]:
# The suspects ...
import random
import glob as gb
import pandas as pd
import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score

### Senators

In [2]:
# Retrieving the data
%time senators = pd.read_csv('senators.csv', encoding='ISO-8859-1', usecols=['text', 'user']).head(10000)
assert senators.user.unique().shape[0] > 1

CPU times: user 775 ms, sys: 78.2 ms, total: 854 ms
Wall time: 855 ms


In [3]:
# Vectorizing
def vectorizer(data):
    '''Turn an iterable of documents into a dense bag-of-words count matrix.

    A new ``CountVectorizer`` (word analyzer, no lowercasing) is fitted on
    ``data`` and then discarded, so the fitted vocabulary is not returned.

    Returns the result of ``toarray()`` on the fitted document-term counts.
    '''
    bag_of_words = CountVectorizer(lowercase=False, analyzer='word')
    term_counts = bag_of_words.fit_transform(data)
    dense_counts = term_counts.toarray()
    return dense_counts

# Labels are the tweet authors; features are word counts of the tweet text.
data_labels = senators['user']
features_nd = vectorizer(senators['text'])

In [4]:
# Train-test split
def data_split(X, y, random_state=1234, **split_options):
    '''Split features and labels into train and test parts.

    X: feature matrix, one row per sample.
    y: labels aligned with the rows of ``X``.
    random_state: seed passed to ``train_test_split``; the default of 1234
        keeps the split the notebook has always used.
    split_options: any further keyword arguments for ``train_test_split``
        (for example ``test_size`` or ``stratify``).

    Returns whatever ``train_test_split`` returns for two inputs, which the
    notebook unpacks as ``X_train, X_test, y_train, y_test``.
    '''
    return train_test_split(X, y, random_state=random_state, **split_options)

# Hold out part of the senators data for evaluation.
X_train, X_test, y_train, y_test = data_split(X=features_nd, y=data_labels)

In [5]:
# Fitting
# Logistic regression with scikit-learn's default settings, trained on the senators split.
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

In [6]:
# Predicting
# One predicted author per held-out tweet.
y_pred = log_model.predict(X=X_test)

In [7]:
# Testing
def test_prediction(data):
    '''Testing predictions.'''
    value = random.randint(0, len(X_test)-7)
    for i in range(value,value+7):
        print('Who said it:\t\t{}'.format(y_pred[0]))
        ind=features_nd.tolist().index(X_test[i].tolist())
        print('What they said:\t\t{}\n'.format(data[ind].strip()))
        
# Spot-check a few predictions against the original senators tweets.
test_prediction(senators['text'])

Who said it:		amyklobuchar
What they said:		RT @BrookingsInst: Sen. @ChrisCoons explains how Putin launched an undeclared war on the international order: https://t.co/NE6fduoEz4 httpsäó_

Who said it:		amyklobuchar
What they said:		Happy Statehood Day, Slovenia! Spoke at @SLOinUSA celebration of 25 years as nation. Gained independence in 1991. https://t.co/h5dM4fSxG5

Who said it:		amyklobuchar
https://t.co/Gpk2JSMfzt

Who said it:		amyklobuchar
What they said:		RT @NHC_Atlantic: The flood threat from #Harvey is spreading farther east in Louisiana. Stay vigilant. @NWSNewOrleans @NWSLakeCharles @NWSWäó_

Who said it:		amyklobuchar
What they said:		Reminder of what Congress can do if we put policy before party- Important step for making foreign aid more effective https://t.co/MZgjfIM3iX

Who said it:		amyklobuchar
What they said:		RT @RefugeesIntl: It's often women who most brutally bear the brunt of conflict, violence, hunger - @ChrisCoons @usip #coonsatusip #SouthSuäó_

Who said it:		a

In [8]:
# Accuracy
# Round to two decimals, matching the Trump & Obama section, so float
# artefacts (e.g. 86.16000000000001) never reach the printed figure.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Accuracy: {}%'.format(accuracy_pct))

Accuracy: 86.16%


### Trump & Obama

In [9]:
# Retrieving the data
%time trump_obama = pd.concat([pd.read_csv(file, encoding='ISO-8859-1', usecols=['text', 'user']) for file in ['BarackObama.csv', 'realDonaldTrump.csv']])
assert trump_obama.user.unique().shape[0] > 1

CPU times: user 23.2 ms, sys: 2.92 ms, total: 26.2 ms
Wall time: 24.9 ms


In [10]:
# Vectorizing
# Rebinds features_nd / data_labels to the Trump & Obama data set.
data_labels = trump_obama['user']
features_nd = vectorizer(trump_obama['text'])

In [11]:
# Train-test split
# Same split helper as the senators section, applied to the Trump & Obama features.
X_train, X_test, y_train, y_test = data_split(X=features_nd, y=data_labels)

In [12]:
# Fitting
# A fresh model replaces the senators classifier bound to log_model.
log_model = LogisticRegression()
log_model.fit(X_train, y_train)

In [13]:
# Predicting
# One predicted author per held-out Trump/Obama tweet.
y_pred = log_model.predict(X=X_test)

In [14]:
# Accuracy
# Share of held-out tweets attributed to the right author, as a percentage.
accuracy_pct = round(accuracy_score(y_test, y_pred) * 100, 2)
print('Accuracy: {}%'.format(accuracy_pct))

Accuracy: 96.09%


### All Tweets

In [15]:
# Retrieving the data
%time tweets = dd.concat([dd.read_csv(file, encoding='ISO-8859-1', usecols=['text', 'user']) for file in gb.glob('*.csv')])

CPU times: user 151 ms, sys: 16.4 ms, total: 168 ms
Wall time: 45.2 ms
