# Tweets Ratio Prediction

This notebook is a preliminary analysis of tweets by US politicians. We will build a machine learning pipeline that determines the likely author of a tweet.

In [None]:
# The suspects ...
import glob as gb
import pandas as pd
import dask.dataframe as dd
import dask_ml.joblib as joblib
import dask_searchcv as dcv
from dask.distributed import Client
from sklearn.externals.joblib import parallel_backend
from sklearn.model_selection import train_test_split
from sklearn.model_selection import ParameterGrid
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from warnings import filterwarnings
filterwarnings('ignore')

In [None]:
# Retrieving the data
%time tweets = pd.concat([pd.read_csv(file, encoding='ISO-8859-1', usecols=['text', 'user']) for file in gb.glob('*.csv')])

In [None]:
# Pipeline: token counts -> tf-idf transform -> SGD-trained linear classifier.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])

# Parameters: candidate values per step, keyed as '<step name>__<argument>'.
parameters = {
    'vect__max_df': (0.5, 0.75, 1.0),
    'vect__ngram_range': ((1, 1), (1, 2)),
    'tfidf__use_idf': (True, False),
    'clf__alpha': (1e-2, 1e-3, 1e-4, 1e-5),
    'clf__max_iter': (10, 50, 80),
    'clf__penalty': ('l2', 'elasticnet'),
}

In [None]:
# Size of the full grid, i.e. how many parameter combinations a search will fit.
n_candidates = len(ParameterGrid(parameters))
print('Number of candidates: {}'.format(n_candidates))

In [None]:
# Using scikit
grid_search = GridSearchCV(pipeline, parameters, n_jobs=16)
%time grid_search.fit(tweets.text, tweets.user)

In [None]:
# Using dask
client = Client()
dgrid_search = dcv.GridSearchCV(pipeline, parameters)
%time dgrid_search.fit(tweets.text, tweets.user)

In [None]:
client = Client()
# NOTE(review): the dask grid-search cell also calls Client(); running both cells
# creates two clients -- consider reusing a single one.

# Rows used for this experiment. Text and labels are cut from the SAME slice so
# they always have the same length; previously only the text was truncated, which
# makes train_test_split fail as soon as the frame has more rows than the cap.
tweets_subset = tweets.head(10000000)

# Instantiating joblib in the backend
with parallel_backend('dask'):
    # Vectorizing: case-sensitive word counts
    vectorizer = CountVectorizer(analyzer='word', lowercase=False)
    features = vectorizer.fit_transform(tweets_subset.text)
    # Dense copy of the count matrix. The sampling cell further down calls
    # len(X_test) and compares test rows against features_nd, so the dense form
    # is kept here even though it is memory-hungry for a large vocabulary.
    features_nd = features.toarray()
    data_labels = tweets_subset.user

    # Splitting (default train/test proportions, fixed seed for repeatability)
    X_train, X_test, y_train, y_test = train_test_split(features_nd,
                                                        data_labels,
                                                        random_state=1234)

    # Fitting
    log_model = LogisticRegression()
    log_model.fit(X_train, y_train)

    # Predicting
    y_pred = log_model.predict(X_test)

In [None]:
# Testing
# Print a random window of consecutive test rows: the predicted author followed
# by the text of the tweet the row was built from.
import random

# Show up to 7 examples; clamping keeps randint() valid when the test set has
# fewer than 7 rows (an empty test set simply prints nothing).
n_examples = min(7, len(X_test))
j = random.randint(0, len(X_test) - n_examples)
for i in range(j, j + n_examples):
    # Prediction for THIS test row (y_pred[0] repeated the first prediction).
    print('Who said it:\t\t{}'.format(y_pred[i]))
    # Position of the first row of the full feature matrix whose counts equal
    # test row i. Vectorised comparison replaces rebuilding features_nd as nested
    # Python lists on every iteration. As before, tweets with identical token
    # counts resolve to the first such tweet.
    ind = (features_nd == X_test[i]).all(axis=1).argmax()
    # `ind` is a row position in features_nd, whose rows follow the order of
    # tweets.text, so look it up positionally rather than by index label.
    print('What they said:\t\t{}\n'.format(tweets.text.iloc[ind].strip()))

In [None]:
# Accuracy
# Share of test rows whose predicted author equals the true one, as a percentage.
accuracy_pct = accuracy_score(y_test, y_pred) * 100
print('Accuracy {}%'.format(accuracy_pct))