# Predictive modeling for Senate data


In [21]:


import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Varun\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [55]:
df = pd.read_csv(R"CleanDemRepSen.csv")
df[:1]


Unnamed: 0,TweetId,CleanText,party
0,1198374984902676480,approval rate republican party record thank,republican


In [56]:
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.CleanText, df.party, test_size=0.30, random_state=42)
name = df.party.name


In [57]:
from textblob import TextBlob
import nltk
def split_into_lemmas(message):
    words = TextBlob(message).words
    return [word.lemma for word in words]

In [58]:


from sklearn.pipeline import Pipeline
text_clf_SGDClassifier = Pipeline([('vect', CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4), stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()),
])
text_clf_SGDClassifier.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer=<function split_into_lemmas at 0x0B320390>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 4), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None

In [47]:


text_clf_LogisticRegression = Pipeline([('vect', CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4), stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression()),
])
text_clf_LogisticRegression.fit(X_train, y_train)





Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer=<function split_into_lemmas at 0x1020E300>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 4), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w...
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_inte

In [60]:
text_clf_MultinomialNB = Pipeline([('vect', CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4), stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])
text_clf_MultinomialNB.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer=<function split_into_lemmas at 0x0B320390>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 4), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [49]:
text_clf_SVC = Pipeline([('vect', CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4), stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SVC(kernel='linear')),
])
text_clf_SVC.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer=<function split_into_lemmas at 0x1020E300>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 4), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w...
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
                     decision_function_shape

In [50]:
predicted_SVC = text_clf_SVC.predict(X_test)

In [51]:
predicted_MultinomialNB = text_clf_MultinomialNB.predict(X_test)

In [52]:
predicted_LogisticRegression = text_clf_LogisticRegression.predict(X_test)

In [53]:
predicted_SGDClassifier = text_clf_SGDClassifier.predict(X_test)

In [54]:
from sklearn.metrics import classification_report
#shows the accuracies listed for the data
print (classification_report(y_test, predicted_SVC))
print (classification_report(y_test, predicted_MultinomialNB))
print (classification_report(y_test, predicted_LogisticRegression))
print (classification_report(y_test, predicted_SGDClassifier))

              precision    recall  f1-score   support

    democrat       0.20      0.15      0.17      6421
  republican       0.41      0.50      0.45      7658

    accuracy                           0.34     14079
   macro avg       0.31      0.33      0.31     14079
weighted avg       0.32      0.34      0.33     14079

              precision    recall  f1-score   support

    democrat       0.11      0.06      0.08      6421
  republican       0.44      0.62      0.52      7658

    accuracy                           0.37     14079
   macro avg       0.28      0.34      0.30     14079
weighted avg       0.29      0.37      0.32     14079

              precision    recall  f1-score   support

    democrat       0.22      0.16      0.19      6421
  republican       0.42      0.51      0.46      7658

    accuracy                           0.35     14079
   macro avg       0.32      0.33      0.32     14079
weighted avg       0.33      0.35      0.33     14079

              preci