# Predictive modeling for House of representatives data


In [1]:

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Varun\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
df = pd.read_csv(r"CleanDemRepHouse(1).csv")
df[:1]
len(df)

159631

In [3]:
df = df.dropna()
df.reset_index(drop=True)

Unnamed: 0,TweetId,CleanText,party
0,1.197257e+18,left impeachment hearing room yet another witn...,republican
1,1.197644e+18,great visiting bayside academy students capito...,republican
2,1.197557e+18,minute opening statement guy overheard phone c...,republican
3,1.197276e+18,big partisan show saw today impeachment hearin...,republican
4,1.196921e+18,democrats focused impeaching realdonaldtrump n...,republican
...,...,...,...
159621,1.142582e+18,sat community leaders milwaukee community brai...,democrat
159622,1.142214e+18,must remain aware informed trump administratio...,democrat
159623,1.142185e+18,victory wisconsin women threat abortion care t...,democrat
159624,1.141859e+18,fighting years ensure impoverished women great...,democrat


In [4]:
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.CleanText, df.party, test_size=0.30, random_state=42)
name = df.party.name



In [5]:
from textblob import TextBlob
import nltk
def split_into_lemmas(message):
    words = TextBlob(message).words
    return [word.lemma for word in words]

In [6]:


from sklearn.pipeline import Pipeline
text_clf_SGDClassifier = Pipeline([('vect', CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4), stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier()),
])


text_clf_SGDClassifier.fit(X_train, y_train)



Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer=<function split_into_lemmas at 0x063B2348>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 4), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000, n_iter_no_change=5, n_jobs=None

In [7]:


text_clf_LogisticRegression = Pipeline([('vect', CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4), stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression()),
])


text_clf_LogisticRegression.fit(X_train, y_train)





Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer=<function split_into_lemmas at 0x063B2348>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 4), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w...
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_inte

In [8]:
text_clf_MultinomialNB = Pipeline([('vect', CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4), stop_words='english',lowercase=True)),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB()),
])
text_clf_MultinomialNB.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer=<function split_into_lemmas at 0x063B2348>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(2, 4), preprocessor=None,
                                 stop_words='english', strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [9]:
#text_clf_SVC = Pipeline([('vect', CountVectorizer(analyzer=split_into_lemmas, ngram_range=(2,4), stop_words='english',lowercase=True)),
#                     ('tfidf', TfidfTransformer()),
#                     ('clf', SVC(kernel='linear')),
#])
#text_clf_SVC.fit(X_train, y_train)

In [11]:
#predicted_SVC = text_clf_SVC.predict(X_test)

In [12]:
predicted_MultinomialNB = text_clf_MultinomialNB.predict(X_test)

In [13]:
predicted_LogisticRegression = text_clf_LogisticRegression.predict(X_test)


In [14]:
predicted_SGDClassifier = text_clf_SGDClassifier.predict(X_test)

In [22]:
from sklearn.metrics import classification_report
#shows the accuracies listed for the data
#print (classification_report(y_test, predicted_SVC))
print (classification_report(y_test, predicted_MultinomialNB))
print (classification_report(y_test, predicted_LogisticRegression))
print (classification_report(y_test, predicted_SGDClassifier))
from sklearn import metrics 


              precision    recall  f1-score   support

    democrat       0.81      0.86      0.84     25098
  republican       0.84      0.78      0.81     22790

    accuracy                           0.82     47888
   macro avg       0.82      0.82      0.82     47888
weighted avg       0.82      0.82      0.82     47888

              precision    recall  f1-score   support

    democrat       0.82      0.82      0.82     25098
  republican       0.81      0.81      0.81     22790

    accuracy                           0.81     47888
   macro avg       0.81      0.81      0.81     47888
weighted avg       0.81      0.81      0.81     47888

              precision    recall  f1-score   support

    democrat       0.80      0.80      0.80     25098
  republican       0.77      0.77      0.77     22790

    accuracy                           0.79     47888
   macro avg       0.79      0.79      0.79     47888
weighted avg       0.79      0.79      0.79     47888



In [39]:
import pandas as pd
X_test.head()
X = X_test.head(10)
print(X)
print(text_clf_SGDClassifier.predict(X))
print(text_clf_LogisticRegression.predict(X))
print(text_clf_MultinomialNB.predict(X))

42770     want win vip tickets friday rally president tr...
62447      remember service men women sacrificed much great
63955     hurricane harvey boat heroes saved many lives ...
51346     certified community behavioral health clinic p...
152633    travel us virgin islands means cars ferries pl...
124699    congrats clarkcountynv schools earning top hon...
74818     great news us japan reached trade agreement be...
58391     east tennessee home great minor league basebal...
52533     thank mikekellypa robert_aderholt standing fai...
131327    voted vote voted still time please make use ri...
Name: CleanText, dtype: object
['democrat' 'republican' 'republican' 'democrat' 'democrat' 'democrat'
 'republican' 'republican' 'democrat' 'democrat']
['democrat' 'republican' 'republican' 'democrat' 'democrat' 'democrat'
 'republican' 'republican' 'republican' 'democrat']
['democrat' 'republican' 'republican' 'democrat' 'democrat' 'democrat'
 'republican' 'republican' 'republican' 'democrat']
