
# Linear SVM performed on Senate Data

In [1]:

import pandas as pd


In [2]:
data = pd.read_csv(R"CleanDemRepSen.csv")

In [3]:
data.head(5)

Unnamed: 0,TweetId,CleanText,party
0,1198374984902676480,approval rate republican party record thank,republican
1,1198370028153950209,thank kevin nice words https,republican
2,1198334870738604033,sondland said trump told none move needle anyo...,republican
3,1198328606684012544,direct evidence president trump anything impea...,republican
4,1198328568838840322,new york post editorial worse pointless hours ...,republican


In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train,X_test,y_train,y_test = train_test_split(data,list(data['party'].values),test_size=0.3)

In [6]:
print(X_train.shape,X_test.shape)
print(len(data))

X_train.head()


(32850, 3) (14079, 3)
46929


Unnamed: 0,TweetId,CleanText,party
4724,1177631737435709440,victor orban embraced autocracy antidemocratic...,republican
4548,1192157273784803328,glad see district court blocked president trum...,republican
31913,1179515886257881088,get distracted republicans going attack whistl...,democrat
23368,1149048536635457536,really excited bipartisan bill supporting men ...,republican
20549,1187461281974050821,taxpayers foot bill lavish travel federal offi...,republican


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer# Create feature vectors
vectorizer = TfidfVectorizer(min_df = 5,
                             max_df = 0.8,
                             sublinear_tf = True,
                             use_idf = True)
train_vectors = vectorizer.fit_transform(X_train['CleanText'])
test_vectors = vectorizer.transform(X_test['CleanText'])

In [8]:
import time
from sklearn import svm
from sklearn.metrics import classification_report# Perform classification with SVM, kernel=linear
classifier_linear = svm.SVC(kernel='linear')
t0 = time.time()
classifier_linear.fit(train_vectors, X_train['party'])
t1 = time.time()
prediction_linear = classifier_linear.predict(test_vectors)
t2 = time.time()
time_linear_train = t1-t0 
time_linear_predict = t2-t1# results
print("Training time: %fs; Prediction time: %fs" % (time_linear_train, time_linear_predict))
report = classification_report(X_test['party'], prediction_linear, output_dict=True)
print('Demorcrat: ', report['democrat'])
print('Republican: ', report['republican'])

Training time: 614.949050s; Prediction time: 163.439055s
Demorcrat:  {'precision': 0.25217207035388856, 'recall': 0.1854160174509193, 'f1-score': 0.21370207416719045, 'support': 6418}
Republican:  {'precision': 0.44145299145299144, 'recall': 0.5393551755645477, 'f1-score': 0.485517889665707, 'support': 7661}


In [11]:
print(classification_report(y_test, prediction_linear))



              precision    recall  f1-score   support

    democrat       0.25      0.19      0.21      6418
  republican       0.44      0.54      0.49      7661

    accuracy                           0.38     14079
   macro avg       0.35      0.36      0.35     14079
weighted avg       0.36      0.38      0.36     14079

['republican']


In [21]:
tweet = """prisons should be for profit"""
review_vector = vectorizer.transform([tweet])
print(classifier_linear.predict(review_vector))


['republican']


In [22]:
tweet = """i hate democrats"""
review_vector = vectorizer.transform([tweet])
print(classifier_linear.predict(review_vector))


['republican']
