In [192]:
import sqlite3
import nltk
import random
import numpy as np
from collections import Counter, defaultdict
import os
import re
import emoji
import pandas as pd

from collections import Counter, defaultdict
from nltk.corpus import stopwords
from string import punctuation
from wordcloud import WordCloud 

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

In [193]:
import re
from string import punctuation

In [194]:
convention_db = sqlite3.connect("2020_Conventions.db")
convention_cur = convention_db.cursor()

# Part 1: Exploratory Naive Bayes

In [195]:
# Some punctuation variations
punctuation = set(punctuation) # speeds up comparison
tw_punct = punctuation - {"#"}

# Stopwords
sw = set(stopwords.words("english"))

def remove_stop(tokens) :
    stopwords_dict = Counter(sw)
    text = ' '.join([word for word in tokens if word not in stopwords_dict])
    return(text)

def remove_punctuation(text, punct_set=tw_punct) : 
    return("".join([ch for ch in text if ch not in punct_set]))

def tokenize(text) : 
    return re.split('\s+',text)

def prepare(text, pipeline) : 
    tokens = str(text)
    for transform in pipeline : 
        tokens = transform(tokens)
    return(tokens)

In [196]:
query_results = convention_cur.execute("SELECT * FROM Conventions")
pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]
convention_data = []
sublist=[]
# fill this list up with items that are themselves lists. The 
# first element in the sublist should be the cleaned and tokenized
# text in a single string. The second element should be the party. 

for row in query_results :
    record = list(row)
    cleaned_string = prepare(record[5],pipeline=pipeline)
    sublist = [cleaned_string, record[0]]
    convention_data.append(sublist)

In [197]:
random.choices(convention_data,k=10)

[['need president stands america one takes knee strong proud america safe america safe enemies safe war one who’s seen face war desires see many fellow americans already honored hallowed grounds arlington want peace must strong weakness provocative president trump’s strength kept us war joe biden won’t stand america donald trump november let’s stand president vote keep america great',
  'Republican'],
 ['ground beneath feet seated pain old new soil always find way grow together earlier summer city charleston removed statue honoring john c calhoun honored advocate slavery construction underway international african american museum gadsden’s wharf much like country whole stepping shadows past beginning lay groundwork future won’t easy succeed move forward together need president sees unifying people requirement job president understands true meaning community build trust humility',
  'Democratic'],
 ['singing tony 3602 last four years experienced failed leadership donald j trump chris f 

In [198]:
word_cutoff = 5

tokens = [w for t, p in convention_data for w in t.split()]

word_dist = nltk.FreqDist(tokens)

feature_words = set()

for word, count in word_dist.items() :
    if count > word_cutoff :
        feature_words.add(word)
        
print(f"With a word cutoff of {word_cutoff}, we have {len(feature_words)} as features in the model.")

With a word cutoff of 5, we have 2391 as features in the model.


In [199]:
# Searching through words to return a dict
def conv_features(text,fw) :
    ret_dict = dict()
    for key in fw:
        if re.search('\\b'+key+'\\b',text):
            ret_dict[key]=True
    return(ret_dict)

In [200]:
assert(len(feature_words)>0)
assert(conv_features("donald is the president",feature_words)==
       {'donald':True,'president':True})
assert(conv_features("people are american in america",feature_words)==
                     {'america':True,'american':True,"people":True})

In [201]:
featuresets = [(conv_features(text,feature_words), party) for (text, party) in convention_data]

In [202]:
random.seed(20220507)
random.shuffle(featuresets)
test_size = 500

In [203]:
test_set, train_set = featuresets[:test_size], featuresets[test_size:]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.492


In [204]:
# Printing most informative features
classifier.show_most_informative_features(25)

Most Informative Features
                   china = True           Republ : Democr =     27.1 : 1.0
                   votes = True           Democr : Republ =     23.8 : 1.0
             enforcement = True           Republ : Democr =     21.5 : 1.0
                 destroy = True           Republ : Democr =     19.2 : 1.0
                freedoms = True           Republ : Democr =     18.2 : 1.0
                 climate = True           Democr : Republ =     17.8 : 1.0
                supports = True           Republ : Democr =     17.1 : 1.0
                   crime = True           Republ : Democr =     16.1 : 1.0
                   media = True           Republ : Democr =     15.8 : 1.0
                 beliefs = True           Republ : Democr =     13.0 : 1.0
               countries = True           Republ : Democr =     13.0 : 1.0
                 defense = True           Republ : Democr =     13.0 : 1.0
                  defund = True           Republ : Democr =     13.0 : 1.0

# Write a little prose

### It's very interesting to see what features classify what party. It's very representative of what beliefs each party holds like Flag & Freedom for republicans and Voting & Climate for Democratics.

# My Observations

# Part 2: Classifying Congressional Tweets

In [205]:
cong_db = sqlite3.connect("congressional_data.db")
cong_cur = cong_db.cursor()

In [206]:
results = cong_cur.execute(
        '''
           SELECT DISTINCT 
                  cd.candidate, 
                  cd.party,
                  tw.tweet_text
           FROM candidate_data cd 
           INNER JOIN tweets tw ON cd.twitter_handle = tw.handle 
               AND cd.candidate == tw.candidate 
               AND cd.district == tw.district
           WHERE cd.party in ('Republican','Democratic') 
               AND tw.tweet_text NOT LIKE '%RT%'
        ''')

results = list(results) # Just to store it, since the query is time consuming

In [207]:
tweet_data = []
pipeline = [str.lower, remove_punctuation, tokenize, remove_stop]
sublist=[]
for row in results :
    record = list(row)
    cleaned_string = prepare(record,pipeline=pipeline)
    sublist = [cleaned_string, record[1]]
    tweet_data.append(sublist)

In [208]:
random.seed(20201014)

tweet_data_sample = random.choices(tweet_data,k=10)

In [209]:
word_cutoff = 5

tokens = [w for t, p in tweet_data for w in t.split()]

word_dist = nltk.FreqDist(tokens)

feature_words = set()

for word, count in word_dist.items() :
    if count > word_cutoff :
        feature_words.add(word)

print(f"With a word cutoff of {word_cutoff}, we have {len(feature_words)} as features in the model.")

With a word cutoff of 5, we have 55288 as features in the model.


In [210]:
# Searching through words to return a dict
def conv_features(text,fw) :
    ret_dict = dict()
    for key in fw:
        if re.search('\\b'+key+'\\b',text):
            ret_dict[key]=True
    return(ret_dict)

In [211]:
featuresets = conv_features(tweet,feature_words)

In [254]:
print(featuresets)

{'plans': True, 'amp': True, 'far': True, 'bdea': True, 'gun': True, 'smith': True, 'show': True, 'usurp': True, 'rights': True, 'constitutional': True, 'investigate': True, 'track': True, 'attendees': True, 'go': True, 'jason': True, 'admin': True, 'republican': True}


In [255]:
for tweet, party in tweet_data_sample :
    estimated_party = classifier.classify(featuresets)
    print(f"Here's our (cleaned) tweet: {tweet}")
    print(f"Actual party is {party} and our classifer says {estimated_party}.")
    print("")
    

Here's our (cleaned) tweet: jimmy panetta democratic bearlier today spoke house floor abt protecting health care women praised ppmarmonte work central coast httpstcowqgtrzt7vv
Actual party is Democratic and our classifer says Republican.

Here's our (cleaned) tweet: marcy kaptur democratic bgo tribe #rallytogether httpstco0nxutfl9l5
Actual party is Democratic and our classifer says Republican.

Here's our (cleaned) tweet: debbie wasserman schultz democratic bapparently trump thinks easy students overwhelmed crushing burden debt pay student loans #trumpbudget httpstcockyqo5t0qh
Actual party is Democratic and our classifer says Republican.

Here's our (cleaned) tweet: dave brat republican bwexe2x80x99re grateful first responders rescue personnel firefighters police volunteers working tirelessly keep people safe provide muchneeded help putting lives linennhttpstcoezpv0vmiz3
Actual party is Republican and our classifer says Republican.

Here's our (cleaned) tweet: antonio sabàto jr republi

In [371]:
# dictionary of counts by actual party and estimated party. 
# first key is actual, second is estimated
parties = ['Republican','Democratic']
results = defaultdict(lambda: defaultdict(int))

for p in parties :
    for p1 in parties :
        results[p][p1] = 0

num_to_score = 10000
random.shuffle(tweet_data)

for idx, tp in enumerate(tweet_data):
    tweet, party = tp
    for tp in tweet_data_sample :
        estimated_party = classifier.classify(featuresets)
    results[party][estimated_party] += 1

    if idx > num_to_score : 
        break

In [372]:
results

defaultdict(<function __main__.<lambda>()>,
            {'Republican': defaultdict(int,
                         {'Republican': 4295, 'Democratic': 0}),
             'Democratic': defaultdict(int,
                         {'Republican': 5707, 'Democratic': 0})})

# Reflections

I'm not sure why the parties aren't printing right in the scoring above, but the model did not perform all too well as predicted. However, when I printed out the scoring, it seems that democrats were scored from there tweets incorrectly more times than republicans than vice versa. Maybe the word choice used by republicans stands out more in tweets so the model can more easily identify these than other democratic tweets.