In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Both hashtag dumps are read with the same options, so keep them in one place.
# lineterminator='\n' makes the parser split rows on '\n' only.
# NOTE(review): parse_dates=True asks pandas to try parsing the *index* as dates;
# no index_col is given and 'created_at' is parsed by hand further down, so this
# flag probably has no effect -- confirm before relying on it.
read_opts = dict(lineterminator='\n', parse_dates=True)

biden = pd.read_csv('/kaggle/input/us-election-2020-tweets/hashtag_joebiden.csv', **read_opts)
trump = pd.read_csv('/kaggle/input/us-election-2020-tweets/hashtag_donaldtrump.csv', **read_opts)

In this project, we're interested in seeing if tweets can predict election results.

We're only interested in the tweets posted before election day (November 3, 2020).

In [None]:
#turn each 'created_at' string into a calendar date and store it in a new 'datetime' column
import datetime

datetime_obj = [
    datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S').date()
    for stamp in biden['created_at']
]
biden['datetime'] = datetime_obj

#keep only tweets dated strictly before election day (11/3)
election_day = datetime.date(2020, 11, 3)
posted_before_election = biden['datetime'] < election_day
biden = biden[posted_before_election]
biden.head()

We're also only interested in tweets in the U.S.

In [None]:
#keep only the rows whose 'country' is exactly 'United States of America'
in_us = biden['country'] == 'United States of America'
biden = biden.loc[in_us]
biden.head()

We're going to use Vader to get the polarity score of each tweet.

In [None]:
#vader sentiment analysis
# Load the lexicon: tab-separated with no header row. Column 0 (the token) becomes
# the index, column 1 is kept as 'polarity', columns 2 and 3 are dropped.
vader = pd.read_csv('/kaggle/input/vader-sentiment/vader_lexicon.txt', sep = '\t', index_col = 0, 
                   header = None).drop([2,3], axis = 1).rename(columns={1: 'polarity'})

#Get rid of all punctuation
# Inside a character class '(' and ')' are literal characters, so the previous
# pattern r'[^(\w)(\s)]' never removed parentheses. r'[^\w\s]' matches every
# character that is neither a word character nor whitespace.
punct_re = r'[^\w\s]'
biden['no_punc'] = biden['tweet'].str.lower().replace(punct_re, ' ', regex = True) 

#Make new dataframe with one row per (tweet, word): 'num' is the word's position
#in the tweet and 'index' repeats the tweet's row label so it survives the merge
tidy_format = pd.DataFrame(biden['no_punc'].str.split(expand = True).stack()).reset_index(level = 1).rename(columns = {'level_1' : 'num', 0 : 'word'})
tidy_format['index'] = tidy_format.index

#make polarity column
# Left-join each word to its lexicon score (words missing from the lexicon get NaN).
word_scores = tidy_format.merge(vader, how = 'left', left_on = 'word', right_on = 0)
# Select 'polarity' BEFORE summing so the string columns are never aggregated;
# sum() skips NaN, so unknown words contribute 0. reindex gives tweets that
# produced no tokens at all a score of 0 instead of NaN.
biden['polarity_vader'] = word_scores.groupby('index')['polarity'].sum().reindex(biden.index, fill_value = 0)

We're also going to use TextBlob to get the polarity score of each tweet.

In [None]:
#score every tweet with TextBlob and keep its sentiment polarity
from textblob import TextBlob

each_polarity = [TextBlob(text).sentiment.polarity for text in biden['tweet']]

#the list is in row order, so it can be assigned directly as a new column
biden['polarity_textblob'] = each_polarity
biden.head()

Average the 2 polarity scores together.

In [None]:
# Mean of the two scores, computed row by row.
# NOTE(review): 'polarity_vader' is a sum of per-word lexicon scores, so it is
# presumably on a wider scale than the TextBlob polarity -- confirm that an
# unweighted average is what is wanted.
polarity_total = biden['polarity_vader'].add(biden['polarity_textblob'])
biden['average_polarity'] = polarity_total.div(2)
biden.head()

Drop all tweets with an average polarity score of 0 which indicates that the tweet is neutral.

In [None]:
#remove tweets whose average polarity is exactly 0, then renumber the rows;
#reset_index() keeps the previous row labels in a new 'index' column
is_neutral = biden['average_polarity'] == 0.0
biden = biden[~is_neutral].reset_index()
biden.head()

Create a y_label, which is a column containing 0 (indicating the tweet supports Trump) or 1 (indicating the tweet supports Biden). This label is created based on the average polarity score. Since this dataframe contains tweets about Biden, if the polarity score is positive we assume the tweet supports Biden; otherwise we assume it supports Trump.

In [None]:
#label each tweet from its average polarity:
#  1 -> polarity above 0 (tweet treated as supporting biden)
#  0 -> anything else    (tweet treated as supporting trump)
for_biden = [1 if polarity > 0 else 0 for polarity in biden['average_polarity']]

biden['biden_or_trump'] = for_biden
biden.head()

Drop unnecessary columns.

In [None]:
#columns that are no longer needed once the label exists; 'index' is the old
#row label added by reset_index() and 'datetime' is the helper date column
drop_cols = [
    'index', 'created_at', 'tweet_id', 'likes', 'retweet_count', 'source', 'user_id', 'user_name',
    'user_screen_name', 'user_description', 'user_join_date', 'user_followers_count', 'user_location',
    'lat', 'long', 'city', 'country', 'continent', 'state_code', 'collected_at', 'datetime',
]
biden = biden.drop(columns = drop_cols)
biden.head()

Repeat these steps for Trump's dataframe.

Get tweets before November 3, 2020.

In [None]:
#repeat for trump: turn each 'created_at' string into a calendar date
#(relies on the `datetime` module imported in the earlier Biden cell)
datetime_obj_trump = [
    datetime.datetime.strptime(stamp, '%Y-%m-%d %H:%M:%S').date()
    for stamp in trump['created_at']
]

#new column that holds the parsed dates
trump['datetime'] = datetime_obj_trump

#keep only tweets dated strictly before November 3, 2020 (election day itself is excluded)
election_day = datetime.date(2020, 11, 3)
posted_before_election = trump['datetime'] < election_day
trump = trump[posted_before_election]
trump.head()

Get tweets in the US only.

In [None]:
#keep only the rows whose 'country' is exactly 'United States of America'
in_us = trump['country'] == 'United States of America'
trump = trump.loc[in_us]
trump.head()

Get Vader polarity scores for each tweet.

In [None]:
#lower-case the tweets and replace punctuation with spaces
#(`punct_re` and `vader` come from the Biden Vader cell above)
trump['no_punc'] = trump['tweet'].str.lower().replace(punct_re, ' ', regex = True) 

#Make new dataframe with one row per (tweet, word): 'num' is the word's position
#in the tweet and 'index' repeats the tweet's row label so it survives the merge
tidy_format = pd.DataFrame(trump['no_punc'].str.split(expand = True).stack()).reset_index(level = 1).rename(columns = {'level_1' : 'num', 0 : 'word'})
tidy_format['index'] = tidy_format.index

#make polarity column
# Left-join each word to its lexicon score (words missing from the lexicon get NaN).
word_scores = tidy_format.merge(vader, how = 'left', left_on = 'word', right_on = 0)
# Select 'polarity' BEFORE summing so the string columns are never aggregated;
# sum() skips NaN, so unknown words contribute 0. reindex gives tweets that
# produced no tokens at all a score of 0 instead of NaN.
trump['polarity_vader'] = word_scores.groupby('index')['polarity'].sum().reindex(trump.index, fill_value = 0)

Get Textblob polarity score for each tweet.

In [None]:
#score every tweet with TextBlob and keep its sentiment polarity
from textblob import TextBlob

each_polarity_trump = [TextBlob(text).sentiment.polarity for text in trump['tweet']]

#the list is in row order, so it can be assigned directly as a new column
trump['polarity_textblob'] = each_polarity_trump
trump.head()

Average Textblob and Vader polarity scores.

In [None]:
# Mean of the two scores, computed row by row.
polarity_total = trump['polarity_vader'].add(trump['polarity_textblob'])
trump['average_polarity'] = polarity_total.div(2)
trump.head()

Remove tweets with an average polarity score of 0.0.

In [None]:
#remove tweets whose average polarity is exactly 0, then renumber the rows;
#reset_index() keeps the previous row labels in a new 'index' column
is_neutral = trump['average_polarity'] == 0.0
trump = trump[~is_neutral].reset_index()
trump.head()

Create y label with 0 indicating they support Trump and 1 indicating they support Biden.

In [None]:
#label each tweet from its average polarity, using the same coding as the biden frame:
#  0 -> polarity above 0 (tweet treated as supporting trump)
#  1 -> anything else    (tweet treated as not supporting trump, i.e. biden)
for_trump = [0 if polarity > 0 else 1 for polarity in trump['average_polarity']]

trump['biden_or_trump'] = for_trump
trump.head()

In [None]:
# Remove the same columns that were dropped from the Biden frame (`drop_cols`).
trump = trump.drop(columns = drop_cols)
trump.head()

In [None]:
# Stack the two labelled frames into one with a fresh 0..n-1 index.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported equivalent.
biden_and_trump = pd.concat([biden, trump], ignore_index = True)
biden_and_trump.head()

Create a Wordcloud for all tweets without punctuation.

In [None]:
# Concatenate every punctuation-free tweet into one string, each followed by a space.
all_words = ''.join(tweet + ' ' for tweet in biden_and_trump['no_punc'])

In [None]:
#build a word cloud of the tweet text to see which non-stop words appear most often
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

stopwords = set(STOPWORDS)

# Configure the cloud first, then feed it the concatenated tweet text.
cloud_builder = WordCloud(
    width = 800,
    height = 800,
    background_color = 'white',
    stopwords = stopwords,
    min_font_size = 10,
)
wordcloud = cloud_builder.generate(all_words)

# Draw the cloud as an image with no axes and no padding around it.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

Create a Wordcloud for all hashtags that were used in all the tweets.

In [None]:
#collect every hashtag from every tweet into one space-separated string
import re

# A '#' followed by one or more ASCII letters.
hashtag = r'#[a-zA-Z]{1,}'

all_hashtags = ''.join(
    tag + ' '
    for tweet in biden_and_trump['tweet']
    for tag in re.findall(hashtag, tweet)
)

In [None]:
# Same word-cloud settings as before, this time fed with the extracted hashtags.
cloud_builder = WordCloud(
    width = 800,
    height = 800,
    background_color = 'white',
    stopwords = stopwords,
    min_font_size = 10,
)
wordcloud = cloud_builder.generate(all_hashtags)

# Draw the cloud as an image with no axes and no padding around it.
plt.figure(figsize = (8, 8), facecolor = None)
plt.imshow(wordcloud)
plt.axis("off")
plt.tight_layout(pad = 0)

plt.show()

Split the data into train and test sets. The test set contains only tweets from Pennsylvania, Michigan, and Wisconsin; all remaining tweets form the train set.

In [None]:
#split on the 'state' column: the three swing states are held out as test data
swing_states = ['Michigan', 'Pennsylvania', 'Wisconsin']
in_swing_state = biden_and_trump['state'].isin(swing_states)

#every tweet outside the swing states (train)
train = biden_and_trump[~in_swing_state].copy()

#every tweet from a swing state (test data)
test = biden_and_trump[in_swing_state].copy()


In [None]:
#show the 10 training tweets with the lowest average polarity
print('Most negative tweets:')
most_negative = train.sort_values('average_polarity').head(10)
for t in most_negative['tweet']:
    print('\n  ', t)

In [None]:
#show the 10 training tweets with the highest average polarity
print('Most positive tweets:')
most_positive = train.sort_values('average_polarity', ascending = False).head(10)
for t in most_positive['tweet']:
    print('\n  ', t)

Define a function that creates a matrix where each row represents a tweet and each column represents a word. Each entry is 1 if the tweet contains that word and 0 otherwise, i.e. a binary bag-of-words encoding that uses the chosen words as features.

In [None]:
def words_in_texts(words, texts):
    """Build a 0/1 indicator matrix of which words occur in which texts.

    Parameters
    ----------
    words : sequence of str
        Substrings to look for. Each one is matched literally (regex = False)
        and case-sensitively.
    texts : pandas.Series of str
        One text (tweet) per row.

    Returns
    -------
    numpy.ndarray of int, shape (len(texts), len(words))
        Entry [i, j] is 1 if words[j] occurs in the i-th text and 0 otherwise.
        Missing texts (NaN/None) are treated as containing no word.
    """
    # na = False turns missing texts into False instead of NaN, which would
    # otherwise make the integer conversion below fail.
    columns = [texts.str.contains(word, regex = False, na = False).values for word in words]

    # With no words there is nothing to stack; still return one (empty) row per text
    # so the result always has shape (n_texts, n_words).
    if not columns:
        return np.zeros((len(texts), 0), dtype = int)

    # columns is (n_words, n_texts); transpose so that each row is a text.
    indicator_array = np.array(columns).T.astype(int)
    return indicator_array

Select words/hashtags from above WordCloud as features. 

In [None]:
# Hand-picked words/hashtags used as binary features. words_in_texts does a
# case-sensitive substring match, so the capitalisation below matters.
# Each entry appears once: 'racist' used to be listed twice, which produced two
# identical feature columns.
words = ['#VoteBidenHarris2020', '#VoteBidenHarrisToSaveAmerica', 'empathetic', 'wise', 'respectful', 'intuitive', 
         'Devotion', 'Compassion', '#voteTrumpPence', '#tEAMtRUMP', '#VoteTrump', 'catastrophe', 'apocalyptic', 
         'joke', 'Lies', 'WHINE', 'SHAME', 'Failure', 'evil', 'racist', 'homophobic', 'corrupt', 'sexist', 'god', 
         'bless', 'blessed', 'jesus', 'lord', 'christ', 'sin', 'worship', 'preach', 'idiot', 'creepy', 'horrible', 
         'strange', 'racism', 'protesters', 'transgender', 'democratic', '#VoteHimOut', '#TrumpIsLosing', 
         '#TrumpMeltdown', '#VoteRedToSaveAmerica', '#VoteRed', '#TrumpSupporters', '#HunterBidenEmails', 
         '#HunterBiden', '#BidenCrimeFamily', '#VoteBlue', '#BidenHarrisToSaveAmerica', '#RepublicansForBiden', 
         '#VoteResponsibly', '#CountEveryVote', '#RiggedElection']

# Features come from the raw tweet text; the target is the polarity-derived label.
X_train = words_in_texts(words, train['tweet'])
Y_train = train['biden_or_trump']

X_test = words_in_texts(words, test['tweet'])
Y_test = test['biden_or_trump']

Train models and predict on the y_label.

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic regression on the word-indicator features (fit() returns the
# fitted estimator, so construction and fitting can be chained).
log_model = LogisticRegression(max_iter = 1000).fit(X_train, Y_train)

# Accuracy on the same data the model was trained on.
training_accuracy = log_model.score(X_train, Y_train)
print("Training Accuracy: ", training_accuracy)

In [None]:
# Accuracy of the fitted logistic regression on the held-out swing-state tweets.
test_accuracy = log_model.score(X_test, Y_test)
print("Test Accuracy: ", test_accuracy)

In [None]:
from sklearn.neural_network import MLPClassifier

# Multi-layer perceptron with hidden_layer_sizes=(50, 10) and logistic activation;
# random_state is fixed so the fit is repeatable.
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 10),
    activation='logistic',
    random_state=42,
    max_iter=500,
    solver='adam',
).fit(X_train, Y_train)

# Training accuracy, displayed as the cell's output.
mlp.score(X_train, Y_train)

In [None]:
# Accuracy of the fitted MLP on the held-out swing-state tweets.
test_accuracy_mlp = mlp.score(X_test, Y_test)
print("Test Accuracy: ", test_accuracy_mlp)

As you can see, our models' accuracy isn't good. Instead, we will use our y label directly to determine which candidate each swing state favors.

In [None]:
# One frame per swing state, selected on the 'state' column.
state_col = biden_and_trump['state']

michigan = biden_and_trump[state_col == 'Michigan']
pennsylvania = biden_and_trump[state_col == 'Pennsylvania']
wisconsin = biden_and_trump[state_col == 'Wisconsin']

In [None]:
# Share of Michigan tweets labelled 1 (pro-Biden).
# .get(1, 0) avoids a KeyError when no tweet carries label 1, and .sum() gives the
# total tweet count even when one of the two labels is absent.
m_tf = michigan['biden_or_trump'].value_counts()
m_biden = m_tf.get(1, 0) / m_tf.sum()
print(m_biden)

In [None]:
# Share of Pennsylvania tweets labelled 1 (pro-Biden).
# .get(1, 0) avoids a KeyError when no tweet carries label 1, and .sum() gives the
# total tweet count even when one of the two labels is absent.
p_tf = pennsylvania['biden_or_trump'].value_counts()
p_biden = p_tf.get(1, 0) / p_tf.sum()
print(p_biden)

In [None]:
# Share of Wisconsin tweets labelled 1 (pro-Biden).
# .get(1, 0) avoids a KeyError when no tweet carries label 1, and .sum() gives the
# total tweet count even when one of the two labels is absent.
w_tf = wisconsin['biden_or_trump'].value_counts()
w_biden = w_tf.get(1, 0) / w_tf.sum()
print(w_biden)

It seems that there are more tweets that favor Biden over Trump for all three states. When compared to the actual election result, the percentages do align. All three states did favor Biden over Trump by a slight margin.