In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw #JoeBiden and #DonaldTrump tweet exports, in that order.
# lineterminator='\n' makes the parser split rows on '\n' only
# (presumably because tweet text can contain other line-break characters -- TODO confirm).
biden_df, trump_df = (
    pd.read_csv(csv_name, lineterminator='\n')
    for csv_name in ('hashtag_joebiden.csv', 'hashtag_donaldtrump.csv')
)

In [3]:
# Convert the 'created_at' timestamp strings into datetime.date objects
import datetime
def clean_date(df):
    """Add a ``datetime`` column holding the calendar date of each row.

    Parses ``df['created_at']`` (strings formatted ``YYYY-MM-DD HH:MM:SS``)
    and stores the date part of each timestamp, as ``datetime.date`` objects,
    in a new ``df['datetime']`` column.

    The whole column is parsed with a single vectorized ``pd.to_datetime``
    call rather than a Python-level ``strptime`` loop. Missing values are
    carried through as ``NaT`` instead of aborting the parse.

    Args:
        df: DataFrame with a ``created_at`` column. It is modified in place.

    Returns:
        None.

    Raises:
        ValueError: if a non-missing value does not match the expected format.
    """
    parsed = pd.to_datetime(df['created_at'], format='%Y-%m-%d %H:%M:%S')
    # .dt.date drops the time of day and yields plain datetime.date objects,
    # so the column can be compared directly against datetime.date(...) values.
    df['datetime'] = parsed.dt.date
clean_date(biden_df)
clean_date(trump_df)

# Keep only tweets dated on or before election day (November 3, 2020, inclusive).
# The cutoff is named once so both frames are guaranteed to use the same date.
ELECTION_DAY = datetime.date(2020, 11, 3)
biden_df = biden_df[biden_df['datetime'] <= ELECTION_DAY]
trump_df = trump_df[trump_df['datetime'] <= ELECTION_DAY]

In [4]:
# Keep only tweets located in the United States, and drop the US territories
# Guam and Puerto Rico. Rows whose 'state' is missing are kept, because a
# missing value is not one of the excluded names.
#
# The trailing .copy() makes each result an independent frame: later cells add
# new columns to biden_df / trump_df, and doing that on a filtered slice
# triggers pandas' SettingWithCopyWarning.
biden_df = biden_df[
    (biden_df['country'] == 'United States of America')
    & ~biden_df['state'].isin(['Guam', 'Puerto Rico'])
].copy()

trump_df = trump_df[
    (trump_df['country'] == 'United States of America')
    & ~trump_df['state'].isin(['Guam', 'Puerto Rico'])
].copy()

In [5]:
#install VADER
!pip install vaderSentiment



In [6]:
# VADER sentiment analysis
# We use the 'compound' score returned by the polarity_scores() method
# of SentimentIntensityAnalyzer.
# This compound score is computed by summing the valence scores
# of each word in the lexicon, adjusted according to the rules,
# and then normalized to be between -1 (most extreme negative)
# and +1 (most extreme positive).

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
def vader_polarity(tweets):
    """Return the VADER 'compound' score of every tweet, in input order.

    Args:
        tweets: iterable of tweet texts.

    Returns:
        list with one 'compound' value per tweet, taken from
        SentimentIntensityAnalyzer.polarity_scores().
    """
    # One analyzer instance is created and reused for all tweets.
    scorer = SentimentIntensityAnalyzer()
    return [scorer.polarity_scores(text)['compound'] for text in tweets]

In [7]:
# Score every tweet with VADER and store the result in a new
# 'vader_polarity' column, first for the Biden frame, then for the Trump frame.
for candidate_df in (biden_df, trump_df):
    candidate_df['vader_polarity'] = vader_polarity(candidate_df['tweet'])
del candidate_df  # do not leave the loop alias behind in the notebook namespace

In [8]:
# install TextBlob
!pip install -U textblob

Requirement already up-to-date: textblob in /Users/aisetoyama/opt/anaconda3/lib/python3.7/site-packages (0.15.3)


In [9]:
# The sentiment property returns a namedtuple of the form Sentiment(polarity, subjectivity). 
# The polarity score is a float within the range [-1.0, 1.0]. 
# The subjectivity is a float within the range [0.0, 1.0] where 0.0 is very objective and 1.0 is very subjective.

from textblob import TextBlob
def textblob_polarity(tweets):
    """Return the TextBlob polarity of every tweet, in input order.

    Args:
        tweets: iterable of tweet texts.

    Returns:
        list with one value per tweet, read from
        TextBlob(tweet).sentiment.polarity.
    """
    return [TextBlob(text).sentiment.polarity for text in tweets]

In [10]:
# Score every tweet with TextBlob and store the result in a new
# 'textblob_polarity' column, first for the Biden frame, then for the Trump frame.
for candidate_df in (biden_df, trump_df):
    candidate_df['textblob_polarity'] = textblob_polarity(candidate_df['tweet'])
del candidate_df  # do not leave the loop alias behind in the notebook namespace

In [11]:
# Check the observed range of each polarity score.
# Series.min()/.max() are vectorized and skip missing values, whereas the
# builtin min()/max() loop in Python and give order-dependent results if a
# NaN is present.
vader_biden = biden_df['vader_polarity']
vader_trump = trump_df['vader_polarity']
print("Biden VADER score, min:", vader_biden.min(), ", max: ", vader_biden.max())
print("trump VADER score, min:", vader_trump.min(), ", max: ", vader_trump.max())

textblob_biden = biden_df['textblob_polarity']
textblob_trump = trump_df['textblob_polarity']
print("Biden textblob score, min:", textblob_biden.min(), ", max: ", textblob_biden.max())
print("trump textblob score, min:", textblob_trump.min(), ", max: ", textblob_trump.max())



Biden VADER score, min: -0.995 , max:  0.9996
trump VADER score, min: -0.9987 , max:  0.9999
Biden textblob score, min: -1.0 , max:  1.0
trump textblob score, min: -1.0 , max:  1.0


In [12]:
# Combine the two sentiment scores: 'avg_polarity' is the row-wise mean of
# the VADER and TextBlob polarity columns.
biden_df['avg_polarity'] = (
    biden_df.loc[:, ['vader_polarity', 'textblob_polarity']].mean(axis='columns')
)
trump_df['avg_polarity'] = (
    trump_df.loc[:, ['vader_polarity', 'textblob_polarity']].mean(axis='columns')
)


In [13]:
# Total number of Biden tweets kept, and how many of them have an averaged
# polarity of exactly 0.
print('#joebiden tweet count: ', biden_df.shape[0])
print('Polarity score 0 count: ', int((biden_df['avg_polarity'] == 0).sum()))

#joebiden tweet count:  96528
Polarity score 0 count:  25294


In [14]:
# Total number of Trump tweets kept, and how many of them have an averaged
# polarity of exactly 0. The label spells the hashtag as '#donaldtrump',
# matching the source file name hashtag_donaldtrump.csv.
print('#donaldtrump tweet count: ', len(trump_df))
print('Polarity score 0 count: ', len(trump_df[trump_df['avg_polarity']==0]))

#donaltrump tweet count:  125783
Polarity score 0 count:  25833


In [15]:
# Trim both frames down to the features used downstream:
#   user_id, tweet, state, likes, retweet_count, avg_polarity
# The same column selection is applied to the Biden frame and then the Trump frame.
clean_biden_df, clean_trump_df = (
    candidate_df[['user_id', 'tweet', 'state', 'likes', 'retweet_count', 'avg_polarity']]
    for candidate_df in (biden_df, trump_df)
)


In [16]:
# (rows, columns) of the trimmed Biden frame; 6 columns are expected after the feature selection.
clean_biden_df.shape

(96528, 6)

In [17]:
# Display the trimmed Biden frame (rendered truncated, with '...' for the omitted rows).
clean_biden_df

Unnamed: 0,user_id,tweet,state,likes,retweet_count,avg_polarity
0,3.606665e+08,#Elecciones2020 | En #Florida: #JoeBiden dice ...,Florida,0.0,0.0,0.000000
2,3.494182e+09,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...,Montana,0.0,0.0,0.000000
4,1.032807e+18,#censorship #HunterBiden #Biden #BidenEmails #...,California,1.0,0.0,0.000000
6,1.994033e+07,"In 2020, #NYPost is being #censorship #CENSORE...",Illinois,0.0,0.0,-0.339855
11,9.607387e+17,"FBI Allegedly Obtained Hunter Biden Computer, ...",Kentucky,0.0,0.0,-0.050000
...,...,...,...,...,...,...
352651,1.237748e+18,Kamala Is My Sorority Sister Shirt\nBuy Now: h...,New York,0.0,0.0,0.000000
352653,9.965851e+17,#VoteBlueToEndTheNightmare \n#BidenHarrisToSav...,Arizona,0.0,0.0,0.000000
352662,1.208948e+09,God bless #JoeBiden who is fighting the good f...,Massachusetts,0.0,0.0,0.425950
352663,4.053900e+09,Here. We. Go. #ElectionNight \n\n🔴 #Trump 0\n🔵...,Minnesota,0.0,0.0,0.000000


In [18]:
# (rows, columns) of the trimmed Trump frame; 6 columns are expected after the feature selection.
clean_trump_df.shape

(125783, 6)

In [19]:
# Display the trimmed Trump frame (rendered truncated, with '...' for the omitted rows).
clean_trump_df

Unnamed: 0,user_id,tweet,state,likes,retweet_count,avg_polarity
0,3.606665e+08,#Elecciones2020 | En #Florida: #JoeBiden dice ...,Florida,0.0,0.0,0.000000
2,8.436472e+06,"#Trump: As a student I used to hear for years,...",Oregon,2.0,1.0,0.461917
4,4.741380e+07,You get a tie! And you get a tie! #Trump ‘s ra...,District of Columbia,4.0,3.0,0.000000
5,1.138416e+09,@CLady62 Her 15 minutes were over long time ag...,California,2.0,0.0,-0.323204
7,9.007611e+17,@DeeviousDenise @realDonaldTrump @nypost There...,Ohio,0.0,0.0,-0.041564
...,...,...,...,...,...,...
525435,6.248908e+08,@TimOBrien #Trump is a Squatter. He tryna evad...,California,0.0,0.0,-0.229400
525438,8.142944e+17,@BretBaier When Harris interviewed those repub...,California,0.0,0.0,0.401783
525452,1.277273e+18,#Trump #Trump2020 #MAGA #4MoreYears I told you...,Pennsylvania,1.0,0.0,0.000000
525459,4.053900e+09,Here. We. Go. #ElectionNight \n\n🔴 #Trump 0\n🔵...,Minnesota,0.0,0.0,0.000000


In [20]:
# Write the trimmed frames to CSV files in the working directory.
# index=True is the pandas default, spelled out here: the row labels are
# written as the first, unnamed column of each file.
clean_biden_df.to_csv(path_or_buf='clean_biden.csv', index=True)
clean_trump_df.to_csv(path_or_buf='clean_trump.csv', index=True)