# Dataset Evaluation - US Election 2020 Tweets

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Load Datasets

In [2]:
# Directory holding the raw CSV exports, one level above the notebook's working directory.
data_path = os.path.join(os.pardir, 'data')

In [3]:
# Explicit column types handed to `pd.read_csv` for both candidate files:
# identifier/counter columns are read as integers, coordinates as floats.
dtype = {
    **dict.fromkeys(
        ('tweet_id', 'likes', 'retweet_count', 'user_id', 'user_followers_count'),
        int,
    ),
    **dict.fromkeys(('lat', 'long'), float),
}
# Columns parsed into datetimes while reading.
parse_dates = ['created_at', 'collected_at']

## 1. Trump

In [4]:
# Read the Trump hashtag export using the shared dtype / date-parsing settings
# and an explicit '\n' line terminator.
path_trump = os.path.join(data_path, 'hashtag_donaldtrump.csv')
df_trump = pd.read_csv(
    path_trump,
    lineterminator='\n',
    parse_dates=parse_dates,
    dtype=dtype,
)

In [5]:
# Label every row of this frame with its candidate, placed as the second column.
# Guarded so that re-running the cell is a no-op: `DataFrame.insert` raises
# ValueError when the column already exists.
if 'candidate' not in df_trump.columns:
    df_trump.insert(1, 'candidate', 'Trump')

In [6]:
# Keep the first row for each (created_at, tweet_id, user_id) triple, then order
# the frame chronologically with a fresh 0..n-1 index.
df_trump = (
    df_trump
    .drop_duplicates(subset=['created_at', 'tweet_id', 'user_id'])
    .sort_values('created_at', ignore_index=True)
)

In [7]:
# Preview the de-duplicated, time-sorted Trump frame (rendered as a truncated table).
df_trump

Unnamed: 0,created_at,candidate,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,Trump,1316529221557252096,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0,0,TweetDeck,360666534,El Sol Latino News,elsollatinonews,...,1860,"Philadelphia, PA / Miami, FL",25.774270,-80.193660,,United States of America,North America,Florida,FL,2020-10-21 00:00:00.000000000
1,2020-10-15 00:00:01,Trump,1316529222748430336,"Usa 2020, Trump contro Facebook e Twitter: cop...",26,9,Social Mediaset,331617619,Tgcom24,MediasetTgcom24,...,1067661,,,,,,,,,2020-10-21 00:00:00.373216530
2,2020-10-15 00:00:02,Trump,1316529228091846912,"#Trump: As a student I used to hear for years,...",2,1,Twitter Web App,8436472,snarke,snarke,...,1185,Portland,45.520247,-122.674195,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060
3,2020-10-15 00:00:02,Trump,1316529227471237120,2 hours since last tweet from #Trump! Maybe he...,0,0,Trumpytweeter,828355589206056960,Trumpytweeter,trumpytweeter,...,32,,,,,,,,,2020-10-21 00:00:01.119649591
4,2020-10-15 00:00:08,Trump,1316529252301451264,You get a tie! And you get a tie! #Trump ‘s ra...,4,3,Twitter for iPhone,47413798,Rana Abtar - رنا أبتر,Ranaabtar,...,5393,Washington DC,38.894992,-77.036558,Washington,United States of America,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969572,2020-11-08 23:59:48,Trump,1325588866045194240,JOE BIDEN WHY JOE BIDEN DID WINNED ?? TRUMP TR...,4,0,Twitter Web App,3245258109,ahzy,ahhhzy,...,19,,,,,,,,,2020-11-09 17:47:55.715870000
969573,2020-11-08 23:59:53,Trump,1325588885515227136,#AfD|ler reagieren panisch bis hysterisch auf ...,25,4,Twitter Web App,986610758573178752,watchdog,wilke_tobias,...,3014,Saxony/GER,,,,,,,,2020-11-09 17:47:55.975868000
969574,2020-11-08 23:59:54,Trump,1325588892905402368,"@sammelbis1998 @iheartmindy @bnorthg First, yo...",0,0,Twitter for iPad,768781141,Debra,drdeblk,...,1079,,,,,,,,,2020-11-09 17:47:56.012674000
969575,2020-11-08 23:59:55,Trump,1325588894482534400,OK just had to do it !\n#Trump #CatapultTrump ...,105,28,Twitter Web App,451979107,Dunken K Bliths,DunkenKBliths,...,12192,✔ Official Twitter Account,,,,,,,,2020-11-09 17:47:55.807500000


In [8]:
# Print the index range plus each column's non-null count and dtype.
df_trump.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 969577 entries, 0 to 969576
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   created_at            969577 non-null  datetime64[ns]
 1   candidate             969577 non-null  object        
 2   tweet_id              969577 non-null  int64         
 3   tweet                 969577 non-null  object        
 4   likes                 969577 non-null  int64         
 5   retweet_count         969577 non-null  int64         
 6   source                968701 non-null  object        
 7   user_id               969577 non-null  int64         
 8   user_name             969561 non-null  object        
 9   user_screen_name      969577 non-null  object        
 10  user_description      868568 non-null  object        
 11  user_join_date        969577 non-null  object        
 12  user_followers_count  969577 non-null  int64         
 13 

## 2. Biden

In [9]:
# Read the Biden hashtag export using the shared dtype / date-parsing settings
# and an explicit '\n' line terminator.
path_biden = os.path.join(data_path, 'hashtag_joebiden.csv')
df_biden = pd.read_csv(
    path_biden,
    lineterminator='\n',
    parse_dates=parse_dates,
    dtype=dtype,
)

In [10]:
# Label every row of this frame with its candidate, placed as the second column.
# Guarded so that re-running the cell is a no-op: `DataFrame.insert` raises
# ValueError when the column already exists.
if 'candidate' not in df_biden.columns:
    df_biden.insert(1, 'candidate', 'Biden')

In [11]:
# Keep the first row for each (created_at, tweet_id, user_id) triple, then order
# the frame chronologically with a fresh 0..n-1 index.
df_biden = (
    df_biden
    .drop_duplicates(subset=['created_at', 'tweet_id', 'user_id'])
    .sort_values('created_at', ignore_index=True)
)

In [12]:
# Preview the de-duplicated, time-sorted Biden frame (rendered as a truncated table).
df_biden

Unnamed: 0,created_at,candidate,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,Biden,1316529221557252096,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0,0,TweetDeck,360666534,El Sol Latino News,elsollatinonews,...,1860,"Philadelphia, PA / Miami, FL",25.774270,-80.193660,,United States of America,North America,Florida,FL,2020-10-21 00:00:00.000000000
1,2020-10-15 00:00:18,Biden,1316529295859290112,#HunterBiden #HunterBidenEmails #JoeBiden #Joe...,0,0,Twitter for iPad,809904438,Cheri A. 🇺🇸,Biloximeemaw,...,6628,,,,,,,,,2020-10-21 00:00:00.517827283
2,2020-10-15 00:00:20,Biden,1316529305006952448,@IslandGirlPRV @BradBeauregardJ @MeidasTouch T...,0,0,Twitter Web App,3494182277,Flag Waver,Flag_Wavers,...,1536,Golden Valley Arizona,46.304036,-109.171431,,United States of America,North America,Montana,MT,2020-10-21 00:00:01.035654566
3,2020-10-15 00:00:21,Biden,1316529308081557504,@chrislongview Watching and setting dvr. Let’s...,0,0,Twitter for iPhone,824259601201852416,Michelle Ferg,MichelleFerg4,...,27,,,,,,,,,2020-10-21 00:00:01.553481849
4,2020-10-15 00:00:22,Biden,1316529312741253120,#censorship #HunterBiden #Biden #BidenEmails #...,1,0,Twitter Web App,1032806955356545024,the Gold State,theegoldstate,...,390,"California, USA",36.701463,-118.755997,,United States of America,North America,California,CA,2020-10-21 00:00:02.071309132
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
775031,2020-11-08 23:59:38,Biden,1325588825654095872,Ωχ ελπίζω να μη μας βγει σαν τους οπαδούς του...,0,0,Twitter for Android,403281875,οχι άλλο κάρβουνο 🇬🇷🗣🗣🗣,anapodoi,...,772,,,,,,,,,2020-11-09 18:32:45.947617000
775032,2020-11-08 23:59:41,Biden,1325588838123757568,L'OTAN va sortir de sa léthargie et redevenir ...,48,14,Twitter for Android,781918265400492032,🇫🇷 Alt-Droite (matricule 6921) ✝️ 🇬🇷 🇮🇹 🇦🇲,CtrlAltDroite,...,15806,France,46.603354,1.888334,,France,Europe,,,2020-11-09 18:32:45.627335000
775033,2020-11-08 23:59:52,Biden,1325588881346015232,🌎\n\n“#congiuntifuoriregione”\n\n‘Sono felice ...,1,1,Twitter for iPhone,529331509,Angelo Tani,AngeloTani,...,5974,🌎,,,,,,,,2020-11-09 18:32:45.599846000
775034,2020-11-08 23:59:57,Biden,1325588902032322560,#JoeBiden 😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂 https://t.co/Ym...,0,0,Twitter for iPhone,114847946,Nataša,PewPeeew,...,33,Deutschland,51.083420,10.423447,,Germany,Europe,,,2020-11-09 18:26:04.550843000


In [13]:
# Print the index range plus each column's non-null count and dtype.
df_biden.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 775036 entries, 0 to 775035
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   created_at            775036 non-null  datetime64[ns]
 1   candidate             775036 non-null  object        
 2   tweet_id              775036 non-null  int64         
 3   tweet                 775036 non-null  object        
 4   likes                 775036 non-null  int64         
 5   retweet_count         775036 non-null  int64         
 6   source                774325 non-null  object        
 7   user_id               775036 non-null  int64         
 8   user_name             775018 non-null  object        
 9   user_screen_name      775036 non-null  object        
 10  user_description      693174 non-null  object        
 11  user_join_date        775036 non-null  object        
 12  user_followers_count  775036 non-null  int64         
 13 

## Join Datasets

In [14]:
# Stack both candidate frames and order the combined set chronologically with a
# fresh 0..n-1 index. The same tweet_id can occur once per candidate frame, so
# it may appear twice here with different `candidate` labels.
# NOTE(review): `copy=False` was dropped; stacking two frames row-wise builds
# new arrays either way, and newer pandas releases deprecate the keyword.
# Confirm against the pinned pandas version.
df = (
    pd.concat([df_trump, df_biden], ignore_index=True)
    .sort_values('created_at', ignore_index=True)
)

In [15]:
# Preview the combined, time-sorted frame (rendered as a truncated table).
df

Unnamed: 0,created_at,candidate,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,Trump,1316529221557252096,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0,0,TweetDeck,360666534,El Sol Latino News,elsollatinonews,...,1860,"Philadelphia, PA / Miami, FL",25.774270,-80.193660,,United States of America,North America,Florida,FL,2020-10-21 00:00:00.000000000
1,2020-10-15 00:00:01,Biden,1316529221557252096,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0,0,TweetDeck,360666534,El Sol Latino News,elsollatinonews,...,1860,"Philadelphia, PA / Miami, FL",25.774270,-80.193660,,United States of America,North America,Florida,FL,2020-10-21 00:00:00.000000000
2,2020-10-15 00:00:01,Trump,1316529222748430336,"Usa 2020, Trump contro Facebook e Twitter: cop...",26,9,Social Mediaset,331617619,Tgcom24,MediasetTgcom24,...,1067661,,,,,,,,,2020-10-21 00:00:00.373216530
3,2020-10-15 00:00:02,Trump,1316529228091846912,"#Trump: As a student I used to hear for years,...",2,1,Twitter Web App,8436472,snarke,snarke,...,1185,Portland,45.520247,-122.674195,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060
4,2020-10-15 00:00:02,Trump,1316529227471237120,2 hours since last tweet from #Trump! Maybe he...,0,0,Trumpytweeter,828355589206056960,Trumpytweeter,trumpytweeter,...,32,,,,,,,,,2020-10-21 00:00:01.119649591
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1744608,2020-11-08 23:59:54,Trump,1325588892905402368,"@sammelbis1998 @iheartmindy @bnorthg First, yo...",0,0,Twitter for iPad,768781141,Debra,drdeblk,...,1079,,,,,,,,,2020-11-09 17:47:56.012674000
1744609,2020-11-08 23:59:55,Trump,1325588894482534400,OK just had to do it !\n#Trump #CatapultTrump ...,105,28,Twitter Web App,451979107,Dunken K Bliths,DunkenKBliths,...,12192,✔ Official Twitter Account,,,,,,,,2020-11-09 17:47:55.807500000
1744610,2020-11-08 23:59:56,Trump,1325588898299453440,@nbcbayarea Who doesn’t like dogs or any kind ...,1,1,Twitter for iPhone,118263866,Dianna Maria,DiannaMaria,...,1462,United States,39.783730,-100.445882,,United States,North America,,,2020-11-09 17:46:06.939099000
1744611,2020-11-08 23:59:57,Biden,1325588902032322560,#JoeBiden 😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂😂 https://t.co/Ym...,0,0,Twitter for iPhone,114847946,Nataša,PewPeeew,...,33,Deutschland,51.083420,10.423447,,Germany,Europe,,,2020-11-09 18:26:04.550843000


In [16]:
# Earliest tweet timestamp in the combined frame.
df['created_at'].min()

Timestamp('2020-10-15 00:00:01')

In [17]:
# Latest tweet timestamp in the combined frame.
df['created_at'].max()

Timestamp('2020-11-08 23:59:58')

In [18]:
# Upper bound of the first window: 60 seconds after the earliest tweet.
next_time = df['created_at'].min() + pd.Timedelta(seconds=60)
next_time

Timestamp('2020-10-15 00:01:01')

In [19]:
# Rows whose timestamp lies in [earliest tweet, next_time), i.e. the first window.
df_slice = df.loc[
    (df['created_at'] >= df['created_at'].min())
    & (df['created_at'] < next_time)
]

In [20]:
# Rows of the first window that are labelled with the Trump candidate tag.
df_slice.loc[df_slice['candidate'].eq('Trump')]

Unnamed: 0,created_at,candidate,tweet_id,tweet,likes,retweet_count,source,user_id,user_name,user_screen_name,...,user_followers_count,user_location,lat,long,city,country,continent,state,state_code,collected_at
0,2020-10-15 00:00:01,Trump,1316529221557252096,#Elecciones2020 | En #Florida: #JoeBiden dice ...,0,0,TweetDeck,360666534,El Sol Latino News,elsollatinonews,...,1860,"Philadelphia, PA / Miami, FL",25.77427,-80.19366,,United States of America,North America,Florida,FL,2020-10-21 00:00:00.000000000
2,2020-10-15 00:00:01,Trump,1316529222748430336,"Usa 2020, Trump contro Facebook e Twitter: cop...",26,9,Social Mediaset,331617619,Tgcom24,MediasetTgcom24,...,1067661,,,,,,,,,2020-10-21 00:00:00.373216530
3,2020-10-15 00:00:02,Trump,1316529228091846912,"#Trump: As a student I used to hear for years,...",2,1,Twitter Web App,8436472,snarke,snarke,...,1185,Portland,45.520247,-122.674195,Portland,United States of America,North America,Oregon,OR,2020-10-21 00:00:00.746433060
4,2020-10-15 00:00:02,Trump,1316529227471237120,2 hours since last tweet from #Trump! Maybe he...,0,0,Trumpytweeter,828355589206056960,Trumpytweeter,trumpytweeter,...,32,,,,,,,,,2020-10-21 00:00:01.119649591
5,2020-10-15 00:00:08,Trump,1316529252301451264,You get a tie! And you get a tie! #Trump ‘s ra...,4,3,Twitter for iPhone,47413798,Rana Abtar - رنا أبتر,Ranaabtar,...,5393,Washington DC,38.894992,-77.036558,Washington,United States of America,North America,District of Columbia,DC,2020-10-21 00:00:01.492866121
6,2020-10-15 00:00:17,Trump,1316529291052675072,@CLady62 Her 15 minutes were over long time ag...,2,0,Twitter for Android,1138416104,Farris Flagg,FarrisFlagg,...,2363,"Perris,California",33.782519,-117.228648,,United States of America,North America,California,CA,2020-10-21 00:00:01.866082651
7,2020-10-15 00:00:17,Trump,1316529289949569024,@richardmarx Glad u got out of the house! DICK...,0,0,Twitter for iPhone,767401841030209536,Michael Wilson,wilsonfire9,...,75,"Powell, TN",,,,,,,,2020-10-21 00:00:02.239299182
9,2020-10-15 00:00:18,Trump,1316529293497962496,@DeeviousDenise @realDonaldTrump @nypost There...,0,0,Twitter for iPhone,900761071631429632,Stacey Gulledge 🇺🇸 Patriot ♥️ KAG 🙏 👮‍♀️♥️,sm_gulledge,...,766,"Ohio, USA",40.225357,-82.68814,,United States of America,North America,Ohio,OH,2020-10-21 00:00:02.612515712
10,2020-10-15 00:00:20,Trump,1316529301332918272,One of the single most effective remedies to e...,0,0,Twitter Web App,540476889,Jamieo,jamieo33,...,151,"Pennsylvania, USA",40.969989,-77.727883,,United States of America,North America,Pennsylvania,PA,2020-10-21 00:00:02.985732243
13,2020-10-15 00:00:21,Trump,1316529308576309248,#Election2020 #Trump \n#FreedomOfSpeech https:...,0,0,Twitter Web App,1305532976998969088,Johnny Quest,JohnnyQuest22,...,8,,,,,,,,,2020-10-21 00:00:03.358948773
