<a href="https://colab.research.google.com/github/mrf444/Coronavirus_Analysis/blob/master/january_daily_tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import time
import numpy as np
import pandas as pd
from google.colab import drive
import gzip
import json
import pickle
import glob

In [3]:
drive.mount('/content/gdrive', force_remount = True)
#clone repo
#!git clone https://github.com/echen102/COVID-19-TweetIDs.git
#move folder to 
#!mv '/content/COVID-19-TweetIDs' 'gdrive/My Drive/AdvPy/'
os.chdir("gdrive/My Drive/AdvPy/COVID-19-TweetIDs")

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


In [0]:
def get_tweets_json(filename):
  f = gzip.open(filename, 'rb')
  response = f.read().decode('utf-8')
  f.close()
  return response.split('\n')

In [0]:
def metadata_df(tweets, RT=False):
  total_tweets = 0
  latlong_tweets = 0

  #create dataframe to store tweet metadata
  df = pd.DataFrame(columns=['tweet_id',
                                      'created_at',
                                      'user_id',
                                      'user_name',
                                      'location',
                                      'country',
                                      'latitude',
                                      'longitude',
                                      'text'])

  for i in range(len(tweets)):
    try:
      tweet = json.loads(tweets[i])
    except:
      #already formatted in JSON
      if tweets[i] != '': #if tweet is not empty
        tweet = tweets[i]

    #NOTE: this line filters out retweets
    if not RT: #if we don't want retweets
      if 'RT' in tweet['full_text']: continue

    try:    
      #first, get location (city) and country depending on what info is available
      if tweet['place'] != None:
          location = tweet['place']['full_name']
          country = tweet['place']['country_code']
          
      else:
          location = tweet['user']['location'] #if actual city doesn't exist, get location provided by the user
          country = np.nan
          
      #get latitude and longitude depending on whether an exact location or bounding box is provided
      if tweet['coordinates'] != None: #exact location
          longitude, latitude = tweet['coordinates']['coordinates']
          
      elif tweet['place'] != None: #bounding box --> take center point (average) of box
          bounding_box = np.array(tweet['place']['bounding_box']['coordinates'][0])
          longitude, latitude = np.mean(bounding_box, axis=0)
          
      else: #no location provided --> make nan
          longitude = np.nan
          latitude = np.nan

      #append this tweet to the dataframe
      df = df.append({'tweet_id':tweet['id'],
                      'created_at':tweet['created_at'],
                      'user_id':tweet['user']['id'],
                      'user_name':tweet['user']['screen_name'],
                      'location':location,
                      'country':country,
                      'latitude':latitude,
                      'longitude':longitude,
                      'text':tweet['full_text'],
                    }, ignore_index=True)
      
    except:
      continue

  return df

In [0]:
# '2020-01-21' <---- must be a string of this format
def daily_tweets(day):
  daily_df = pd.DataFrame(columns=['tweet_id',
                                    'created_at',
                                    'user_id',
                                    'user_name',
                                    'location',
                                    'country',
                                    'latitude',
                                    'longitude',
                                    'text'])
  month_dir = day[:7]
  hours = ["%02d" % n for n in range(24)]

  for hour in hours:
    filename = './'+month_dir+'/coronavirus-tweet-id-'+ \
                day+'-'+hour+'.jsonl.gz'
    
    if not os.path.exists(filename): #if file doesn't exist
      continue
    else:
      print("\rReading tweets for:",day+'-'+hour, end='')
      tweets = get_tweets_json(filename)
      hourly_df = metadata_df(tweets) #RT = False
      daily_df = pd.concat([daily_df,hourly_df])

  return daily_df

In [40]:
day = '2020-01-23'
daily_df = daily_tweets(day)
daily_df

Reading tweets for: 2020-01-23-23

Unnamed: 0,tweet_id,created_at,user_id,user_name,location,country,latitude,longitude,text
0,1220147193006432257,Thu Jan 23 00:52:14 +0000 2020,1050406362226847748,Jonatha43241211,"North Carolina, USA",,,,@DudeDudeologist @ThatShaneB @March_for_Life T...
1,1220147242327330820,Thu Jan 23 00:52:26 +0000 2020,2723456510,TinfoilTricorn,"Valley Forge, PA",,,,@AnonsSynonymous @JackPosobiec CDC cannot be t...
2,1220147392156262402,Thu Jan 23 00:53:02 +0000 2020,100903475,justintimkim,,,,,CDC to screen at three US airports for signs o...
3,1220147630900228102,Thu Jan 23 00:53:59 +0000 2020,1050406362226847748,Jonatha43241211,"North Carolina, USA",,,,@DudeDudeologist @ThatShaneB @March_for_Life 1...
4,1220147737112563713,Thu Jan 23 00:54:24 +0000 2020,1156624759637118976,HumanClimateGen,"Pennsylvania, USA",,,,CDC details first U.S. case of novel virus spr...
...,...,...,...,...,...,...,...,...,...
2028,1220481355819225088,Thu Jan 23 23:00:05 +0000 2020,11851702,dhughes,Charlottetown,,,,@Goatboy641 @juliaoftoronto Yes the 2019-nCov ...
2029,1220481364270563328,Thu Jan 23 23:00:07 +0000 2020,30846824,TOICitiesNews,,,,,What is novel coronavirus (2019-nCoV)? https:/...
2030,1220481548039880707,Thu Jan 23 23:00:51 +0000 2020,848638792206516224,PorterMedium,"New York, USA",,,,"Breaking: According to Public Health England, ..."
2031,1220482683261857793,Thu Jan 23 23:05:21 +0000 2020,308148081,BubblesBurster,WW,,,,@realDonaldTrump What is crowdstrike?\nWhy did...


In [0]:
#year_month must be in format '2020-01'
#folder is '2020-01-daily-dfs'
def save_monthly_tweets(year_month,folder):
  days = [year_month+'-%02d' % n for n in range(1,32)]
  for day in days:
    #if no tweet data for that day
    if not [f for f in os.listdir('./'+year_month) if f.startswith("coronavirus-tweet-id-"+day)]:
      continue
    else: #get the daily data frame and pickle it
      daily_df = daily_tweets(day)
      pickle_filename = './'+folder+'/'+day+'-tweets.pkl'
      daily_df.to_pickle(pickle_filename)
  return

In [0]:
save_monthly_tweets('2020-01','2020-01-daily-dfs')

Reading tweets for: 2020-01-25-08