In [31]:
import pandas as pd
# pip install twython
from twython import Twython
import re, os, time
import dateutil.parser as dateparser
import json
from datetime import datetime

In [33]:
class edu_tweet_downloader(object):
    """Hydrate one day of Tweet IDs and save the keyword-matching English Tweets.

    Creating an instance immediately reads ``./raw/<date>_clean-dataset.tsv``,
    looks its ``tweet_id`` values up through the Twitter API in batches, and
    keeps the returned statuses in ``self.statuses``.  Call :meth:`download`
    afterwards to filter them and write the result under ``./data/``.
    """

    ## initialize the Tweet downloader
    def __init__(self, date, consumer_key, last, consumer_secret, MAX_LOOKUP_NUMBER=100, SLEEP_TIME=60,
                 keywords=["teach", "educat", "school", "student", "university", "college"]):
        """Load the daily ID list and hydrate it batch by batch.

        Parameters
        ----------
        date : str
            Day to process; parsed with ``dateutil.parser.parse`` and reduced
            to a calendar date, which is used in the input and output file names.
        consumer_key, consumer_secret : str
            Credentials handed to the ``Twython`` client.
        last : int
            Maximum number of rows of the daily dataset to look up.
        MAX_LOOKUP_NUMBER : int
            Number of Tweet IDs sent per lookup request (must be at least 1).
        SLEEP_TIME : int or float
            Minimum number of seconds between the start of two consecutive
            lookup requests; one extra second is added as a safety margin.
        keywords : list of str
            Lower-cased and later joined with ``|`` into the regular expression
            used by :meth:`download`.  The default list is never modified.

        Raises
        ------
        ValueError
            If ``MAX_LOOKUP_NUMBER`` is smaller than 1.
        """
        if MAX_LOOKUP_NUMBER < 1:
            raise ValueError("MAX_LOOKUP_NUMBER must be at least 1")

        # accept user input
        self.MAX_LOOKUP_NUMBER = MAX_LOOKUP_NUMBER
        self.SLEEP_TIME = SLEEP_TIME
        self.date = dateparser.parse(date).date()
        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.keywords = [keyword.lower() for keyword in keywords]
        self.last = last  # maximum number of Tweets to look up in the original daily dataset

        ## initialize the API client
        self.twitter = Twython(self.consumer_key, self.consumer_secret)

        ## load the list of daily Tweet IDs to hydrate (as strings, so they can be comma-joined)
        path = f"./raw/{self.date}_clean-dataset.tsv"
        daily = pd.read_csv(path, header=0, sep='\t')
        self.tid_list = daily['tweet_id'].iloc[:int(last)].astype('str')

        ## loop over batches
        self.statuses = []
        last_request = None  # start time of the previous lookup request, if any
        # Stepping through the start offsets yields exactly the non-empty batches,
        # so no request with an empty ID list is sent when the number of IDs is
        # zero or an exact multiple of the batch size.
        for start in range(0, len(self.tid_list), self.MAX_LOOKUP_NUMBER):
            lookup_tids = self.tid_list.iloc[start:start + self.MAX_LOOKUP_NUMBER]
            ## only wait if a previous request has been made
            if last_request is not None:
                elapsed = time.time() - last_request
                # Clamp at zero: time.sleep() rejects negative durations, which
                # would otherwise happen whenever a request took longer than the interval.
                time.sleep(max(0.0, self.SLEEP_TIME + 1 - elapsed))
            # Taken after the sleep, so the waiting time itself is not counted
            # as elapsed time for the next batch.
            last_request = time.time()
            ## hydrate the daily Tweets
            self.statuses.extend(
                self.twitter.lookup_status(id=",".join(lookup_tids), tweet_mode='extended', trim_user='false'))

    def download(self):
        """Filter the hydrated Tweets and write them to ``./data/``.

        A status is kept when any keyword occurs in its lower-cased
        ``full_text`` and its ``lang`` is ``'en'``.  Two files are written,
        creating ``./data/`` if necessary:

        * ``./data/<date>_ids.txt``   - one Tweet ID per line
        * ``./data/<date>_full.json`` - one JSON-encoded status per line

        Keywords are interpreted as regular-expression fragments; an empty
        keyword list therefore matches every Tweet.
        """
        ## filter the Tweets with user-input keywords
        pattern = re.compile('|'.join(self.keywords))  # built once instead of once per status
        filtered_statuses = [
            status for status in self.statuses
            if pattern.search(status['full_text'].lower()) and status['lang'] == 'en'
        ]

        ## construct the database
        # Portable replacement for shelling out to ``mkdir``; a no-op when the
        # directory already exists.
        os.makedirs("./data", exist_ok=True)
        with open(f"./data/{self.date}_ids.txt", "w") as f:
            for status in filtered_statuses:
                f.write(str(status['id']) + "\n")
        with open(f"./data/{self.date}_full.json", "w") as f:
            for status in filtered_statuses:
                f.write(json.dumps(status) + "\n")

In [None]:
## example: download daily educational COVID Tweets from May 3rd, 2020
## The API credentials are read from environment variables so that they are
## never stored in the notebook; a missing variable raises KeyError.
downloader = edu_tweet_downloader('2020/5/3', last=100000,
                                  consumer_key=os.environ['TWITTER_CONSUMER_KEY'],
                                  consumer_secret=os.environ['TWITTER_CONSUMER_SECRET'])
downloader.download()

In [46]:
## merge daily files
def merge_JsonFiles(filename, outputfilename):
    """Concatenate several line-delimited JSON files from ``./data/`` into one.

    Parameters
    ----------
    filename : str or iterable of str
        Name stems of the inputs; each stem ``s`` is read from
        ``./data/<s>_full.json``.  A single string is treated as one stem.
    outputfilename : str
        Name stem of the result, written to ``./data/<outputfilename>_full.json``.

    Every non-blank input line is parsed as JSON and re-serialized, one
    record per output line, in input order.  All inputs are read before the
    output is opened, so the output may safely overwrite one of the inputs.

    Raises
    ------
    FileNotFoundError
        If one of the input files does not exist.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(filename, str):
        filename = [filename]

    data = []
    for stem in filename:
        # ``with`` guarantees the handle is closed even if a line fails to parse.
        with open(f'./data/{stem}_full.json') as source:
            for line in source:
                # Skip blank lines (e.g. a trailing newline at the end of a file).
                if line.strip():
                    data.append(json.loads(line))

    with open(f"./data/{outputfilename}_full.json", "w") as f:
        for record in data:
            f.write(json.dumps(record) + "\n")

In [49]:
## example: merge the educational COVID Tweets of Phase 1 (May 3-9, 2020)
## one daily file stem per day, 2020-05-03 through 2020-05-09
filename = [f"2020-05-{day:02d}" for day in range(3, 10)]
merge_JsonFiles(filename, "phase1")

In [34]:
## json to excel
# Parse every non-blank line first and build the frame in a single step:
# growing a DataFrame row by row is quadratic, and DataFrame.append was
# removed in pandas 2.0.
with open('./data/phase1_full.json', "r") as f:
    records = [json.loads(line) for line in f if line.strip()]
# Columns are sorted alphabetically, as the former append(..., sort=True) did;
# the index is now a running row number instead of 0 for every row.
df = pd.DataFrame(records).sort_index(axis=1)
# Format the parsed timestamps without an offset and parse them again, so the
# stored values carry no timezone information.
# NOTE(review): presumably needed because the Excel writer rejects tz-aware datetimes - confirm.
df['created_at'] = pd.to_datetime(df['created_at'])
df['created_at'] = pd.to_datetime(df['created_at'].dt.strftime("%Y-%m-%d %H:%M:%S"))
df.to_excel('./data/phase1_full.xlsx')

In [None]:
## example: load the Phase 1 spreadsheet back into a DataFrame
phase1data = pd.read_excel('./data/phase1_full.xlsx')