In [None]:
import os
import json
import calendar
import pandas as pd
from searchtweets import gen_request_parameters, load_credentials, collect_results

# "Vaccine OR Vaccination" tweets from 2020 and 2021

In [None]:
# Load the API credentials stored under the "search_tweets_v2" key of the
# local YAML file.
search_args = load_credentials("./twitter_keys_2m.yaml",
                               yaml_key="search_tweets_v2",
                               env_overwrite=False)

# This notebook downloads tweets from 2020-2021 with 500 results per call.
# The recent-search endpoint only reaches back about one week and caps pages
# at 100 results, so the full-archive search endpoint is required here.
# NOTE(review): full-archive search needs an access level that includes it --
# confirm the credentials above have that access.
search_args['endpoint'] = "https://api.twitter.com/2/tweets/search/all"

### Automation

In [None]:
def split_text(text):
    """Separate a tweet's text into plain words, hashtags, mentions and links.

    The text is split on whitespace and each token is classified by its
    prefix: ``#`` for hashtags, ``http`` for links and ``@`` for mentions.
    Every other token counts as plain text.

    Parameters
    ----------
    text : str
        The full text of a tweet.

    Returns
    -------
    list of str
        Four space-joined strings, in this order: plain text, hashtags,
        mentions, links. A category with no tokens yields an empty string.
    """
    prefixes = ('#', 'http', '@')
    buckets = {'plain': [], '#': [], 'http': [], '@': []}

    for token in text.split():
        for prefix in prefixes:
            if token.startswith(prefix):
                buckets[prefix].append(token)
                break
        else:
            # No known prefix matched: the token is ordinary text.
            buckets['plain'].append(token)

    return [' '.join(buckets[key]) for key in ('plain', '#', '@', 'http')]

In [None]:
# Header of each daily CSV; the order must match the rows appended to
# `daily_tweets` in the download loop.
columns = [
    # tweet metadata
    'id', 'author_id', 'created_at', 'language',
    # tweet text split into its parts
    'text', 'hashtags', 'mentions', 'links',
    # geo tagging
    'geo_place_id', 'coordinates_type', 'longitude', 'latitude',
]

In [None]:
# Inclusive range of calendar days to download; one API query is issued per day.
FIRST_DAY = "2020-01-01"
LAST_DAY = "2021-12-31"


def _tweet_to_row(atweet):
    """Flatten one tweet object into a list ordered like `columns`.

    Parameters
    ----------
    atweet : dict
        A tweet with the keys 'id', 'author_id', 'created_at', 'lang' and
        'text', and optionally 'geo'.

    Returns
    -------
    list
        ``[id, author_id, created_at, lang, text, hashtags, mentions, links,
        place_id, coordinates_type, longitude, latitude]``; geo fields that
        are not present are filled with empty strings.
    """
    # A missing or empty 'geo' behaves like a geo object with no fields.
    geo = atweet.get('geo') or {}
    place_id = geo.get('place_id', '')
    geo_coordinates = geo.get('coordinates')
    if geo_coordinates:
        # The coordinate list is expanded, in order, into the longitude and
        # latitude columns.
        geo_array = [place_id, geo_coordinates['type'], *geo_coordinates['coordinates']]
    else:
        geo_array = [place_id, '', '', '']

    return [atweet['id'], atweet['author_id'], atweet['created_at'], atweet['lang'],
            *split_text(atweet['text']), *geo_array]


for day_start in pd.date_range(FIRST_DAY, LAST_DAY, freq="D"):
    # The query window is [00:00 of this day, 00:00 of the next day). Date
    # arithmetic handles month and year rollover, so no loop variable has to
    # be patched by hand.
    day_end = day_start + pd.Timedelta(days=1)
    start = day_start.strftime("%Y-%m-%dT00:00")
    end = day_end.strftime("%Y-%m-%dT00:00")
    saving_path = f"/data/processed/daily_world_en_csv/{start[:10]}.csv"

    # The CSV is written last, so its presence marks the day as complete and
    # lets an interrupted run resume where it stopped.
    if os.path.isfile(saving_path):
        continue

    print("start:", start, "end:", end)

    # place_country:US, lang:en
    query = gen_request_parameters("(vaccine OR vaccination) lang:en -is:retweet",
                                   results_per_call=500, granularity="",
                                   tweet_fields="id,text,created_at,geo,author_id,lang",
                                   start_time=start, end_time=end)

    vaccine_vaccination_tweets = collect_results(query,
                                                 max_tweets=5000000,
                                                 result_stream_args=search_args) # change this if you need to

    # Keep the raw API output next to the flattened CSV.
    with open(f"/data/processed/daily_world_en_json/{start[:10]}.json", 'w') as f:
        json.dump(vaccine_vaccination_tweets, f)

    # Each collected item is a response page whose tweets sit under 'data'.
    # A page without matches may carry no 'data' key at all; treat it as empty
    # instead of raising KeyError and aborting the whole download.
    daily_tweets = [
        _tweet_to_row(atweet)
        for page in vaccine_vaccination_tweets
        for atweet in page.get('data', [])
    ]

    # Vaccine_vaccination_daily_csv
    df = pd.DataFrame(daily_tweets, columns=columns)
    df.to_csv(saving_path, index=False)