In [None]:
import os
import pickle
import pandas as pd
from tqdm import tqdm
from searchtweets import gen_request_parameters, load_credentials, collect_results

In [None]:
credentials_path = "path_to_credentials"
search_args = load_credentials(f"{credentials_path}/twitter_keys_unlimited.yaml",
                                       yaml_key="search_tweets_v2",
                                       env_overwrite=False)

search_args['endpoint'] = "https://api.twitter.com/2/tweets/counts/all"

In [None]:
countries = pd.read_csv('/data/raw/country_list.csv')

def find_tweet_counts_per_country(country):
    # place_country:US, lang:en
    query = gen_request_parameters(f"(vaccine OR vaccination) place_country:{country} has:geo lang:en -is:retweet", granularity="", start_time="2020-01-01T00:00", end_time="2022-01-01T00:00")

    vaccine_vaccination_tweets = collect_results(query,
                                                 result_stream_args=search_args)
    
    return vaccine_vaccination_tweets

In [None]:
country_tweet_counts_path = '/data/raw/country_tweet_counts'
os.makedirs(country_tweet_counts_path, exist_ok=True)

In [11]:
for country in tqdm(countries['Code'].values):
    country_saving_path = f'{country_tweet_counts_path}/{country}.db'
    if os.path.exists(country_saving_path):
        continue

    try:
        counts = find_tweet_counts_per_country(country)
        with open(country_saving_path, 'wb') as f:
            pickle.dump(counts, f)
    except:
        pass

 24%|██▍       | 61/249 [00:06<00:20,  9.28it/s]ERROR:searchtweets.result_stream: HTTP Error code: 400: {"errors":[{"parameters":{"query":["(vaccine OR vaccination) place_country:nan has:geo lang:en -is:retweet"]},"message":"There were errors processing your request: Place Country Code: 'nan', is longer than the required two character country code (at position 1)"}],"title":"Invalid Request","detail":"One or more parameters to your request was invalid.","type":"https://api.twitter.com/2/problems/invalid-request"} | Bad Request
ERROR:searchtweets.result_stream: Request payload: {'query': '(vaccine OR vaccination) place_country:nan has:geo lang:en -is:retweet', 'start_time': '2020-01-01T00:00:00Z', 'end_time': '2022-01-01T00:00:00Z'}
ERROR:searchtweets.result_stream:Quitting... 
100%|██████████| 249/249 [00:18<00:00, 13.32it/s]


## Analyzing Tweets

In [17]:
country_counts = os.listdir(country_tweet_counts_path)

In [21]:
countries_and_counts = {}

for country in tqdm(country_counts):
    code = country.split('.')[0]

    country_saving_path = f'{country_tweet_counts_path}/{country}'
    with open(country_saving_path, 'rb') as f:
        counts = pickle.load(f)

    countries_and_counts[code] = 0
    for request in counts:
        countries_and_counts[code] += request['meta']['total_tweet_count']

100%|██████████| 248/248 [00:05<00:00, 48.18it/s]


In [22]:
srtd_counts = pd.DataFrame([[k, v] for k,v in sorted(countries_and_counts.items(), key=lambda item: item[1], reverse=True)],
                          columns=['country', 'count'])

srtd_counts['ratio'] = srtd_counts['count'].values / srtd_counts['count'].values.sum()

In [26]:
srtd_counts = srtd_counts.rename(columns={'country':'Code'})
srtd_counts = srtd_counts.join(countries.set_index('Code'), on="Code")[["Name", 'Code', 'count', 'ratio']].rename(columns={"Name":'name', "Code":'code'})

In [27]:
srtd_counts

Unnamed: 0,name,code,count,ratio
0,United States,US,618752,0.486684
1,United Kingdom,GB,197172,0.155087
2,India,IN,129941,0.102206
3,Canada,CA,84584,0.066530
4,Australia,AU,41550,0.032681
...,...,...,...,...
243,Svalbard and Jan Mayen,SJ,0,0.000000
244,Tokelau,TK,0,0.000000
245,Tuvalu,TV,0,0.000000
246,Wallis and Futuna,WF,0,0.000000


In [28]:
table_saving_path = "/data/tables"
srtd_counts.to_excel(f'{table_saving_path}/country_tweet_counts_and_ratio.xlsx', index=False)