# Country-wise comparison

**V1.1 includes S3 integration, whereas the original notebook did not.**

Comparison with New Zealand, Australia, and Canada.

In [1]:
import os
import re
import csv
import pandas as pd
from datetime import datetime, timedelta

import boto3
from s3fs.core import S3FileSystem

import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")
sns.set(rc={'figure.figsize':(11.7, 8.27)})

In [2]:
sess = boto3.Session(profile_name="xmiles")

def upload_to_s3(in_fpath, out_key):
    s3 = sess.client('s3')
    bucket_name = 'statsnz-covid-xmiles'
    
    s3.put_object(Body=open(in_fpath, 'rb'), Bucket=bucket_name, Key=out_key)
    

def read_file_from_s3(bucket, key):
    s3 = sess.client('s3')
    
    obj = s3.get_object(Bucket=bucket, Key=key)
    content = obj['Body'].read().decode()
    
    return content

def list_all_objects_s3(bucket, prefix):
    """
    Necessary since the list_objects_v2() function only lists the first 1000 
    objects, and requires a continuation token to get the next 1000 objects.
    """
    s3 = sess.client('s3')
    keys = []
    truncated = True
    next_cont_token = ""

    while truncated:
        if next_cont_token:
            resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix, 
                                      ContinuationToken=next_cont_token)
        else:
            resp = s3.list_objects_v2(Bucket=bucket, Prefix=prefix)

        keys += [x['Key'] for x in resp['Contents'] 
                 if ".ipynb_checkpoints" not in x['Key']]

        truncated = resp['IsTruncated']
        if truncated:
            next_cont_token = resp['NextContinuationToken']
            
    return keys

In [3]:
def merge_csvs(mergefile, infpaths, overwrite=False):
    if os.path.exists(mergefile) and not overwrite:
        return
    
    with open(mergefile, 'w', newline="") as outfile:
        outwriter = csv.writer(outfile, delimiter=',')
        outwriter.writerow(headers)
        for fpath in infpaths:
            with open(fpath) as infile:
                inwriter = csv.reader(infile, delimiter=',')
                outwriter.writerows(inwriter)
                

def merge_csvs_from_s3(mergefile, inkeys, bucket, headers, overwrite=False):
    if os.path.exists(mergefile) and not overwrite:
        return
    
    # Clear existing CSV
    with open(mergefile, 'w') as f:
        f.write('')
    
    for i, key in enumerate(inkeys):
        content = read_csv_from_s3(bucket, key)
        with open(mergefile, mode='a') as f:
            f.write(content)

In [4]:
headers = [
    'gkg_id', 'date', 'source', 'source_name', 'doc_id', 
    'themes', 'locations', 'persons', 'orgs', 
    'tone', 'pos', 'neg', 'polarity', 'ard', 'srd',
    'wc', 
    'lexicode_neg', 'lexicode_pos', # c3.*
    'MACROECONOMICS', 'ENERGY', 'FISHERIES', 
    'TRANSPORTATION', 'CRIME', 'SOCIAL_WELFARE',
    'HOUSING', 'FINANCE', 'DEFENCE', 'SSTC',
    'FOREIGN_TRADE', 'CIVIL_RIGHTS', 
    'INTL_AFFAIRS', 'GOVERNMENT_OPS',
    'LAND-WATER-MANAGEMENT', 'CULTURE',
    'PROV_LOCAL', 'INTERGOVERNMENTAL',
    'CONSTITUTIONAL_NATL_UNITY', 'ABORIGINAL',
    'RELIGION', 'HEALTHCARE', 'AGRICULTURE',
    'FORESTRY', 'LABOUR', 'IMMIGRATION',
    'EDUCATION', 'ENVIRONMENT',
    'finstab_pos', 'finstab_neg', 'finstab_neutral',
    'finsent_neg', 'finsent_pos', 'finsent_unc',
    'opin_neg', 'opin_pos',
    'sent_pos', 'sent_neg', 'sent_pol'
]

In [5]:
country_to_prefix = { 
    'nz': "processed_gdelt_nz/",
    'au': "processed_gdelt_au/",
    'ca': "processed_gdelt_ca/"
}
countries = ['nz', 'au', 'ca']

**Loading the merged 2020/2021 CSV files for New Zealand, Australia, and Canada exceeds the available RAM so jupyter crashes.**

In [6]:
def get_time_aggregated_dfs(csv_file):
    """
    Returns four DataFrames for the given country
    - daily_tone: tone, pos, neg (daily)
    - weekly_tone: tone, pos, neg (weekly)
    - daily_count: number of articles (daily)
    - weekly_count: number of articles (weekly)
    """
    gdelt = pd.read_csv(csv_file)
    
    print("read")
    print(gdelt.head())
    gdelt['date'] = pd.to_datetime(gdelt['date'], format="%Y%m%d%H%M%S")
    gdelt = gdelt.sort_values(by=["gkg_id"]).reset_index()
    print("tidied")
    
    daily_tone = gdelt.resample('D', on='date')[['tone', 'pos', 'neg']].mean()
    daily_count = gdelt.resample('D', on='date')['gkg_id'].count()
    
    weekly_tone = gdelt.resample('W-Mon', on='date')[['tone', 'pos', 'neg']].mean()
    weekly_count = gdelt.resample('W-Mon', on='date')['gkg_id'].count()
    # Remove partial weeks at beginning and end of weekly-aggregation
#     first_monday = 
#     final_sunday = 
#     weekly_tone = weekly_tone[first_monday <= weekly_tone['date'] <= final_sunday]
#     weekly_count = weekly_count[first_monday <= weekly_count['date'] <= final_sunday]
    print("compiled")
    
    return daily_tone, weekly_tone, daily_count, weekly_count

In [7]:
def get_time_aggregated_dfs_s3(bucket, csv_key):
    """
    Returns four DataFrames for the given country
    - daily_tone: tone, pos, neg (daily)
    - weekly_tone: tone, pos, neg (weekly)
    - daily_count: number of articles (daily)
    - weekly_count: number of articles (weekly)
    """
    gdelt_raw = read_file_from_s3(bucket, csv_key)
    gdelt = pd.DataFrame([
        x for x in line.split(',')
        for line in gdelt_raw.split('\r\n')
    ])
    
    print("read")
    print(gdelt.head())
    gdelt['date'] = pd.to_datetime(gdelt['date'], format="%Y%m%d%H%M%S")
    gdelt = gdelt.sort_values(by=["gkg_id"]).reset_index()
    print("tidied")
    
    daily_tone = gdelt.resample('D', on='date')[['tone', 'pos', 'neg']].mean()
    daily_count = gdelt.resample('D', on='date')['gkg_id'].count()
    
    weekly_tone = gdelt.resample('W-Mon', on='date')[['tone', 'pos', 'neg']].mean()
    weekly_count = gdelt.resample('W-Mon', on='date')['gkg_id'].count()
    # Remove partial weeks at beginning and end of weekly-aggregation
#     first_monday = 
#     final_sunday = 
#     weekly_tone = weekly_tone[first_monday <= weekly_tone['date'] <= final_sunday]
#     weekly_count = weekly_count[first_monday <= weekly_count['date'] <= final_sunday]
    print("compiled")
    
    return daily_tone, weekly_tone, daily_count, weekly_count

In [None]:
%%time
nz_dfs = get_time_aggregated_dfs('gdelt-nz-20-21.csv')
au_dfs = get_time_aggregated_dfs('gdelt-au-20-21.csv')
ca_dfs = get_time_aggregated_dfs('gdelt-ca-20-21.csv')

In [None]:
tone_columns = pd.MultiIndex.from_product([countries, ['tone', 'pos', 'neg']])

daily_tone = pd.concat([nz_dfs[0], au_dfs[0], ca_dfs[0]], axis=1)
daily_tone.columns = tone_columns

weekly_tone = pd.concat([nz_dfs[1], au_dfs[1], ca_dfs[1]], axis=1)
weekly_tone.columns = tone_columns

daily_tone.head()

In [None]:
daily_tone_long = daily_tone.xs('tone', axis=1, level=1) \
                            .reset_index() \
                            .melt("date",
                                  var_name="country", 
                                  value_name="tone", 
                                  value_vars=['nz','au','ca'])
weekly_tone_long = weekly_tone.xs('tone', axis=1, level=1) \
                              .reset_index() \
                              .melt("date",
                                    var_name="country", 
                                    value_name="tone", 
                                    value_vars=['nz','au','ca'])

In [None]:
g = sns.lineplot(data=daily_tone_long, x="date", y="tone", hue="country")
g.set(title="Daily tone of News (2020-present)");

In [None]:
g = sns.lineplot(data=weekly_tone_long, x="date", y="tone", hue="country")
g.set(title="Weekly tone of News (2020-present)");

In [None]:
daily_count = pd.concat({'nz': nz_dfs[2], 'au': au_dfs[2], 'ca': ca_dfs[2]}, axis=1)
weekly_count = pd.concat({'nz': nz_dfs[3], 'au': au_dfs[3], 'ca': ca_dfs[3]}, axis=1)

daily_count_long = daily_count.reset_index() \
                              .melt("date",
                                    var_name="country", 
                                    value_name="num_articles", 
                                    value_vars=['nz','au','ca'])
weekly_count_long = weekly_count.reset_index() \
                                .melt("date",
                                      var_name="country", 
                                      value_name="num_articles", 
                                      value_vars=['nz','au','ca'])

In [None]:
daily_count_long

In [None]:
g = sns.lineplot(data=daily_count_long, x="date", y="num_articles", hue="country")
g.set(title="Number of news articles (daily)");

In [None]:
g = sns.lineplot(data=weekly_count_long, x="date", y="num_articles", hue="country")
g.set(title="Number of news articles (weekly)");

## Verify that seasonality is weekly

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose

In [None]:
daily_count.index.strftime("%b %y")
['Jan 20', 'Mar 20', 'Jul 20', 'Oct 20', 'Jan 21', 'Mar 21']

In [None]:
tone_trends = pd.concat({
    country: seasonal_decompose(daily_count.dropna()[country],
                                model='additive'
                                ).trend
    for country in countries
}, axis=1)

g = sns.lineplot(data=tone_trends, dashes=False)
g.set(title="Daily number of articles - trend component");

In [None]:
daily_count['Day'] = daily_count.index.day_name().astype("category").reorder_categories(['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday'])
first_monday = daily_count[daily_count['Day'] == "Monday"].index[0]
first_monday_idx = (first_monday - datetime(2020, 1, 1)).days

first_monday, first_monday_idx

In [None]:
tone_seasonals = pd.concat({
    country: seasonal_decompose(daily_count.dropna()[country],#, 'tone'], 
                                model='additive'
                                ).seasonal[first_monday_idx:(first_monday_idx+6)]
    for country in countries
}, axis=1)

g = sns.lineplot(data=tone_seasonals, dashes=False)
g.set(title="Daily number of articles - seasonal component (Mon - Sun)",
      xticks=[], xlabel='');

# Check previous years

In [10]:
%%time
nz_18_21_dfs = get_time_aggregated_dfs_s3("statsnz-covid-xmiles", "merged_gdelt/gdelt-nz-18-21.csv")

OverflowError: signed integer is greater than maximum

In [None]:
%%time
au_18_21_dfs = get_time_aggregated_dfs(fs.open("s3://statsnz-covid-xmiles/merged_gdelt/gdelt-au-18-21.csv"))

In [None]:
%%time
ca_18_21_dfs = get_time_aggregated_dfs(fs.open("s3://statsnz-covid-xmiles/merged_gdelt/gdelt-ca-18-21.csv"))

In [None]:
nz_18_21_dfs2 = get_time_aggregated_dfs_s3("statsnz-covid-xmiles", "merged_gdelt/gdelt-nz-18-21.csv")

In [None]:
daily_hist_count = pd.concat({'nz': nz_18_21_dfs[2], 'au': au_18_21_dfs[2], 'ca': ca_18_21_dfs[2]}, axis=1)

daily_hist_count_long = daily_hist_count.reset_index() \
                                        .melt("date", var_name="country",
                                              value_name="num_articles") \
                                        .dropna()

g = sns.lineplot(data=daily_hist_count_long.dropna(), x="date", y="num_articles", hue="country")
g.set(title="Daily number of news articles - historical");

In [None]:
weekly_hist_count = pd.concat({'nz': nz_18_21_dfs[3], 'au': au_18_21_dfs[3], 'ca': ca_18_21_dfs[3]}, axis=1)

weekly_hist_count_long = weekly_hist_count.reset_index() \
                                          .melt("date", var_name="country", 
                                                value_name="num_articles") \
                                          .dropna()

g = sns.lineplot(data=weekly_hist_count_long, x="date", y="num_articles", hue="country")
g.set(title="Weekly number of news articles - historical");

In [None]:
weekly_hist_count_long['year'] = weekly_hist_count_long['date'].dt.year
weekly_hist_count_long['daymonth'] = pd.to_datetime(2040*10000 + weekly_hist_count_long['date'].dt.month*100 + weekly_hist_count_long['date'].dt.day,
                                                    format="%Y%m%d")

fig, axes = plt.subplots(2, 2, figsize=(24, 16), sharey=True)
ax = axes.ravel()
for axi in ax[:3]:
    axi.xaxis.set_major_formatter(mpl.dates.DateFormatter("%d-%b"))

g_nz = sns.lineplot(data=weekly_hist_count_long[weekly_hist_count_long['country'] == 'nz'],
                    x="daymonth", y="num_articles", hue="year", ax=ax[0])
g_nz.set(title="Weekly number of NZ news articles, by year",
         xlabel="", ylabel="Number of articles")

g_au = sns.lineplot(data=weekly_hist_count_long[weekly_hist_count_long['country'] == 'au'],
                    x="daymonth", y="num_articles", hue="year", ax=ax[1])
g_au.set(title="Weekly number of AU news articles, by year", 
         xlabel="", ylabel="Number of articles")

g_ca = sns.lineplot(data=weekly_hist_count_long[weekly_hist_count_long['country'] == 'ca'],
                    x="daymonth", y="num_articles", hue="year", ax=ax[2])
g_ca.set(title="Weekly number of CA news articles, by year", 
         xlabel="", ylabel="Number of articles");

In [None]:
weekly_hist_count_long['daymonth'].dt.year.unique()

In [None]:
pd.to_datetime({'year': itertools.repeat(2040), 'month': weekly_hist_count_long['date'].dt.month, 'day'})