In [10]:
import os
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle

In [11]:
top_50_country_list = pickle.load( open( "top_50_country_list.p", "rb" ) )

## Importing the Training Data

In [12]:
# Main source for the training data
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
# Local files
data_path = 'examples/predictors/ryan_predictor/data'
DATA_FILE = data_path + '/OxCGRT_latest.csv'


if not os.path.exists(data_path):
    os.mkdir(data_path)
urllib.request.urlretrieve(DATA_URL, DATA_FILE)

('examples/predictors/ryan_predictor/data/OxCGRT_latest.csv',
 <http.client.HTTPMessage at 0x7f83424134f0>)

In [13]:
df = pd.read_csv(DATA_FILE, 
                 parse_dates=['Date'],
                 encoding="ISO-8859-1",
                 dtype={"RegionName": str,
                        "RegionCode": str},
                 error_bad_lines=False)
# df[cases_df['RegionName'] == 'California']

In [14]:
HYPOTHETICAL_SUBMISSION_DATE = np.datetime64("2020-07-31")
df = df[df.Date <= HYPOTHETICAL_SUBMISSION_DATE]

In [15]:
# Add RegionID column that combines CountryName and RegionName for easier manipulation of data
df['GeoID'] = df['CountryName'] + '__' + df['RegionName'].astype(str)

In [16]:
# Add new cases column
df['NewCases'] = df.groupby('GeoID').ConfirmedCases.diff().fillna(0)

In [17]:
# Keep only columns of interest
id_cols = ['CountryName',
           'RegionName',
           'GeoID',
           'Date']
cases_col = ['NewCases']
npi_cols = ['C1_School closing',
            'C2_Workplace closing',
            'C3_Cancel public events',
            'C4_Restrictions on gatherings',
            'C5_Close public transport',
            'C6_Stay at home requirements',
            'C7_Restrictions on internal movement',
            'C8_International travel controls',
            'H1_Public information campaigns',
            'H2_Testing policy',
            'H3_Contact tracing',
            'H6_Facial Coverings']
df = df[id_cols + cases_col + npi_cols]

In [18]:
# Fill any missing case values by interpolation and setting NaNs to 0
df.update(df.groupby('GeoID').NewCases.apply(
    lambda group: group.interpolate()).fillna(0))

In [19]:
# Fill any missing NPIs by assuming they are the same as previous day
for npi_col in npi_cols:
    df.update(df.groupby('GeoID')[npi_col].ffill().fillna(0))

## Making the Model

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split

In [21]:
scores_df = pd.DataFrame(columns = ['Country', 'TrainMAE', 'TestMAE'])

# Helpful function to compute mae
def mae(pred, true):
    return np.mean(np.abs(pred - true))

model = RandomForestRegressor(max_depth=2, random_state=0)
for country in df['CountryName'].unique().tolist():
    
    country_df = df[df['CountryName'] == country]
    
    # Set number of past days to use to make predictions
    nb_lookback_days = 30

    # Create training data across all countries for predicting one day ahead
    X_cols = cases_col + npi_cols
    y_col = cases_col
    X_samples = []
    y_samples = []
    geo_ids = country_df.GeoID.unique()
    for g in geo_ids:
        gdf = country_df[country_df.GeoID == g]
        all_case_data = np.array(gdf[cases_col])
        all_npi_data = np.array(gdf[npi_cols])

        # Create one sample for each day where we have enough data
        # Each sample consists of cases and npis for previous nb_lookback_days
        nb_total_days = len(gdf)
        for d in range(nb_lookback_days, nb_total_days - 1):
            X_cases = all_case_data[d-nb_lookback_days:d]

            # Take negative of npis to support positive
            # weight constraint in Lasso.
            X_npis = -all_npi_data[d - nb_lookback_days:d]

            # Flatten all input data so it fits Lasso input format.
            X_sample = np.concatenate([X_cases.flatten(),
                                       X_npis.flatten()])
            y_sample = all_case_data[d + 1]
            X_samples.append(X_sample)
            y_samples.append(y_sample)

    X_samples = np.array(X_samples)
    y_samples = np.array(y_samples).flatten()
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_samples, y_samples, test_size=0.2, random_state=42)
    
    # Fit model
    model.fit(X_train, y_train)
    
    # Evaluate model
    train_preds = model.predict(X_train)
    train_preds = np.maximum(train_preds, 0) # Don't predict negative cases
#     print('Train MAE:', mae(train_preds, y_train))

    test_preds = model.predict(X_test)
    test_preds = np.maximum(test_preds, 0) # Don't predict negative cases
#     print('Test MAE:', mae(test_preds, y_test))
    
    score_df = pd.DataFrame([[country,
                              mae(train_preds, y_train),
                              mae(test_preds, y_test)]],
                            columns=['Country', 'TrainMAE', 'TestMAE'])
    scores_df = scores_df.append(score_df)
    
scores_df

Unnamed: 0,Country,TrainMAE,TestMAE
0,Aruba,0.586089,0.828543
0,Afghanistan,42.485853,55.167921
0,Angola,2.897659,3.216678
0,Albania,6.562182,6.602446
0,Andorra,2.047612,5.392337
...,...,...,...
0,Vanuatu,0.000000,0.000000
0,Yemen,4.862545,5.580506
0,South Africa,512.993864,494.911827
0,Zambia,29.666590,31.123916


## evaluating the scores

In [22]:
linear_results_df = pd.read_csv('case_pred_errors_as_percent.csv')

In [26]:
linear_results_df.drop(columns=['index'])

Unnamed: 0,Country,TrainMAE,TestMAE,ConfirmedCases,TrainMPE,TestMPE
0,United States,203.377205,187.260836,1.988169e+09,0.000010,0.000009
1,Brazil,397.563187,428.478188,1.263662e+09,0.000031,0.000034
2,Canada,30.131341,31.077376,7.738953e+07,0.000039,0.000040
3,United Kingdom,154.302454,160.560882,2.016495e+08,0.000077,0.000080
4,Russia,310.542012,314.734937,2.364210e+08,0.000131,0.000133
...,...,...,...,...,...,...
175,Bahamas,2.040444,2.270344,3.650400e+04,0.005590,0.006219
176,Zambia,27.409937,36.221025,5.728320e+05,0.004785,0.006323
177,Kyrgyz Republic,250.130037,182.012063,2.489994e+06,0.010045,0.007310
178,Gambia,1.538566,2.641466,2.000700e+04,0.007690,0.013203


In [27]:
linear_results_df.drop(columns=['index']).merge(scores_df, how='left', left_on='Country', right_on='Country')

Unnamed: 0,Country,TrainMAE_x,TestMAE_x,ConfirmedCases,TrainMPE,TestMPE,TrainMAE_y,TestMAE_y
0,United States,203.377205,187.260836,1.988169e+09,0.000010,0.000009,489.845164,448.583587
1,Brazil,397.563187,428.478188,1.263662e+09,0.000031,0.000034,607.251056,668.145489
2,Canada,30.131341,31.077376,7.738953e+07,0.000039,0.000040,32.092414,33.218141
3,United Kingdom,154.302454,160.560882,2.016495e+08,0.000077,0.000080,171.486494,225.008731
4,Russia,310.542012,314.734937,2.364210e+08,0.000131,0.000133,443.486582,542.151531
...,...,...,...,...,...,...,...,...
175,Bahamas,2.040444,2.270344,3.650400e+04,0.005590,0.006219,1.640215,1.444154
176,Zambia,27.409937,36.221025,5.728320e+05,0.004785,0.006323,29.666590,31.123916
177,Kyrgyz Republic,250.130037,182.012063,2.489994e+06,0.010045,0.007310,143.962036,86.048405
178,Gambia,1.538566,2.641466,2.000700e+04,0.007690,0.013203,1.907181,2.720085


In [28]:
linear_results_df.drop(columns=['index']).merge(scores_df, how='left', left_on='Country', right_on='Country').to_csv('lin_vs_rand_forest.csv')