# Linear Model Tests
This notebook generates a lasso model and produces scores for each of the countries. The scores are the train/test MAEs and the "MPEs" (Mean Percentage Error: 100*MAE/Population). These scores are saved in `countrypops.csv`

In [1]:
import pickle
import os
import urllib.request
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

## Importing the Training Data

In [2]:
# Main source for the training data
DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
# Local files
data_path = 'examples/predictors/ryan_predictor/data'
DATA_FILE = data_path + '/OxCGRT_latest.csv'


if not os.path.exists(data_path):
    os.mkdir(data_path)
urllib.request.urlretrieve(DATA_URL, DATA_FILE)

('examples/predictors/ryan_predictor/data/OxCGRT_latest.csv',
 <http.client.HTTPMessage at 0x7f9a683ea3a0>)

In [3]:
df = pd.read_csv(DATA_FILE, 
                 parse_dates=['Date'],
                 encoding="ISO-8859-1",
                 dtype={"RegionName": str,
                        "RegionCode": str},
                 error_bad_lines=False)
# df[cases_df['RegionName'] == 'California']

In [4]:
df.columns

Index(['CountryName', 'CountryCode', 'RegionName', 'RegionCode',
       'Jurisdiction', 'Date', 'C1_School closing', 'C1_Flag',
       'C2_Workplace closing', 'C2_Flag', 'C3_Cancel public events', 'C3_Flag',
       'C4_Restrictions on gatherings', 'C4_Flag', 'C5_Close public transport',
       'C5_Flag', 'C6_Stay at home requirements', 'C6_Flag',
       'C7_Restrictions on internal movement', 'C7_Flag',
       'C8_International travel controls', 'E1_Income support', 'E1_Flag',
       'E2_Debt/contract relief', 'E3_Fiscal measures',
       'E4_International support', 'H1_Public information campaigns',
       'H1_Flag', 'H2_Testing policy', 'H3_Contact tracing',
       'H4_Emergency investment in healthcare', 'H5_Investment in vaccines',
       'H6_Facial Coverings', 'H6_Flag', 'H7_Vaccination policy', 'H7_Flag',
       'M1_Wildcard', 'ConfirmedCases', 'ConfirmedDeaths', 'StringencyIndex',
       'StringencyIndexForDisplay', 'StringencyLegacyIndex',
       'StringencyLegacyIndexForDispla

In [5]:
HYPOTHETICAL_SUBMISSION_DATE = np.datetime64("2020-07-31")
df = df[df.Date <= HYPOTHETICAL_SUBMISSION_DATE]

In [6]:
# Add RegionID column that combines CountryName and RegionName for easier manipulation of data
df['GeoID'] = df['CountryName'] + '__' + df['RegionName'].astype(str)

In [7]:
# Add new cases column
df['NewCases'] = df.groupby('GeoID').ConfirmedCases.diff().fillna(0)

In [8]:
# Keep only columns of interest
id_cols = ['CountryName',
           'RegionName',
           'GeoID',
           'Date']
cases_col = ['NewCases']
npi_cols = ['C1_School closing',
            'C2_Workplace closing',
            'C3_Cancel public events',
            'C4_Restrictions on gatherings',
            'C5_Close public transport',
            'C6_Stay at home requirements',
            'C7_Restrictions on internal movement',
            'C8_International travel controls',
            'H1_Public information campaigns',
            'H2_Testing policy',
            'H3_Contact tracing',
            'H6_Facial Coverings']
df = df[id_cols + cases_col + npi_cols]

In [9]:
# Fill any missing case values by interpolation and setting NaNs to 0
df.update(df.groupby('GeoID').NewCases.apply(
    lambda group: group.interpolate()).fillna(0))

In [10]:
# Fill any missing NPIs by assuming they are the same as previous day
for npi_col in npi_cols:
    df.update(df.groupby('GeoID')[npi_col].ffill().fillna(0))

In [11]:
df

Unnamed: 0,CountryName,RegionName,GeoID,Date,NewCases,C1_School closing,C2_Workplace closing,C3_Cancel public events,C4_Restrictions on gatherings,C5_Close public transport,C6_Stay at home requirements,C7_Restrictions on internal movement,C8_International travel controls,H1_Public information campaigns,H2_Testing policy,H3_Contact tracing,H6_Facial Coverings
0,Aruba,,Aruba__nan,2020-01-01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Aruba,,Aruba__nan,2020-01-02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Aruba,,Aruba__nan,2020-01-03,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Aruba,,Aruba__nan,2020-01-04,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Aruba,,Aruba__nan,2020-01-05,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
98137,Zimbabwe,,Zimbabwe__nan,2020-07-27,192.0,3.0,1.0,2.0,3.0,1.0,2.0,2.0,4.0,2.0,1.0,1.0,4.0
98138,Zimbabwe,,Zimbabwe__nan,2020-07-28,113.0,3.0,1.0,2.0,3.0,1.0,2.0,2.0,4.0,2.0,1.0,1.0,4.0
98139,Zimbabwe,,Zimbabwe__nan,2020-07-29,62.0,3.0,1.0,2.0,3.0,1.0,2.0,2.0,4.0,2.0,1.0,1.0,4.0
98140,Zimbabwe,,Zimbabwe__nan,2020-07-30,213.0,3.0,1.0,2.0,3.0,1.0,2.0,2.0,4.0,2.0,1.0,1.0,4.0


## Making the Model

In [12]:
scores_df = pd.DataFrame(columns = ['Country', 'TrainMAE', 'TestMAE'])

# Helpful function to compute mae
def mae(pred, true):
    return np.mean(np.abs(pred - true))

model = Lasso(alpha=0.1, precompute=True, max_iter=10000, positive=True, selection='random')
for country in df['CountryName'].unique().tolist():
    
    country_df = df[df['CountryName'] == country]
    
    # Set number of past days to use to make predictions
    nb_lookback_days = 30

    # Create training data across all countries for predicting one day ahead
    X_cols = cases_col + npi_cols
    y_col = cases_col
    X_samples = []
    y_samples = []
    geo_ids = country_df.GeoID.unique()
    for g in geo_ids:
        gdf = country_df[country_df.GeoID == g]
        all_case_data = np.array(gdf[cases_col])
        all_npi_data = np.array(gdf[npi_cols])

        # Create one sample for each day where we have enough data
        # Each sample consists of cases and npis for previous nb_lookback_days
        nb_total_days = len(gdf)
        for d in range(nb_lookback_days, nb_total_days - 1):
            X_cases = all_case_data[d-nb_lookback_days:d]

            # Take negative of npis to support positive
            # weight constraint in Lasso.
            X_npis = -all_npi_data[d - nb_lookback_days:d]

            # Flatten all input data so it fits Lasso input format.
            X_sample = np.concatenate([X_cases.flatten(),
                                       X_npis.flatten()])
            y_sample = all_case_data[d + 1]
            X_samples.append(X_sample)
            y_samples.append(y_sample)

    X_samples = np.array(X_samples)
    y_samples = np.array(y_samples).flatten()
    
    # Split data into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X_samples, y_samples, test_size=0.2, random_state=42)
    
    # Fit model
    model.fit(X_train, y_train)
    
    # Evaluate model
    train_preds = model.predict(X_train)
    train_preds = np.maximum(train_preds, 0) # Don't predict negative cases
#     print('Train MAE:', mae(train_preds, y_train))

    test_preds = model.predict(X_test)
    test_preds = np.maximum(test_preds, 0) # Don't predict negative cases
#     print('Test MAE:', mae(test_preds, y_test))
    
    score_df = pd.DataFrame([[country,
                              mae(train_preds, y_train),
                              mae(test_preds, y_test)]],
                            columns=['Country', 'TrainMAE', 'TestMAE'])
    scores_df = scores_df.append(score_df)
    
scores_df

  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(
  model = cd_fast.enet_coordinate_descent_gram(


Unnamed: 0,Country,TrainMAE,TestMAE
0,Aruba,0.670401,0.855306
0,Afghanistan,49.801459,51.351457
0,Angola,3.037700,3.604136
0,Albania,6.842001,7.985353
0,Andorra,2.933999,6.940295
...,...,...,...
0,Vanuatu,0.000000,0.000000
0,Yemen,5.639379,9.390156
0,South Africa,524.610277,502.554451
0,Zambia,27.409937,36.221025


In [36]:
og_df = pd.read_csv(DATA_FILE, 
                 parse_dates=['Date'],
                 encoding="ISO-8859-1",
                 dtype={"RegionName": str,
                        "RegionCode": str},
                 error_bad_lines=False)
og_df['GeoID'] = og_df['CountryName'] + '__' + og_df['RegionName'].astype(str)
geoid_cases = og_df.groupby('GeoID').agg({'ConfirmedCases':np.median}).reset_index()
geoid_cases = geoid_cases.merge(og_df[['GeoID','CountryName']], how='left', left_on='GeoID', right_on='GeoID')
geoid_cases = geoid_cases.groupby('CountryName').agg({'ConfirmedCases':np.sum}).reset_index()
geoid_cases

Unnamed: 0,CountryName,ConfirmedCases
0,Afghanistan,11464362.0
1,Albania,989469.0
2,Algeria,5440500.0
3,Andorra,300105.0
4,Angola,121446.0
...,...,...
179,Venezuela,2369250.0
180,Vietnam,124605.0
181,Yemen,438048.0
182,Zambia,572832.0


In [44]:
scores_df = scores_df.merge(geoid_cases, how='left', left_on='Country', right_on='CountryName').drop(['CountryName'], axis=1)
scores_df

Unnamed: 0,Country,TrainMAE,TestMAE,ConfirmedCases
0,Timor-Leste,0.254475,0.152499,8424.0
1,Greenland,0.129370,0.173346,4563.0
2,Macao,0.257970,0.189839,16146.0
3,Dominica,0.177353,0.193705,6318.0
4,Papua New Guinea,0.598286,0.202319,3861.0
...,...,...,...,...
175,Pakistan,402.460563,550.205477,79074333.0
176,India,625.245603,689.635949,236280915.0
177,Peru,775.515658,799.185622,104977080.0
178,Ecuador,358.336956,913.370391,21598785.0


In [51]:
scores_df['TrainMPE'] = 100*scores_df['TrainMAE']/scores_df['ConfirmedCases']
scores_df['TestMPE'] = 100*scores_df['TestMAE']/scores_df['ConfirmedCases']
scores_df.sort_values(by='TestMPE').reset_index()

Unnamed: 0,index,Country,TrainMAE,TestMAE,ConfirmedCases,TrainMPE,TestMPE
0,162,United States,203.377205,187.260836,1.988169e+09,0.000010,0.000009
1,170,Brazil,397.563187,428.478188,1.263662e+09,0.000031,0.000034
2,109,Canada,30.131341,31.077376,7.738953e+07,0.000039,0.000040
3,156,United Kingdom,154.302454,160.560882,2.016495e+08,0.000077,0.000080
4,168,Russia,310.542012,314.734937,2.364210e+08,0.000131,0.000133
...,...,...,...,...,...,...,...
175,23,Bahamas,2.040444,2.270344,3.650400e+04,0.005590,0.006219
176,119,Zambia,27.409937,36.221025,5.728320e+05,0.004785,0.006323
177,160,Kyrgyz Republic,250.130037,182.012063,2.489994e+06,0.010045,0.007310
178,28,Gambia,1.538566,2.641466,2.000700e+04,0.007690,0.013203


In [53]:
scores_df.sort_values(by='TestMPE').reset_index().to_csv('case_pred_errors_as_percent.csv', index=False)

In [58]:
top_50_country_list = scores_df.sort_values(by='TestMPE')['Country'].tolist()[:50]

In [59]:
import pickle
pickle.dump(top_50_country_list, open( "top_50_country_list.p", "wb" ) )

## Evaluating the Scores

In [13]:
scores_df = scores_df[scores_df['TestMAE'] != 0].sort_values(by='TestMAE')

In [14]:
country_pops = pd.read_csv('countrypops.csv')
country_pops = country_pops[['Country', 'Population']]

In [15]:
scores_df.head()

Unnamed: 0,Country,TrainMAE,TestMAE
0,Timor-Leste,0.254475,0.152499
0,Greenland,0.12937,0.173346
0,Macao,0.25797,0.189839
0,Dominica,0.177353,0.193705
0,Papua New Guinea,0.598286,0.202319


In [16]:
scores_w_pops = scores_df.merge(country_pops, how = 'left', left_on = 'Country', right_on = 'Country')

In [17]:
population_list = scores_w_pops['Population'].tolist()
for i, val in enumerate(population_list):
    if type(val) not in [int, float]:
        population_list[i] = float(val.replace(',',''))
#     else:
#         population_list[i] = 0
scores_w_pops['Population'] = population_list

In [18]:
#MPE = Mean Percentage Error (I made this term up)
scores_w_pops['TrainMPE'] = 100*scores_w_pops['TrainMAE']/scores_w_pops['Population']
scores_w_pops['TestMPE'] = 100*scores_w_pops['TestMAE']/scores_w_pops['Population']
scores_w_pops[['Country','TestMAE','Population', 'TestMPE']].sort_values(by='TestMPE')

Unnamed: 0,Country,TestMAE,Population,TestMPE
4,Papua New Guinea,0.202319,8947024.0,0.000002
26,Vietnam,2.425364,97338579.0,0.000002
6,Laos,0.246404,7275560.0,0.000003
29,Myanmar,2.890321,54409800.0,0.000005
16,Taiwan,1.729714,23816775.0,0.000007
...,...,...,...,...
97,Democratic Republic of Congo,23.371566,,
104,Palestine,27.094452,,
106,Cote d'Ivoire,27.507111,,
120,Czech Republic,45.966915,,


In [None]:
scores_w_pops['TrainMPE'] = 100*scores_w_pops['TrainMAE']/scores_w_pops['Population']
scores_w_pops['TestMPE'] = 100*scores_w_pops['TestMAE']/scores_w_pops['Population']

In [19]:
scores_w_pops.sort_values(by='TestMPE').to_csv('case_pred_errors_as_percent.csv')