In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rcParams['figure.dpi']= 150

In [3]:
import numpy as np
import pandas as pd

import ipywidgets as widgets
from ipywidgets import interact, fixed


# Data
# from transat.data import HYPOTHETICAL_SUBMISSION_DATE
from transat.data.load import download_historical, load_historical
from transat.data.split import split_historical
from transat.data.transform import preprocess_historical_basic, dataframe_to_array

# Metric
from transat.metric import mae

# Scenario/Simulation
from transat.data.scenario import generate_scenario

In [4]:
import tensorflow as tf
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [29]:
LATEST_DATA_URL = 'https://raw.githubusercontent.com/OxCGRT/covid-policy-tracker/master/data/OxCGRT_latest.csv'
NPI_COLUMNS = ['C1_School closing',
               'C2_Workplace closing',
               'C3_Cancel public events',
               'C4_Restrictions on gatherings',
               'C5_Close public transport',
               'C6_Stay at home requirements',
               'C7_Restrictions on internal movement',
               'C8_International travel controls',
               'H1_Public information campaigns',
               'H2_Testing policy',
               'H3_Contact tracing',
               'H6_Facial Coverings']
ID_COLUMNS = ['CountryName',
              'RegionName',
              'GeoID',
              'Date']
CASES_COLUMN = ['NewCases']
NUM_PREV_DAYS_TO_INCLUDE = 6
WINDOW_SIZE = 7
TRAINING_LAST_DATE = '2020-10-31'


In [5]:
HYPOTHETICAL_SUBMISSION_DATE = np.datetime64("2020-09-30")

In [25]:
download_historical()
df = load_historical()

In [19]:
df.columns

Index(['CountryName', 'CountryCode', 'RegionName', 'RegionCode',
       'Jurisdiction', 'Date', 'C1_School closing', 'C1_Flag',
       'C2_Workplace closing', 'C2_Flag', 'C3_Cancel public events', 'C3_Flag',
       'C4_Restrictions on gatherings', 'C4_Flag', 'C5_Close public transport',
       'C5_Flag', 'C6_Stay at home requirements', 'C6_Flag',
       'C7_Restrictions on internal movement', 'C7_Flag',
       'C8_International travel controls', 'E1_Income support', 'E1_Flag',
       'E2_Debt/contract relief', 'E3_Fiscal measures',
       'E4_International support', 'H1_Public information campaigns',
       'H1_Flag', 'H2_Testing policy', 'H3_Contact tracing',
       'H4_Emergency investment in healthcare', 'H5_Investment in vaccines',
       'H6_Facial Coverings', 'H6_Flag', 'H7_Vaccination policy', 'H7_Flag',
       'M1_Wildcard', 'ConfirmedCases', 'ConfirmedDeaths', 'StringencyIndex',
       'StringencyIndexForDisplay', 'StringencyLegacyIndex',
       'StringencyLegacyIndexForDispla

In [26]:
df['GeoID'] = df['CountryName'] + '__' + df['RegionName'].astype(str)

In [35]:
type(np.array(1))==np.ndarray

True

In [36]:
1==np.ndarray

False

In [30]:
data = df
nb_lookback_days = 30
# Create training data across all countries for predicting one day ahead
x_samples = []
y_samples = []
data = data.dropna(subset=['ConfirmedCases'])
geo_ids = data.GeoID.unique()
for g in geo_ids:
    gdf = data[data.GeoID == g]
    all_case_data = np.array(gdf[CASES_COLUMN])
    all_npi_data = np.array(gdf[NPI_COLUMNS])

    # Create one sample for each day where we have enough data
    # Each sample consists of cases and npis for previous nb_lookback_days
    nb_total_days = len(gdf)
    for d in range(nb_lookback_days, nb_total_days - 1):
        x_cases = all_case_data[d - nb_lookback_days:d]

        # Take negative of npis to support positive
        # weight constraint in Lasso.
        x_npis = -all_npi_data[d - nb_lookback_days:d]

        # Flatten all input data so it fits Lasso input format.
        x_sample = np.concatenate([x_cases.flatten(),
                                   x_npis.flatten()])
        y_sample = all_case_data[d + 1]
        x_samples.append(x_sample)
        y_samples.append(y_sample)

x_samples = np.array(x_samples)
y_samples = np.array(y_samples).flatten()


# Split data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x_samples,
                                                    y_samples,
                                                    test_size=0.2,
                                                    random_state=301)

KeyError: "None of [Index(['NewCases'], dtype='object')] are in the [columns]"

In [None]:
def train(self, data, verbose=False):
        """
            Train the model

            :param data: Training data
            :type data: pandas.DataFrame
            :param verbose: Whether to show traces for debug (True) or run in quiet mode (False)
            :type verbose: bool
        """
        # Set number of past days to use to make predictions
        nb_lookback_days = 30

        # Create training data across all countries for predicting one day ahead
        x_samples = []
        y_samples = []
        data = data.dropna(subset=['ConfirmedCases'])
        geo_ids = data.GeoID.unique()
        for g in geo_ids:
            gdf = data[data.GeoID == g]
            all_case_data = np.array(gdf[CASES_COLUMN])
            all_npi_data = np.array(gdf[NPI_COLUMNS])

            # Create one sample for each day where we have enough data
            # Each sample consists of cases and npis for previous nb_lookback_days
            nb_total_days = len(gdf)
            for d in range(nb_lookback_days, nb_total_days - 1):
                x_cases = all_case_data[d - nb_lookback_days:d]

                # Take negative of npis to support positive
                # weight constraint in Lasso.
                x_npis = -all_npi_data[d - nb_lookback_days:d]

                # Flatten all input data so it fits Lasso input format.
                x_sample = np.concatenate([x_cases.flatten(),
                                           x_npis.flatten()])
                y_sample = all_case_data[d + 1]
                x_samples.append(x_sample)
                y_samples.append(y_sample)

        x_samples = np.array(x_samples)
        y_samples = np.array(y_samples).flatten()
        
        
        # Split data into train and test sets
        x_train, x_test, y_train, y_test = train_test_split(x_samples,
                                                            y_samples,
                                                            test_size=0.2,
                                                            random_state=301)

        # Create and train Lasso model.
        # Set positive=True to enforce assumption that cases are positively correlated
        # with future cases and npis are negatively correlated.
        self._model = RandomForestRegressor(max_depth=10, max_leaf_nodes=50, n_estimators=10, criterion='mae')
        # Fit model
        self._model.fit(x_train, y_train)


        # Evaluate model
        train_preds = self._model.predict(x_train)
        train_preds = np.maximum(train_preds, 0)  # Don't predict negative cases
        if verbose:
            print('Train MAE:', self.mae(train_preds, y_train))

        test_preds = self._model.predict(x_test)
        test_preds = np.maximum(test_preds, 0)  # Don't predict negative cases
        if verbose:
            print('Test MAE:', self.mae(test_preds, y_test))

        # %%

        # Inspect the learned feature coefficients for the model
        # to see what features it's paying attention to.

        # Give names to the features
        x_col_names = []
        for d in range(-nb_lookback_days, 0):
            x_col_names.append('Day ' + str(d) + ' ' + CASES_COLUMN[0])
        for d in range(-nb_lookback_days, 1):
            for col_name in NPI_COLUMNS:
                x_col_names.append('Day ' + str(d) + ' ' + col_name)
        
        if verbose:
            print('Intercept', self._model.intercept_)

        # Save model to file
        self.save()

        # Save used data
        data_path = os.path.join(self._models_base_path, self._data_file)
        data.to_csv(data_path, index=False)