In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, plot_importance, plot_tree
from tqdm import tqdm_notebook
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import math
import datetime as dt
import joblib

%matplotlib qt5

In [2]:
def get_mov_avg_std(df, col, N):
    """
    Given a dataframe, get mean and std dev at timestep t using values from t-1, t-2, ..., t-N.
    Inputs
        df         : dataframe. Can be of any length.
        col        : name of the column you want to calculate mean and std dev
        N          : get mean and std dev at timestep t using values from t-1, t-2, ..., t-N
    Outputs
        df_out     : same as df but with additional column containing mean and std dev
    """
    mean_list = df[col].rolling(window = N, min_periods=1).mean() # len(mean_list) = len(df)
    std_list = df[col].rolling(window = N, min_periods=1).std()   # first value will be NaN, because normalized by N-1
    
    # Add one timestep to the predictions
    mean_list = np.concatenate((np.array([np.nan]), np.array(mean_list[:-1])))
    std_list = np.concatenate((np.array([np.nan]), np.array(std_list[:-1])))
    
    # Append mean_list to df
    df_out = df.copy()
    df_out[col + '_mean'] = mean_list
    df_out[col + '_std'] = std_list
    
    return df_out

def scale_row(row, feat_mean, feat_std):
    """
    Given a pandas series in row, scale it to have 0 mean and var 1 using feat_mean and feat_std
    Inputs
        row      : pandas series. Need to scale this.
        feat_mean: mean  
        feat_std : standard deviation
    Outputs
        row_scaled : pandas series with same length as row, but scaled
    """
    # If feat_std = 0 (this happens if adj_close doesn't change over N days), 
    # set it to a small number to avoid division by zero
    feat_std = 0.001 if feat_std == 0 else feat_std
    
    row_scaled = (row-feat_mean) / feat_std
    
    return row_scaled

In [None]:
def get_mov_avg_std(df, col, N):
    """
    Given a dataframe, get mean and std dev at timestep t using values from t-1, t-2, ..., t-N.
    Inputs
        df         : dataframe. Can be of any length.
        col        : name of the column you want to calculate mean and std dev
        N          : get mean and std dev at timestep t using values from t-1, t-2, ..., t-N
    Outputs
        df_out     : same as df but with additional column containing mean and std dev
    """
    mean_list = df[col].rolling(window = N, min_periods=1).mean() # len(mean_list) = len(df)
    std_list = df[col].rolling(window = N, min_periods=1).std()   # first value will be NaN, because normalized by N-1
    
    # Add one timestep to the predictions
    mean_list = np.concatenate((np.array([np.nan]), np.array(mean_list[:-1])))
    std_list = np.concatenate((np.array([np.nan]), np.array(std_list[:-1])))
    
    # Append mean_list to df
    df_out = df.copy()
    df_out[col + '_mean'] = mean_list
    df_out[col + '_std'] = std_list
    
    return df_out

def do_scaling(df, N):
    """
    Do scaling for the adj_close and lag cols
    """
    df.loc[:, 'adj_close_scaled'] = (df['adj_close'] - df['adj_close_mean']) / df['adj_close_std']
    for n in range(N,0,-1):
        df.loc[:, 'adj_close_scaled_lag_'+str(n)] = \
            (df['adj_close_lag_'+str(n)] - df['adj_close_mean']) / df['adj_close_std']
        
        # Remove adj_close_lag column which we don't need anymore
        df.drop(['adj_close_lag_'+str(n)], axis=1, inplace=True)

    return df

def pred_xgboost(model, N, H, prev_vals, prev_mean_val, prev_std_val):
    """
    Do recursive forecasting using xgboost
    Inputs
        model              : the xgboost model
        N                  : for feature at day t, we use lags from t-1, t-2, ..., t-N as features
        H                  : forecast horizon
        prev_vals          : numpy array. If predict at time t, 
                             prev_vals will contain the N unscaled values at t-1, t-2, ..., t-N
        prev_mean_val      : the mean of the unscaled values at t-1, t-2, ..., t-N
        prev_std_val       : the std deviation of the unscaled values at t-1, t-2, ..., t-N
    Outputs
        Times series of predictions. Numpy array of shape (H,). This is unscaled.
    """
    forecast = prev_vals.copy()

    for n in range(H):
        forecast_scaled = (forecast[-N:] - prev_mean_val) / prev_std_val
        
        # Create the features dataframe
        X = defaultdict(list)
        for n in range(N,0,-1):
            X['adj_close_scaled_lag_'+str(n)] = [forecast_scaled[-n]]
        X = pd.DataFrame(X)
             
        # Do prediction
        est_scaled = model.predict(X)
        
        # Unscale the prediction
        forecast = np.concatenate([forecast, 
                                   np.array((est_scaled * prev_std_val) + prev_mean_val).reshape(1,)])
        
        # Comp. new mean and std
        prev_mean_val = np.mean(forecast[-N:])
        prev_std_val = np.std(forecast[-N:])
           
    return forecast[-H:]

def train_pred_eval_model(X_train_scaled,
                          y_train_scaled,
                          y_test,
                          N,
                          H,
                          prev_vals,
                          prev_mean_val,
                          prev_std_val,
                          seed=100,
                          n_estimators=100,
                          max_depth=3,
                          learning_rate=0.1,
                          min_child_weight=1,
                          subsample=1,
                          colsample_bytree=1,
                          colsample_bylevel=1,
                          gamma=0):
    '''
    Train model, do prediction, scale back to original range and do evaluation
    Use XGBoost here.
    Inputs
        X_train_scaled     : features for training. Scaled to have mean 0 and variance 1
        y_train_scaled     : target for training. Scaled to have mean 0 and variance 1
        y_test             : target for test. Actual values, not scaled.
        N                  : for feature at day t, we use lags from t-1, t-2, ..., t-N as features
        H                  : forecast horizon
        prev_vals          : numpy array. If scaled[0] is at time t, prev_vals will contain the N-1 unscaled values at t-1, t-2, ...
        prev_mean_val      : the mean of the unscaled values at t-1, t-2, ..., t-N
        prev_std_val       : the std deviation of the unscaled values at t-1, t-2, ..., t-N
        seed               : model seed
        n_estimators       : number of boosted trees to fit
        max_depth          : maximum tree depth for base learners
        learning_rate      : boosting learning rate (xgb’s “eta”)
        min_child_weight   : minimum sum of instance weight(hessian) needed in a child
        subsample          : subsample ratio of the training instance
        colsample_bytree   : subsample ratio of columns when constructing each tree
        colsample_bylevel  : subsample ratio of columns for each split, in each level
        gamma              : 
    Outputs
        rmse               : root mean square error of y_test and est
        mape               : mean absolute percentage error of y_test and est
        mae                : mean absolute error of y_test and est
        est                : predicted values. Same length as y_test
    '''

    model = XGBRegressor(objective ='reg:squarederror',
                         seed=model_seed,
                         n_estimators=n_estimators,
                         max_depth=max_depth,
                         learning_rate=learning_rate,
                         min_child_weight=min_child_weight,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree,
                         colsample_bylevel=colsample_bylevel,
                         gamma=gamma)
    
    # Train the model
    model.fit(X_train_scaled, y_train_scaled)
    
    # Get predicted labels and scale back to original range
    est = pred_xgboost(model, N, H, prev_vals, prev_mean_val, prev_std_val)

    # Calculate RMSE, MAPE, MAE
    rmse = get_rmse(y_test, est)
    mape = get_mape(y_test, est)
    mae = get_mae(y_test, est)
    
    return rmse, mape, mae, est, model.feature_importances_

def add_lags(df, N, lag_cols):
    """
    Add lags up to N number of days to use as features
    The lag columns are labelled as 'adj_close_lag_1', 'adj_close_lag_2', ... etc.
    """
    # Use lags up to N number of days to use as features
    df_w_lags = df.copy()
    df_w_lags.loc[:, 'order_day'] = [x for x in list(range(len(df)))] # Add a column 'order_day' to indicate the order of the rows by date
    merging_keys = ['order_day'] # merging_keys
    shift_range = [x+1 for x in range(N)]
    for shift in shift_range:
        train_shift = df_w_lags[merging_keys + lag_cols].copy()
    
        # E.g. order_day of 0 becomes 1, for shift = 1.
        # So when this is merged with order_day of 1 in df_w_lags, this will represent lag of 1.
        train_shift['order_day'] = train_shift['order_day'] + shift
    
        foo = lambda x: '{}_lag_{}'.format(x, shift) if x in lag_cols else x
        train_shift = train_shift.rename(columns=foo)

        df_w_lags = pd.merge(df_w_lags, train_shift, on=merging_keys, how='left') #.fillna(0)
    del train_shift
    
    return df_w_lags

def get_error_metrics(df,
                      train_size,
                      N,
                      H,
                      seed=100,
                      n_estimators=100,
                      max_depth=3,
                      learning_rate=0.1,
                      min_child_weight=1,
                      subsample=1,
                      colsample_bytree=1,
                      colsample_bylevel=1,
                      gamma=0):
    """
    Given a series consisting of both train+validation, do predictions of forecast horizon H on the validation set, 
    at H/2 intervals.
    Inputs
        df                 : train + val dataframe. len(df) = train_size + val_size
        train_size         : size of train set
        N                  : for feature at day t, we use lags from t-1, t-2, ..., t-N as features
        H                  : forecast horizon
        seed               : model seed
        n_estimators       : number of boosted trees to fit
        max_depth          : maximum tree depth for base learners
        learning_rate      : boosting learning rate (xgb’s “eta”)
        min_child_weight   : minimum sum of instance weight(hessian) needed in a child
        subsample          : subsample ratio of the training instance
        colsample_bytree   : subsample ratio of columns when constructing each tree
        colsample_bylevel  : subsample ratio of columns for each split, in each level
        gamma              : 

    Outputs
        mean of rmse, mean of mape, mean of mae, dictionary of predictions
    """
    rmse_list = [] # root mean square error
    mape_list = [] # mean absolute percentage error
    mae_list = []  # mean absolute error
    preds_dict = {}
    
    # Add lags up to N number of days to use as features
    df = add_lags(df, N, ['adj_close'])
    
    # Get mean and std dev at timestamp t using values from t-1, ..., t-N
    df = get_mov_avg_std(df, 'adj_close', N)
    
    # Do scaling
    df = do_scaling(df, N)
    
    # Get list of features
    features = []
    for n in range(N,0,-1):
        features.append("adj_close_scaled_lag_"+str(n))
    
    for i in range(train_size, len(df)-H+1, int(H/2)):
        # Split into train and test
        train = df[i-train_size:i].copy()
        test = df[i:i+H].copy()
    
        # Drop the NaNs in train
        train.dropna(axis=0, how='any', inplace=True)
    
        # Split into X and y
        X_train_scaled = train[features]
        y_train_scaled = train['adj_close_scaled']
        y_test = test['adj_close']
        prev_vals = train[-N:]['adj_close'].to_numpy()
        prev_mean_val = test.iloc[0]['adj_close_mean']
        prev_std_val = test.iloc[0]['adj_close_std']
            
        rmse, mape, mae, est, _ = train_pred_eval_model(X_train_scaled,
                                                        y_train_scaled,
                                                        y_test,
                                                        N,
                                                        H,
                                                        prev_vals,
                                                        prev_mean_val,
                                                        prev_std_val,
                                                        seed=seed,
                                                        n_estimators=n_estimators,
                                                        max_depth=max_depth,
                                                        learning_rate=learning_rate,
                                                        min_child_weight=min_child_weight,
                                                        subsample=subsample,
                                                        colsample_bytree=colsample_bytree,
                                                        colsample_bylevel=colsample_bylevel,
                                                        gamma=gamma)
#         print("N = " + str(N) + ", i = " + str(i) + ", rmse = " + str(rmse) + ", mape = " + str(mape) + ", mae = " + str(mae))
        
        rmse_list.append(rmse)
        mape_list.append(mape)
        mae_list.append(mae)
        preds_dict[i] = est
    
    return np.mean(rmse_list), np.mean(mape_list), np.mean(mae_list), preds_dict 

def get_error_metrics_one_pred(df,
                               train_size,
                               N,
                               H,
                               seed=100,
                               n_estimators=100,
                               max_depth=3,
                               learning_rate=0.1,
                               min_child_weight=1,
                               subsample=1,
                               colsample_bytree=1,
                               colsample_bylevel=1,
                               gamma=0):
    """
    Given a series consisting of both train+test, do one prediction of forecast horizon H on the test set.
    Inputs
        df                 : train + test dataframe. len(df) = train_size + test_size
        train_size         : size of train set
        N                  : for feature at day t, we use lags from t-1, t-2, ..., t-N as features
        H                  : forecast horizon
        seed               : model seed
        n_estimators       : number of boosted trees to fit
        max_depth          : maximum tree depth for base learners
        learning_rate      : boosting learning rate (xgb’s “eta”)
        min_child_weight   : minimum sum of instance weight(hessian) needed in a child
        subsample          : subsample ratio of the training instance
        colsample_bytree   : subsample ratio of columns when constructing each tree
        colsample_bylevel  : subsample ratio of columns for each split, in each level
        gamma              : 

    Outputs
        rmse, mape, mae, predictions
    """    
    # Add lags up to N number of days to use as features
    df = add_lags(df, N, ['adj_close'])
    
    # Get mean and std dev at timestamp t using values from t-1, ..., t-N
    df = get_mov_avg_std(df, 'adj_close', N)
    
    # Do scaling
    df = do_scaling(df, N)
    
    # Get list of features
    features = [] # features contain all features, including adj_close_lags
    for n in range(N,0,-1):
        features.append("adj_close_scaled_lag_"+str(n))
    
    # Split into train and test
    train = df[:train_size].copy()
    test = df[train_size:train_size+H].copy()
    
    # Drop the NaNs in train
    train.dropna(axis=0, how='any', inplace=True)
    
    # Split into X and y
    X_train_scaled = train[features]
    y_train_scaled = train['adj_close_scaled']
    y_test = test['adj_close']
    prev_vals = train[-N:]['adj_close'].to_numpy()
    prev_mean_val = test.iloc[0]['adj_close_mean']
    prev_std_val = test.iloc[0]['adj_close_std']
            
    rmse, mape, mae, est, feature_importances = train_pred_eval_model(X_train_scaled,
                                                                      y_train_scaled,
                                                                      y_test,
                                                                      N,
                                                                      H,
                                                                      prev_vals,
                                                                      prev_mean_val,
                                                                      prev_std_val,
                                                                      seed=seed,
                                                                      n_estimators=n_estimators,
                                                                      max_depth=max_depth,
                                                                      learning_rate=learning_rate,
                                                                      min_child_weight=min_child_weight,
                                                                      subsample=subsample,
                                                                      colsample_bytree=colsample_bytree,
                                                                      colsample_bylevel=colsample_bylevel,
                                                                      gamma=gamma)
    
    return rmse, mape, mae, est, feature_importances, features

In [None]:
def fit_save_model(df,
                   train_size,
                   N,
                   H,
                   outpath,
                   seed=100,
                   n_estimators=100,
                   max_depth=3,
                   learning_rate=0.1,
                   min_child_weight=1,
                   subsample=1,
                   colsample_bytree=1,
                   colsample_bylevel=1,
                   gamma=0):
    """
    Fit and save model
    Inputs
        df                 : train + test dataframe. len(df) = train_size + test_size
        train_size         : size of train set
        N                  : for feature at day t, we use lags from t-1, t-2, ..., t-N as features
        H                  : forecast horizon
        outpath            : 
        seed               : model seed
        n_estimators       : number of boosted trees to fit
        max_depth          : maximum tree depth for base learners
        learning_rate      : boosting learning rate (xgb’s “eta”)
        min_child_weight   : minimum sum of instance weight(hessian) needed in a child
        subsample          : subsample ratio of the training instance
        colsample_bytree   : subsample ratio of columns when constructing each tree
        colsample_bylevel  : subsample ratio of columns for each split, in each level
        gamma              : 

    Outputs
        rmse, mape, mae, predictions
    """    
    # Add lags up to N number of days to use as features
    df = add_lags(df, N, ['adj_close'])
    
    # Get mean and std dev at timestamp t using values from t-1, ..., t-N
    df = get_mov_avg_std(df, 'adj_close', N)
    
    # Do scaling
    df = do_scaling(df, N)
    
    # Get list of features
    features = [] # features contain all features, including adj_close_lags
    for n in range(N,0,-1):
        features.append("adj_close_scaled_lag_"+str(n))
    
    # Split into train and test
    train = df[:train_size].copy()
    test = df[train_size:train_size+H].copy()
    
    # Drop the NaNs in train
    train.dropna(axis=0, how='any', inplace=True)
    
    # Split into X and y
    X_train_scaled = train[features]
    y_train_scaled = train['adj_close_scaled']
   
    # Generate model
    model = XGBRegressor(objective ='reg:squarederror',
                         seed=model_seed,
                         n_estimators=n_estimators,
                         max_depth=max_depth,
                         learning_rate=learning_rate,
                         min_child_weight=min_child_weight,
                         subsample=subsample,
                         colsample_bytree=colsample_bytree,
                         colsample_bylevel=colsample_bylevel,
                         gamma=gamma)
    
    # Train the model
    model.fit(X_train_scaled, y_train_scaled)
    
            
    joblib.dump(model, outpath)

In [3]:
df_covid = pd.read_csv("./input/covid_19_clear.csv", parse_dates=['Date'], infer_datetime_format=True)

In [4]:
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1793 entries, 0 to 1792
Data columns (total 8 columns):
 #   Column                                                              Non-Null Count  Dtype         
---  ------                                                              --------------  -----         
 0   Country                                                             1793 non-null   object        
 1   Date                                                                1793 non-null   datetime64[ns]
 2   Confirmed                                                           1793 non-null   int64         
 3   Deaths                                                              1793 non-null   int64         
 4   Recovered                                                           1642 non-null   float64       
 5   Case fatality rate of COVID-19 (%)                                  1793 non-null   float64       
 6   Daily new confirmed cases of COVID-19 (rolling 3-day aver

In [5]:
ax = df_covid[df_covid['Country'] == 'Brazil'].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax.set_xlabel("date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

In [6]:
df_info = pd.read_csv("./input/country_info.csv", parse_dates=['Lockdown Start Date'], infer_datetime_format=True)

In [7]:
df_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   Country                              54 non-null     object        
 1   Age 0-9                              50 non-null     float64       
 2   Age 10-19                            50 non-null     float64       
 3   Age 20-29                            50 non-null     float64       
 4   Age 30-39                            50 non-null     float64       
 5   Age 40-49                            50 non-null     float64       
 6   Age 50-59                            50 non-null     float64       
 7   Age 60-69                            50 non-null     float64       
 8   Age 70-79                            50 non-null     float64       
 9   Age >80                              50 non-null     float64       
 10  # People        

In [8]:
df = pd.merge(df_covid, df_info, left_on='Country', right_on='Country', how='inner', suffixes=('', ''))

In [9]:
del df_covid
del df_info

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1276 entries, 0 to 1275
Data columns (total 30 columns):
 #   Column                                                              Non-Null Count  Dtype         
---  ------                                                              --------------  -----         
 0   Country                                                             1276 non-null   object        
 1   Date                                                                1276 non-null   datetime64[ns]
 2   Confirmed                                                           1276 non-null   int64         
 3   Deaths                                                              1276 non-null   int64         
 4   Recovered                                                           1125 non-null   float64       
 5   Case fatality rate of COVID-19 (%)                                  1276 non-null   float64       
 6   Daily new confirmed cases of COVID-19 (rolling 3-day ave

In [11]:
df.loc[df['Date'].dt.month == 1, 'Country Temperature ºC'] = df['Temperature Jan (ºC)']
df.loc[df['Date'].dt.month == 2, 'Country Temperature ºC'] = df['Temperature Feb (ºC)']
df.loc[df['Date'].dt.month == 3, 'Country Temperature ºC'] = df['Temperature Mar (ºC)']
df.loc[df['Date'].dt.month == 4, 'Country Temperature ºC'] = df['Temperature Apr (ºC)']

In [12]:
df.drop(columns=['Temperature Jan (ºC)', 'Temperature Feb (ºC)', 'Temperature Mar (ºC)', 'Temperature Apr (ºC)', 'Deaths', 'Recovered', 'Daily new confirmed deaths due to COVID-19 (rolling 3-day average)', 'Hospital beds (per 1,000 people)', 'Lockdown Start Date'], inplace=True)

In [13]:
df.head()

Unnamed: 0,Country,Date,Confirmed,Case fatality rate of COVID-19 (%),Daily new confirmed cases of COVID-19 (rolling 3-day average),Age 0-9,Age 10-19,Age 20-29,Age 30-39,Age 40-49,...,Age 70-79,Age >80,# People,GDP,GDP per capta,Life expectancy,# Flight Passengers,Population density (people per km²),Lockdown Level,Country Temperature ºC
0,Argentina,2020-03-20,128,2.34375,26.333333,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,2234324.0,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6
1,Argentina,2020-03-21,158,1.898734,42.666667,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,2234324.0,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6
2,Argentina,2020-03-22,266,1.777778,46.0,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,2234324.0,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6
3,Argentina,2020-03-23,301,1.503759,47.666667,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,2234324.0,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6
4,Argentina,2020-03-24,387,1.328904,54.0,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,2234324.0,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6


In [14]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
df["country_code"] = lb_make.fit_transform(df["Country"])
df[["Country", "country_code"]]

Unnamed: 0,Country,country_code
0,Argentina,0
1,Argentina,0
2,Argentina,0
3,Argentina,0
4,Argentina,0
...,...,...
1271,Wuhan,53
1272,Wuhan,53
1273,Wuhan,53
1274,Wuhan,53


In [15]:
df.sort_values(by=['Country','Date'], inplace=True)

In [16]:
df

Unnamed: 0,Country,Date,Confirmed,Case fatality rate of COVID-19 (%),Daily new confirmed cases of COVID-19 (rolling 3-day average),Age 0-9,Age 10-19,Age 20-29,Age 30-39,Age 40-49,...,Age >80,# People,GDP,GDP per capta,Life expectancy,# Flight Passengers,Population density (people per km²),Lockdown Level,Country Temperature ºC,country_code
0,Argentina,2020-03-20,128,2.343750,26.333333,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
1,Argentina,2020-03-21,158,1.898734,42.666667,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
2,Argentina,2020-03-22,266,1.777778,46.000000,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
3,Argentina,2020-03-23,301,1.503759,47.666667,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
4,Argentina,2020-03-24,387,1.328904,54.000000,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1271,Wuhan,2020-03-28,67801,4.685772,4.666667,,,,,,...,,,224000.000000,16528.000000,,,7000.000000,3,10.6,53
1272,Wuhan,2020-03-29,67801,4.693146,4.333333,,,,,,...,,,224000.000000,16528.000000,,,7000.000000,3,10.6,53
1273,Wuhan,2020-03-30,67801,4.699046,4.000000,,,,,,...,,,224000.000000,16528.000000,,,7000.000000,3,10.6,53
1274,Wuhan,2020-03-31,67801,4.700521,3.333333,,,,,,...,,,224000.000000,16528.000000,,,7000.000000,3,10.6,53


# Feature Engeneering 

In [17]:
# Add a column 'order_day' to indicate the order of the rows by date
for country in df['Country'].unique():
    df.loc[df['Country'] == country, 'order_day'] = [x for x in list(range(len( df.loc[df['Country'] == country, :])))]

# merging_keys
merging_keys = ['Country', 'country_code','order_day']

# List of columns that we will use to create lags
lag_cols = df.columns.values.tolist()[2:22]
lag_cols

['Confirmed',
 'Case fatality rate of COVID-19 (%)',
 'Daily new confirmed cases of COVID-19 (rolling 3-day average)',
 'Age 0-9',
 'Age 10-19',
 'Age 20-29',
 'Age 30-39',
 'Age 40-49',
 'Age 50-59',
 'Age 60-69',
 'Age 70-79',
 'Age >80',
 '# People',
 'GDP',
 'GDP per capta',
 'Life expectancy',
 '# Flight Passengers',
 'Population density (people per km²)',
 'Lockdown Level',
 'Country Temperature ºC']

In [18]:
N = 4
shift_range = [x+1 for x in range(N)]

for shift in tqdm_notebook(shift_range):
    train_shift = df[merging_keys + lag_cols].copy()
    
    for country in df['Country'].unique():    
        # E.g. order_day of 0 becomes 1, for shift = 1.
        # So when this is merged with order_day of 1 in df, this will represent lag of 1.

        train_shift.loc[train_shift['Country'] == country, 'order_day'] =train_shift.loc[train_shift['Country'] == country, 'order_day'] + shift

    foo = lambda x: '{}_lag_{}'.format(x, shift) if x in lag_cols else x
    train_shift = train_shift.rename(columns=foo)

    df = pd.merge(df, train_shift, on=merging_keys, how='left')
    
del train_shift

# Remove the first N rows which contain NaNs
df = df[N:]


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [19]:
df.dropna(inplace=True)

In [4]:
df[df['Country'] == 'United States'].head()

NameError: name 'df' is not defined

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 824 entries, 4 to 1204
Columns: 104 entries, Country to Country Temperature ºC_lag_4
dtypes: datetime64[ns](1), float64(99), int64(3), object(1)
memory usage: 675.9+ KB


# Get mean and std dev at timestamp t using values from t-1, ..., t-N

In [22]:
cols_list = lag_cols

for col in cols_list:
    df = get_mov_avg_std(df, col, N)
df.head()

Unnamed: 0,Country,Date,Confirmed,Case fatality rate of COVID-19 (%),Daily new confirmed cases of COVID-19 (rolling 3-day average),Age 0-9,Age 10-19,Age 20-29,Age 30-39,Age 40-49,...,Life expectancy_mean,Life expectancy_std,# Flight Passengers_mean,# Flight Passengers_std,Population density (people per km²)_mean,Population density (people per km²)_std,Lockdown Level_mean,Lockdown Level_std,Country Temperature ºC_mean,Country Temperature ºC_std
4,Argentina,2020-03-24,387,1.328904,54.0,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,,,,,,,,,,
5,Argentina,2020-03-25,387,1.550388,78.666667,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,76.372,,18081937.0,,16.176856,,3.0,,51.6,
6,Argentina,2020-03-26,502,1.593625,96.0,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,76.372,0.0,18081937.0,0.0,16.176856,0.0,3.0,0.0,51.6,0.0
7,Argentina,2020-03-27,589,2.037351,101.0,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,76.372,0.0,18081937.0,0.0,16.176856,0.0,3.0,0.0,51.6,0.0
8,Argentina,2020-03-28,690,2.463768,81.0,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,76.372,0.0,18081937.0,0.0,16.176856,0.0,3.0,0.0,51.6,0.0


# Split into train, validation and test set

In [23]:
valid_date = dt.datetime.today() - dt.timedelta(days=7)

mask_valid = (df['Date'] >= valid_date )
mask_valid_br = ((df['Date'] >= valid_date ) & (df['Country'] == 'Brazil'))
mask_train = (df['Date'] < valid_date )

# Split into train, valid, and test
train = df.loc[mask_train]
valid = df.loc[mask_valid]
valid_br = df[mask_valid_br]

print("train.shape = " + str(train.shape))
print("valid.shape = " + str(valid.shape))
print("valid_br.shape = " + str(valid_br.shape))

train.shape = (554, 144)
valid.shape = (270, 144)
valid_br.shape = (6, 144)


# Scale the train, validation and test set¶

In [24]:
cols_to_scale = ['Confirmed']

for i in range(1,N+1):
    for col in lag_cols:
        print(col + "_lag_"+ str(i))
        cols_to_scale.append(col + "_lag_" + str(i))

# Do scaling for train set
# Here we only scale the train dataset, and not the entire dataset to prevent information leak
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[cols_to_scale])
print("scaler.mean_ = " + str(scaler.mean_))
print("scaler.var_ = " + str(scaler.var_))
print("train_scaled.shape = " + str(train_scaled.shape))

# Convert the numpy array back into pandas dataframe
train_scaled = pd.DataFrame(train_scaled, columns=cols_to_scale)
train_scaled[['Date', 'country_code']] = train.reset_index()[['Date', 'country_code']]
print("train_scaled.shape = " + str(train_scaled.shape))
train_scaled.head()

Confirmed_lag_1
Case fatality rate of COVID-19 (%)_lag_1
Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_1
Age 0-9_lag_1
Age 10-19_lag_1
Age 20-29_lag_1
Age 30-39_lag_1
Age 40-49_lag_1
Age 50-59_lag_1
Age 60-69_lag_1
Age 70-79_lag_1
Age >80_lag_1
# People_lag_1
GDP_lag_1
GDP per capta_lag_1
Life expectancy_lag_1
# Flight Passengers_lag_1
Population density (people per km²)_lag_1
Lockdown Level_lag_1
Country Temperature ºC_lag_1
Confirmed_lag_2
Case fatality rate of COVID-19 (%)_lag_2
Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_2
Age 0-9_lag_2
Age 10-19_lag_2
Age 20-29_lag_2
Age 30-39_lag_2
Age 40-49_lag_2
Age 50-59_lag_2
Age 60-69_lag_2
Age 70-79_lag_2
Age >80_lag_2
# People_lag_2
GDP_lag_2
GDP per capta_lag_2
Life expectancy_lag_2
# Flight Passengers_lag_2
Population density (people per km²)_lag_2
Lockdown Level_lag_2
Country Temperature ºC_lag_2
Confirmed_lag_3
Case fatality rate of COVID-19 (%)_lag_3
Daily new confirmed cases of COVID-19 (roll

Unnamed: 0,Confirmed,Confirmed_lag_1,Case fatality rate of COVID-19 (%)_lag_1,Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_1,Age 0-9_lag_1,Age 10-19_lag_1,Age 20-29_lag_1,Age 30-39_lag_1,Age 40-49_lag_1,Age 50-59_lag_1,...,# People_lag_4,GDP_lag_4,GDP per capta_lag_4,Life expectancy_lag_4,# Flight Passengers_lag_4,Population density (people per km²)_lag_4,Lockdown Level_lag_4,Country Temperature ºC_lag_4,Date,country_code
0,-0.491338,-0.472478,-0.167025,-0.446831,-0.369996,-0.375977,-0.388546,-0.399951,-0.40937,-0.421238,...,-0.407187,-0.523345,-1.061123,-0.872975,-0.583427,-0.31496,0.736817,1.296907,2020-03-24,0
1,-0.491338,-0.468567,-0.255131,-0.442232,-0.369996,-0.375977,-0.388546,-0.399951,-0.40937,-0.421238,...,-0.407187,-0.523345,-1.061123,-0.872975,-0.583427,-0.31496,0.736817,1.296907,2020-03-25,0
2,-0.486264,-0.468567,-0.14353,-0.424324,-0.369996,-0.375977,-0.388546,-0.399951,-0.40937,-0.421238,...,-0.407187,-0.523345,-1.061123,-0.872975,-0.583427,-0.31496,0.736817,1.296907,2020-03-26,0
3,-0.497383,-0.477071,0.044261,-0.460867,-0.443287,-0.448159,-0.445943,-0.438738,-0.442834,-0.440541,...,-0.450754,-0.478752,1.072216,0.825359,-0.330667,-0.323516,0.736817,1.32719,2020-03-14,1
4,-0.495309,-0.474797,-0.157409,-0.451671,-0.443287,-0.448159,-0.445943,-0.438738,-0.442834,-0.440541,...,-0.450754,-0.478752,1.072216,0.825359,-0.330667,-0.323516,0.736817,1.32719,2020-03-15,1


In [25]:
valid_scaled = valid[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_scaled = pd.concat([valid_scaled, temp], axis=1)
    
# Now the entire valid set is scaled
valid_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
7,2020-03-27,0,1.154701,-0.57735,-0.57735,-1.872623,0.722856,0.418386,-1.141242,0.090044,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,2020-03-28,0,1.250497,0.364198,-0.807347,-0.807347,1.380777,-0.114367,-0.260058,-1.006352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,2020-03-29,0,1.149419,0.365018,-0.310654,-1.203783,1.287655,0.293822,-0.740352,-0.841125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,2020-03-30,0,1.052597,0.542528,-0.394144,-1.20098,0.884504,0.687701,-0.281718,-1.290487,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,2020-03-31,0,1.121034,0.34968,-0.215979,-1.254735,0.290547,0.777579,0.398811,-1.466937,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
valid_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 270 entries, 7 to 1204
Data columns (total 82 columns):
 #   Column                                                               Non-Null Count  Dtype         
---  ------                                                               --------------  -----         
 0   Date                                                                 270 non-null    datetime64[ns]
 1   country_code                                                         270 non-null    int64         
 2   Confirmed_lag_1                                                      270 non-null    float64       
 3   Confirmed_lag_2                                                      270 non-null    float64       
 4   Confirmed_lag_3                                                      270 non-null    float64       
 5   Confirmed_lag_4                                                      270 non-null    float64       
 6   Case fatality rate of COVID-19 (%)_lag_1         

In [27]:
valid_br_scaled = valid_br[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_br.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_br_scaled = pd.concat([valid_br_scaled, temp], axis=1)
    
# Now the entire valid_br set is scaled
valid_br_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
98,2020-03-27,4,1.233418,0.27987,-0.39934,-1.113948,1.190021,0.39999,-0.512347,-1.077664,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11
99,2020-03-28,4,1.207609,0.361058,-0.483533,-1.085133,1.17786,0.346935,-0.356328,-1.168466,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11
100,2020-03-29,4,1.190206,0.348943,-0.397311,-1.141838,0.893473,0.712068,-0.352335,-1.253206,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11
101,2020-03-30,4,1.106718,0.473794,-0.401871,-1.178641,1.140643,0.181995,-0.032382,-1.290257,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11
102,2020-03-31,4,1.084493,0.435806,-0.271123,-1.249176,1.316881,0.22813,-0.671874,-0.873137,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11


# Split into X and y

In [28]:
features = ['country_code']
for i in range(1,N+1):
    for col in lag_cols:
        features.append(col + "_lag_" + str(i))
    
target = "Confirmed"


# Split into X and y
X_train = train[features]
y_train = train[target]
X_valid = valid[features]
y_valid = valid[target]
X_valid_br = valid_br[features]
y_valid_br = valid_br[target]

print("X_train.shape = " + str(X_train.shape))
print("y_train.shape = " + str(y_train.shape))
print("X_valid.shape = " + str(X_valid.shape))
print("y_valid_br.shape = " + str(y_valid_br.shape))
print("X_valid_br.shape = " + str(X_valid_br.shape))
print("y_valid.shape = " + str(y_valid.shape))

X_train.shape = (554, 81)
y_train.shape = (554,)
X_valid.shape = (270, 81)
y_valid_br.shape = (6,)
X_valid_br.shape = (6, 81)
y_valid.shape = (270,)


In [29]:
# Split into X and y
X_train_scaled = train_scaled[features]
y_train_scaled = train_scaled[target]
X_valid_scaled = valid_scaled[features]
X_valid_br_scaled = valid_br_scaled[features]

print("X_train_scaled.shape = " + str(X_train_scaled.shape))
print("y_train_scaled.shape = " + str(y_train_scaled.shape))
print("X_valid_scaled.shape = " + str(X_valid_scaled.shape))
print("X_valid_br_scaled.shape = " + str(X_valid_br_scaled.shape))

X_train_scaled.shape = (554, 81)
y_train_scaled.shape = (554,)
X_valid_scaled.shape = (270, 81)
X_valid_br_scaled.shape = (6, 81)


# EDA

In [30]:
mask_train = (train['Country'] == 'Brazil')
mask_valid = (valid['Country'] == 'Brazil')

ax = train.loc[mask_train].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid.loc[mask_valid].plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax.legend(['train', 'validation'])
ax.set_xlabel("date")
ax.set_ylabel("Confirmed")
ax.set_title("Without scaling")

Text(0.5, 1.0, 'Without scaling')

In [31]:
code = valid[valid['Country'] == 'Brazil']['country_code'].unique()
ax = train_scaled[train_scaled['country_code'] ==  code[0]].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax.legend(['train_scaled'])
ax.set_xlabel("date")
ax.set_ylabel("Confirmed (scaled)")
ax.set_title("With scaling")

Text(0.5, 1.0, 'With scaling')

# Train the model using XGBoost


In [32]:
n_estimators = 100             # Number of boosted trees to fit. default = 100
max_depth = 3                  # Maximum tree depth for base learners. default = 3
learning_rate = 0.1            # Boosting learning rate (xgb’s “eta”). default = 0.1
min_child_weight = 1           # Minimum sum of instance weight(hessian) needed in a child. default = 1
subsample = 1                  # Subsample ratio of the training instance. default = 1
colsample_bytree = 1           # Subsample ratio of columns when constructing each tree. default = 1
colsample_bylevel = 1          # Subsample ratio of columns for each split, in each level. default = 1
gamma = 0  
model_seed = 100


In [33]:
X_train_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 554 entries, 0 to 553
Data columns (total 81 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   country_code                                                         554 non-null    int64  
 1   Confirmed_lag_1                                                      554 non-null    float64
 2   Case fatality rate of COVID-19 (%)_lag_1                             554 non-null    float64
 3   Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_1  554 non-null    float64
 4   Age 0-9_lag_1                                                        554 non-null    float64
 5   Age 10-19_lag_1                                                      554 non-null    float64
 6   Age 20-29_lag_1                                                      554 non-null    float64
 7   Age 30-3

In [34]:
# Create the model
model = XGBRegressor(seed=model_seed,
                     n_estimators=n_estimators,
                     max_depth=max_depth,
                     learning_rate=learning_rate,
                     min_child_weight=min_child_weight,
                     subsample=subsample,
                     colsample_bytree=colsample_bytree,
                     colsample_bylevel=colsample_bylevel,
                     gamma=gamma)

# Train the regressor
model.fit(X_train_scaled, y_train_scaled)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=100, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=100, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=None)

# Predict on train set

In [35]:
def get_mape(y_true, y_pred): 
    """
    Compute mean absolute percentage error (MAPE)
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [36]:
# Do prediction on train set
est_scaled = model.predict(X_train_scaled)
est = est_scaled * math.sqrt(scaler.var_[0]) + scaler.mean_[0]

# Calculate RMSE
print("RMSE on train set = %0.3f" % math.sqrt(mean_squared_error(y_train, est)))

# Calculate MAPE
print("MAPE on train set = %0.3f%%" % get_mape(y_train, est))

RMSE on train set = 227.808
MAPE on train set = 6.999%


In [37]:
est_df = pd.DataFrame({'est': est, 
                       'Date': train['Date'], 
                       'Country': train['Country']})

country = 'Brazil'
mask_train = (train['Country'] == country)
mask_valid = (valid['Country'] == country)
mask_est = (est_df['Country'] == country)

ax = train.loc[mask_train].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid.loc[mask_valid].plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = est_df.loc[mask_est].plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

# Predict on valid set brasil

In [38]:
valid_br

Unnamed: 0,Country,Date,Confirmed,Case fatality rate of COVID-19 (%),Daily new confirmed cases of COVID-19 (rolling 3-day average),Age 0-9,Age 10-19,Age 20-29,Age 30-39,Age 40-49,...,Life expectancy_mean,Life expectancy_std,# Flight Passengers_mean,# Flight Passengers_std,Population density (people per km²)_mean,Population density (people per km²)_std,Lockdown Level_mean,Lockdown Level_std,Country Temperature ºC_mean,Country Temperature ºC_std
98,Brazil,2020-03-27,3417,2.641509,405.333333,29188180.0,31633075.0,34181400.0,34436184.0,28902917.0,...,75.456,5.308942e-07,102109977.0,0.57735,25.040054,0.0,2.0,0.0,59.5,0.0
99,Brazil,2020-03-28,3904,2.69242,490.333333,29188180.0,31633075.0,34181400.0,34436184.0,28902917.0,...,75.456,5.308942e-07,102109977.0,0.57735,25.040054,0.0,2.0,0.0,59.5,0.0
100,Brazil,2020-03-29,4256,2.920082,447.0,29188180.0,31633075.0,34181400.0,34436184.0,28902917.0,...,75.456,5.308942e-07,102109977.0,0.57735,25.040054,0.0,2.0,0.0,59.5,0.0
101,Brazil,2020-03-30,4579,3.195489,387.333333,29188180.0,31633075.0,34181400.0,34436184.0,28902917.0,...,75.456,5.308942e-07,102109977.0,0.57735,25.040054,0.0,2.0,0.0,59.5,0.0
102,Brazil,2020-03-31,5717,3.472374,604.333333,29188180.0,31633075.0,34181400.0,34436184.0,28902917.0,...,75.456,5.308942e-07,102109977.0,0.57735,25.040054,0.0,2.0,0.0,59.5,0.0
103,Brazil,2020-04-01,6836,3.51583,730.5,29188180.0,31633075.0,34181400.0,34436184.0,28902917.0,...,75.456,5.308942e-07,102109977.0,0.57735,25.040054,0.0,2.0,0.0,59.5,0.0


In [39]:
# Do prediction on test set
est_scaled = model.predict(X_valid_br_scaled)
valid_br['est_scaled'] = est_scaled
valid_br['est'] = valid_br['est_scaled'] * valid_br['Confirmed_std'] + valid_br['Confirmed_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_br, valid_br['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_br, valid_br['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

RMSE on dev set = 770.184
MAPE on dev set = 13.465%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [40]:
est_df = pd.DataFrame({'est': valid_br['est'], 
                       'y_valid_br': y_valid_br,
                       'Date': valid_br['Date']})

country = 'Brazil'
mask_train = (train['Country'] == country)

ax = train.loc[mask_train].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid_br.plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

In [41]:
est_df = pd.DataFrame({'est': valid_br['est'], 
                       'y_valid_br': y_valid_br,
                       'Date': valid_br['Date']})

ax = valid_br.plot(x='Date', y='Confirmed', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

In [42]:
_ = plot_importance(model, height=0.8)