In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, plot_importance, plot_tree
from tqdm import tqdm_notebook
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import math
import datetime as dt

%matplotlib qt5

In [2]:
def get_mov_avg_std(df, col, N):
    """
    Given a dataframe, get mean and std dev at timestep t using values from t-1, t-2, ..., t-N.
    Inputs
        df         : dataframe. Can be of any length.
        col        : name of the column you want to calculate mean and std dev
        N          : get mean and std dev at timestep t using values from t-1, t-2, ..., t-N
    Outputs
        df_out     : same as df but with additional column containing mean and std dev
    """
    mean_list = df[col].rolling(window = N, min_periods=1).mean() # len(mean_list) = len(df)
    std_list = df[col].rolling(window = N, min_periods=1).std()   # first value will be NaN, because normalized by N-1
    
    # Add one timestep to the predictions
    mean_list = np.concatenate((np.array([np.nan]), np.array(mean_list[:-1])))
    std_list = np.concatenate((np.array([np.nan]), np.array(std_list[:-1])))
    
    # Append mean_list to df
    df_out = df.copy()
    df_out[col + '_mean'] = mean_list
    df_out[col + '_std'] = std_list
    
    return df_out

def scale_row(row, feat_mean, feat_std):
    """
    Given a pandas series in row, scale it to have 0 mean and var 1 using feat_mean and feat_std
    Inputs
        row      : pandas series. Need to scale this.
        feat_mean: mean  
        feat_std : standard deviation
    Outputs
        row_scaled : pandas series with same length as row, but scaled
    """
    # If feat_std = 0 (this happens if adj_close doesn't change over N days), 
    # set it to a small number to avoid division by zero
    feat_std = 0.001 if feat_std == 0 else feat_std
    
    row_scaled = (row-feat_mean) / feat_std
    
    return row_scaled

In [3]:
df_covid = pd.read_csv("./input/covid_19_clear.csv", parse_dates=['Date'], infer_datetime_format=True)

In [4]:
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6516 entries, 0 to 6515
Data columns (total 8 columns):
 #   Column                                                              Non-Null Count  Dtype         
---  ------                                                              --------------  -----         
 0   Country                                                             6516 non-null   object        
 1   Date                                                                6516 non-null   datetime64[ns]
 2   Confirmed                                                           6516 non-null   int64         
 3   Deaths                                                              6516 non-null   int64         
 4   Recovered                                                           6364 non-null   float64       
 5   Case fatality rate of COVID-19 (%)                                  1794 non-null   float64       
 6   Daily new confirmed cases of COVID-19 (rolling 3-day aver

In [5]:
ax = df_covid[df_covid['Country'] == 'Brazil'].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax.set_xlabel("date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

In [6]:
df_info = pd.read_csv("./input/country_info.csv", parse_dates=['Lockdown Start Date'], infer_datetime_format=True)

In [7]:
df_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54 entries, 0 to 53
Data columns (total 23 columns):
 #   Column                               Non-Null Count  Dtype         
---  ------                               --------------  -----         
 0   Country                              54 non-null     object        
 1   Age 0-9                              50 non-null     float64       
 2   Age 10-19                            50 non-null     float64       
 3   Age 20-29                            50 non-null     float64       
 4   Age 30-39                            50 non-null     float64       
 5   Age 40-49                            50 non-null     float64       
 6   Age 50-59                            50 non-null     float64       
 7   Age 60-69                            50 non-null     float64       
 8   Age 70-79                            50 non-null     float64       
 9   Age >80                              50 non-null     float64       
 10  # People        

In [8]:
df = pd.merge(df_covid, df_info, left_on='Country', right_on='Country', how='inner', suffixes=('', ''))

In [9]:
del df_covid
del df_info

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3202 entries, 0 to 3201
Data columns (total 30 columns):
 #   Column                                                              Non-Null Count  Dtype         
---  ------                                                              --------------  -----         
 0   Country                                                             3202 non-null   object        
 1   Date                                                                3202 non-null   datetime64[ns]
 2   Confirmed                                                           3202 non-null   int64         
 3   Deaths                                                              3202 non-null   int64         
 4   Recovered                                                           3050 non-null   float64       
 5   Case fatality rate of COVID-19 (%)                                  1277 non-null   float64       
 6   Daily new confirmed cases of COVID-19 (rolling 3-day ave

In [11]:
n_cases = 50

df.loc[df['Date'].dt.month == 1, 'Country Temperature ºC'] = df['Temperature Jan (ºC)']
df.loc[df['Date'].dt.month == 2, 'Country Temperature ºC'] = df['Temperature Feb (ºC)']
df.loc[df['Date'].dt.month == 3, 'Country Temperature ºC'] = df['Temperature Mar (ºC)']
df.loc[df['Date'].dt.month == 4, 'Country Temperature ºC'] = df['Temperature Apr (ºC)']

df = df[df['Confirmed'] > n_cases]

In [12]:
from sklearn.preprocessing import LabelEncoder

lb_make = LabelEncoder()
df["country_code"] = lb_make.fit_transform(df["Country"])
df[["Country", "country_code"]]

Unnamed: 0,Country,country_code
12,Argentina,0
13,Argentina,0
14,Argentina,0
15,Argentina,0
16,Argentina,0
...,...,...
3197,Wuhan,53
3198,Wuhan,53
3199,Wuhan,53
3200,Wuhan,53


In [13]:
df.sort_values(by=['Country','Date'], inplace=True)

In [14]:
df_confirmed = df.drop(columns=['Temperature Jan (ºC)', 'Temperature Feb (ºC)', 'Temperature Mar (ºC)', 'Temperature Apr (ºC)', 'Deaths', 'Recovered', 'Daily new confirmed deaths due to COVID-19 (rolling 3-day average)', 'Hospital beds (per 1,000 people)', 'Lockdown Start Date'])
df_deaths = df.drop(columns=['Temperature Jan (ºC)', 'Temperature Feb (ºC)', 'Temperature Mar (ºC)', 'Temperature Apr (ºC)', 'Recovered', 'Lockdown Start Date'])

In [15]:
df_confirmed.head()

Unnamed: 0,Country,Date,Confirmed,Case fatality rate of COVID-19 (%),Daily new confirmed cases of COVID-19 (rolling 3-day average),Age 0-9,Age 10-19,Age 20-29,Age 30-39,Age 40-49,...,Age >80,# People,GDP,GDP per capta,Life expectancy,# Flight Passengers,Population density (people per km²),Lockdown Level,Country Temperature ºC,country_code
12,Argentina,2020-03-16,56,,10.333333,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
13,Argentina,2020-03-17,68,,11.333333,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
14,Argentina,2020-03-18,79,,13.666667,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
15,Argentina,2020-03-19,97,,21.0,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0
16,Argentina,2020-03-20,128,2.34375,26.333333,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,818779.0,4441.8446,813726.335884,10398.092465,76.372,18081937.0,16.176856,3,51.6,0


In [16]:
df_deaths.head()

Unnamed: 0,Country,Date,Confirmed,Deaths,Case fatality rate of COVID-19 (%),Daily new confirmed cases of COVID-19 (rolling 3-day average),Daily new confirmed deaths due to COVID-19 (rolling 3-day average),Age 0-9,Age 10-19,Age 20-29,...,# People,GDP,GDP per capta,Life expectancy,# Flight Passengers,"Hospital beds (per 1,000 people)",Population density (people per km²),Lockdown Level,Country Temperature ºC,country_code
12,Argentina,2020-03-16,56,2,,10.333333,0.0,7431085.0,7110303.0,6989730.0,...,4441.8446,813726.335884,10398.092465,76.372,18081937.0,6.352251,16.176856,3,51.6,0
13,Argentina,2020-03-17,68,2,,11.333333,0.0,7431085.0,7110303.0,6989730.0,...,4441.8446,813726.335884,10398.092465,76.372,18081937.0,6.352251,16.176856,3,51.6,0
14,Argentina,2020-03-18,79,2,,13.666667,0.0,7431085.0,7110303.0,6989730.0,...,4441.8446,813726.335884,10398.092465,76.372,18081937.0,6.352251,16.176856,3,51.6,0
15,Argentina,2020-03-19,97,3,,21.0,0.333333,7431085.0,7110303.0,6989730.0,...,4441.8446,813726.335884,10398.092465,76.372,18081937.0,6.352251,16.176856,3,51.6,0
16,Argentina,2020-03-20,128,3,2.34375,26.333333,0.333333,7431085.0,7110303.0,6989730.0,...,4441.8446,813726.335884,10398.092465,76.372,18081937.0,6.352251,16.176856,3,51.6,0


# Predicting Confirmed

## Feature Engeneering 

In [17]:
# Add a column 'order_day' to indicate the order of the rows by date
for country in df_confirmed['Country'].unique():
    df_confirmed.loc[df_confirmed['Country'] == country, 'order_day'] = [x for x in list(range(len( df_confirmed.loc[df_confirmed['Country'] == country, :])))]

# merging_keys
merging_keys = ['Country', 'country_code','order_day']

# List of columns that we will use to create lags
lag_cols = df_confirmed.columns.values.tolist()[2:22]
lag_cols

['Confirmed',
 'Case fatality rate of COVID-19 (%)',
 'Daily new confirmed cases of COVID-19 (rolling 3-day average)',
 'Age 0-9',
 'Age 10-19',
 'Age 20-29',
 'Age 30-39',
 'Age 40-49',
 'Age 50-59',
 'Age 60-69',
 'Age 70-79',
 'Age >80',
 '# People',
 'GDP',
 'GDP per capta',
 'Life expectancy',
 '# Flight Passengers',
 'Population density (people per km²)',
 'Lockdown Level',
 'Country Temperature ºC']

In [18]:
N = 4
shift_range = [x+1 for x in range(N)]

for shift in tqdm_notebook(shift_range):
    train_shift = df_confirmed[merging_keys + lag_cols].copy()
    
    for country in df_confirmed['Country'].unique():    
        # E.g. order_day of 0 becomes 1, for shift = 1.
        # So when this is merged with order_day of 1 in df_confirmed, this will represent lag of 1.

        train_shift.loc[train_shift['Country'] == country, 'order_day'] =train_shift.loc[train_shift['Country'] == country, 'order_day'] + shift

    foo = lambda x: '{}_lag_{}'.format(x, shift) if x in lag_cols else x
    train_shift = train_shift.rename(columns=foo)

    df_confirmed = pd.merge(df_confirmed, train_shift, on=merging_keys, how='left')
    
del train_shift

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




In [19]:
df_confirmed[df_confirmed['Country'] == 'United States']

Unnamed: 0,Country,Date,Confirmed,Case fatality rate of COVID-19 (%),Daily new confirmed cases of COVID-19 (rolling 3-day average),Age 0-9,Age 10-19,Age 20-29,Age 30-39,Age 40-49,...,Age 70-79_lag_4,Age >80_lag_4,# People_lag_4,GDP_lag_4,GDP per capta_lag_4,Life expectancy_lag_4,# Flight Passengers_lag_4,Population density (people per km²)_lag_4,Lockdown Level_lag_4,Country Temperature ºC_lag_4
1333,United States,2020-02-24,51,,6.0,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,,,,,,,,,,
1334,United States,2020-02-25,51,,6.0,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,,,,,,,,,,
1335,United States,2020-02-26,57,,8.0,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,,,,,,,,,,
1336,United States,2020-02-27,58,,2.333333,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,,,,,,,,,,
1337,United States,2020-02-28,60,,4.333333,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,23009234.0,9052463.0,32520.1971,18229480.0,53128.5397,78.861,889022000.0,35.607765,2.0,1.8
1338,United States,2020-02-29,68,,3.333333,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,23009234.0,9052463.0,32520.1971,18229480.0,53128.5397,78.861,889022000.0,35.607765,2.0,1.8
1339,United States,2020-03-01,74,,9.666667,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,23009234.0,9052463.0,32520.1971,18229480.0,53128.5397,78.861,889022000.0,35.607765,2.0,1.8
1340,United States,2020-03-02,98,,12.333333,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,23009234.0,9052463.0,32520.1971,18229480.0,53128.5397,78.861,889022000.0,35.607765,2.0,1.8
1341,United States,2020-03-03,118,5.825243,18.666667,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,23009234.0,9052463.0,32520.1971,18229480.0,53128.5397,78.861,889022000.0,35.607765,2.0,1.8
1342,United States,2020-03-04,149,7.2,23.333333,39891845.0,42398071.0,46179065.0,43980069.0,40288440.0,...,23009234.0,9052463.0,32520.1971,18229480.0,53128.5397,78.861,889022000.0,35.607765,2.0,1.8


# Get mean and std dev at timestamp t using values from t-1, ..., t-N

In [20]:
cols_list = lag_cols

for col in cols_list:
    df_confirmed = get_mov_avg_std(df_confirmed, col, N)
df_confirmed.head()

Unnamed: 0,Country,Date,Confirmed,Case fatality rate of COVID-19 (%),Daily new confirmed cases of COVID-19 (rolling 3-day average),Age 0-9,Age 10-19,Age 20-29,Age 30-39,Age 40-49,...,Life expectancy_mean,Life expectancy_std,# Flight Passengers_mean,# Flight Passengers_std,Population density (people per km²)_mean,Population density (people per km²)_std,Lockdown Level_mean,Lockdown Level_std,Country Temperature ºC_mean,Country Temperature ºC_std
0,Argentina,2020-03-16,56,,10.333333,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,,,,,,,,,,
1,Argentina,2020-03-17,68,,11.333333,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,76.372,,18081937.0,,16.176856,,3.0,,51.6,
2,Argentina,2020-03-18,79,,13.666667,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,76.372,0.0,18081937.0,0.0,16.176856,0.0,3.0,0.0,51.6,0.0
3,Argentina,2020-03-19,97,,21.0,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,76.372,0.0,18081937.0,0.0,16.176856,0.0,3.0,0.0,51.6,0.0
4,Argentina,2020-03-20,128,2.34375,26.333333,7431085.0,7110303.0,6989730.0,6393900.0,5596155.0,...,76.372,0.0,18081937.0,0.0,16.176856,0.0,3.0,0.0,51.6,0.0


# Split into train, validation and test set

In [21]:
valid_date = dt.datetime.today() - dt.timedelta(days=7)

mask_valid = (df_confirmed['Date'] >= valid_date )
mask_valid_br = ((df_confirmed['Date'] >= valid_date ) & (df_confirmed['Country'] == 'Brazil'))
mask_valid_sp = ((df_confirmed['Date'] >= valid_date ) & (df_confirmed['Country'] == 'São Paulo'))
mask_valid_nyc = ((df_confirmed['Date'] >= valid_date ) & (df_confirmed['Country'] == 'New York'))
mask_valid_sk = ((df_confirmed['Date'] >= valid_date ) & (df_confirmed['Country'] == 'South Korea'))
mask_valid_lombardia = ((df_confirmed['Date'] >= valid_date ) & (df_confirmed['Country'] == 'Lombardia'))
mask_valid_wuhua = ((df_confirmed['Date'] >= valid_date ) & (df_confirmed['Country'] == 'Wuhan'))


mask_train = (df_confirmed['Date'] < valid_date )

# Split into train, valid, and test
train = df_confirmed.loc[mask_train]
valid = df_confirmed.loc[mask_valid]
valid_br = df_confirmed.loc[mask_valid_br]
valid_sp = df_confirmed.loc[mask_valid_sp]
valid_nyc = df_confirmed.loc[mask_valid_nyc]
valid_sk = df_confirmed.loc[mask_valid_sk]
valid_lombardia = df_confirmed.loc[mask_valid_lombardia]
valid_wuhua = df_confirmed.loc[mask_valid_wuhua]

print("train.shape = " + str(train.shape))
print("valid.shape = " + str(valid.shape))
print("valid_br.shape = " + str(valid_br.shape))
print("valid_sp.shape = " + str(valid_sp.shape))
print("valid_nyc.shape = " + str(valid_nyc.shape))
print("valid_sk.shape = " + str(valid_sk.shape))
print("valid_lombardia.shape = " + str(valid_lombardia.shape))
print("valid_wuhua.shape = " + str(valid_wuhua.shape))


train.shape = (1175, 144)
valid.shape = (268, 144)
valid_br.shape = (5, 144)
valid_sp.shape = (4, 144)
valid_nyc.shape = (3, 144)
valid_sk.shape = (5, 144)
valid_lombardia.shape = (5, 144)
valid_wuhua.shape = (6, 144)


# Scale the train, validation and test set¶

In [22]:
cols_to_scale = ['Confirmed']

for i in range(1,N+1):
    for col in lag_cols:
        print(col + "_lag_"+ str(i))
        cols_to_scale.append(col + "_lag_" + str(i))

# Do scaling for train set
# Here we only scale the train dataset, and not the entire dataset to prevent information leak
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[cols_to_scale])
print("scaler.mean_ = " + str(scaler.mean_))
print("scaler.var_ = " + str(scaler.var_))
print("train_scaled.shape = " + str(train_scaled.shape))

# Convert the numpy array back into pandas dataframe
train_scaled = pd.DataFrame(train_scaled, columns=cols_to_scale)
train_scaled[['Date', 'country_code']] = train.reset_index()[['Date', 'country_code']]
print("train_scaled.shape = " + str(train_scaled.shape))
train_scaled.head()

Confirmed_lag_1
Case fatality rate of COVID-19 (%)_lag_1
Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_1
Age 0-9_lag_1
Age 10-19_lag_1
Age 20-29_lag_1
Age 30-39_lag_1
Age 40-49_lag_1
Age 50-59_lag_1
Age 60-69_lag_1
Age 70-79_lag_1
Age >80_lag_1
# People_lag_1
GDP_lag_1
GDP per capta_lag_1
Life expectancy_lag_1
# Flight Passengers_lag_1
Population density (people per km²)_lag_1
Lockdown Level_lag_1
Country Temperature ºC_lag_1
Confirmed_lag_2
Case fatality rate of COVID-19 (%)_lag_2
Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_2
Age 0-9_lag_2
Age 10-19_lag_2
Age 20-29_lag_2
Age 30-39_lag_2
Age 40-49_lag_2
Age 50-59_lag_2
Age 60-69_lag_2
Age 70-79_lag_2
Age >80_lag_2
# People_lag_2
GDP_lag_2
GDP per capta_lag_2
Life expectancy_lag_2
# Flight Passengers_lag_2
Population density (people per km²)_lag_2
Lockdown Level_lag_2
Country Temperature ºC_lag_2
Confirmed_lag_3
Case fatality rate of COVID-19 (%)_lag_3
Daily new confirmed cases of COVID-19 (roll

Unnamed: 0,Confirmed,Confirmed_lag_1,Case fatality rate of COVID-19 (%)_lag_1,Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_1,Age 0-9_lag_1,Age 10-19_lag_1,Age 20-29_lag_1,Age 30-39_lag_1,Age 40-49_lag_1,Age 50-59_lag_1,...,# People_lag_4,GDP_lag_4,GDP per capta_lag_4,Life expectancy_lag_4,# Flight Passengers_lag_4,Population density (people per km²)_lag_4,Lockdown Level_lag_4,Country Temperature ºC_lag_4,Date,country_code
0,-0.45585,,,,,,,,,,...,,,,,,,,,2020-03-16,0
1,-0.455268,-0.451783,,-0.356664,-0.297826,-0.300435,-0.314661,-0.329422,-0.334182,-0.344018,...,,,,,,,,,2020-03-17,0
2,-0.454735,-0.451195,,-0.35578,-0.297826,-0.300435,-0.314661,-0.329422,-0.334182,-0.344018,...,,,,,,,,,2020-03-18,0
3,-0.453862,-0.450656,,-0.35372,-0.297826,-0.300435,-0.314661,-0.329422,-0.334182,-0.344018,...,,,,,,,,,2020-03-19,0
4,-0.45236,-0.449775,,-0.347243,-0.297826,-0.300435,-0.314661,-0.329422,-0.334182,-0.344018,...,-0.346331,-0.410222,-1.001727,-0.758097,-0.527613,-0.467139,0.714528,1.268287,2020-03-20,0


In [23]:
valid_scaled = valid[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_scaled = pd.concat([valid_scaled, temp], axis=1)
    
# Now the entire valid set is scaled
valid_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
12,2020-03-28,0,1.250497,0.364198,-0.807347,-0.807347,1.380777,-0.114367,-0.260058,-1.006352,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,2020-03-29,0,1.149419,0.365018,-0.310654,-1.203783,1.287655,0.293822,-0.740352,-0.841125,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,2020-03-30,0,1.052597,0.542528,-0.394144,-1.20098,0.884504,0.687701,-0.281718,-1.290487,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,2020-03-31,0,1.121034,0.34968,-0.215979,-1.254735,0.290547,0.777579,0.398811,-1.466937,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,2020-04-01,0,1.414696,-0.045233,-0.513159,-0.856304,0.001509,-0.950836,1.381663,-0.432337,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
valid_scaled.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 268 entries, 12 to 1442
Data columns (total 82 columns):
 #   Column                                                               Non-Null Count  Dtype         
---  ------                                                               --------------  -----         
 0   Date                                                                 268 non-null    datetime64[ns]
 1   country_code                                                         268 non-null    int64         
 2   Confirmed_lag_1                                                      268 non-null    float64       
 3   Confirmed_lag_2                                                      268 non-null    float64       
 4   Confirmed_lag_3                                                      268 non-null    float64       
 5   Confirmed_lag_4                                                      268 non-null    float64       
 6   Case fatality rate of COVID-19 (%)_lag_1        

In [25]:
valid_br_scaled = valid_br[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_br.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_br_scaled = pd.concat([valid_br_scaled, temp], axis=1)
    
# Now the entire valid_br set is scaled
valid_br_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
116,2020-03-28,4,1.207609,0.361058,-0.483533,-1.085133,1.17786,0.346935,-0.356328,-1.168466,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11
117,2020-03-29,4,1.190206,0.348943,-0.397311,-1.141838,0.893473,0.712068,-0.352335,-1.253206,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11
118,2020-03-30,4,1.106718,0.473794,-0.401871,-1.178641,1.140643,0.181995,-0.032382,-1.290257,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11
119,2020-03-31,4,1.084493,0.435806,-0.271123,-1.249176,1.316881,0.22813,-0.671874,-0.873137,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11
120,2020-04-01,4,1.404555,-0.044569,-0.455876,-0.904111,1.190251,0.371019,-0.443839,-1.117432,...,9.237056e-11,9.237056e-11,0.0,0.0,0.0,0.0,-1.421085e-11,-1.421085e-11,-1.421085e-11,-1.421085e-11


In [26]:
valid_sp_scaled = valid_sp[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_sp.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_sp_scaled = pd.concat([valid_sp_scaled, temp], axis=1)
    
# Now the entire valid_sp set is scaled
valid_sp_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
1261,2020-03-28,48,0.942716,0.488961,-0.064934,-1.366743,-0.512702,-0.527642,-0.459004,1.499348,...,0.0,0.0,0.0,0.0,0.0,0.0,-8.42937e-08,-8.42937e-08,-8.42937e-08,-8.42937e-08
1262,2020-03-29,48,0.763291,0.763291,-0.184693,-1.341888,-0.321144,-0.321144,-0.816281,1.458568,...,0.0,0.0,0.0,0.0,0.0,0.0,-8.42937e-08,-8.42937e-08,-8.42937e-08,-8.42937e-08
1263,2020-03-30,48,0.5,0.5,0.5,-1.5,0.5,0.5,0.5,-1.5,...,0.0,0.0,0.0,0.0,0.0,0.0,-8.42937e-08,-8.42937e-08,-8.42937e-08,-8.42937e-08
1264,2020-03-31,48,1.5,-0.5,-0.5,-0.5,1.5,-0.5,-0.5,-0.5,...,0.0,0.0,0.0,0.0,0.0,0.0,-8.42937e-08,-8.42937e-08,-8.42937e-08,-8.42937e-08


In [27]:
valid_nyc_scaled = valid_nyc[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_nyc.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_nyc_scaled = pd.concat([valid_nyc_scaled, temp], axis=1)
    
# Now the entire valid_nyc set is scaled
valid_nyc_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
825,2020-03-28,30,1.185099,0.390955,-0.471228,-1.104826,1.069764,0.4524,-0.264873,-1.257291,...,1.556376e-08,1.556376e-08,0.0,0.0,0.0,0.0,-1.325606e-10,-1.325606e-10,-1.325606e-10,-1.325606e-10
826,2020-03-29,30,1.178751,0.35393,-0.37218,-1.1605,1.306912,0.149917,-0.405667,-1.051162,...,1.556376e-08,1.556376e-08,0.0,0.0,0.0,0.0,-1.325606e-10,-1.325606e-10,-1.325606e-10,-1.325606e-10
827,2020-03-30,30,1.168795,0.393804,-0.422148,-1.140451,1.218255,0.376182,-0.570034,-1.024403,...,1.556376e-08,1.556376e-08,0.0,0.0,0.0,0.0,-1.325606e-10,-1.325606e-10,-1.325606e-10,-1.325606e-10


In [28]:
valid_sk_scaled = valid_sk[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_sk.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_sk_scaled = pd.concat([valid_sk_scaled, temp], axis=1)
    
# Now the entire valid_sk set is scaled
valid_sk_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
1152,2020-03-28,44,1.137212,0.424742,-0.38951,-1.172444,1.262881,0.207063,-0.359547,-1.110397,...,3.410605e-10,3.410605e-10,0.0,0.0,0.0,0.0,-1.073061e-07,-1.073061e-07,-1.073061e-07,-1.073061e-07
1153,2020-03-29,44,1.251784,0.242058,-0.387292,-1.106549,1.05503,0.592229,-0.52409,-1.123168,...,3.410605e-10,3.410605e-10,0.0,0.0,0.0,0.0,-1.073061e-07,-1.073061e-07,-1.073061e-07,-1.073061e-07
1154,2020-03-30,44,1.149046,0.457643,-0.503737,-1.102953,1.187633,0.231383,-0.195122,-1.223894,...,3.410605e-10,3.410605e-10,0.0,0.0,0.0,0.0,-1.073061e-07,-1.073061e-07,-1.073061e-07,-1.073061e-07
1155,2020-03-31,44,1.036192,0.48824,-0.249389,-1.275043,1.183535,0.433991,-0.582153,-1.035372,...,3.410605e-10,3.410605e-10,0.0,0.0,0.0,0.0,-1.073061e-07,-1.073061e-07,-1.073061e-07,-1.073061e-07
1156,2020-04-01,44,1.2246,0.261864,-0.338883,-1.147581,1.001394,0.528888,-0.242358,-1.287924,...,3.410605e-10,3.410605e-10,0.0,0.0,0.0,0.0,-1.073061e-07,-1.073061e-07,-1.073061e-07,-1.073061e-07


In [29]:
valid_lombardia_scaled = valid_lombardia[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_lombardia.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_lombardia_scaled = pd.concat([valid_lombardia_scaled, temp], axis=1)
    
# Now the entire valid_lombardia set is scaled
valid_lombardia_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
712,2020-03-28,25,1.205564,0.373175,-0.505514,-1.073225,1.396479,-0.083717,-0.355413,-0.95735,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.571008e-07,-1.571008e-07,-1.571008e-07,-1.571008e-07
713,2020-03-29,25,1.123434,0.429645,-0.35984,-1.193239,1.30098,0.262922,-0.694117,-0.869785,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.571008e-07,-1.571008e-07,-1.571008e-07,-1.571008e-07
714,2020-03-30,25,1.075856,0.475887,-0.321937,-1.229806,1.099755,0.477339,-0.389127,-1.187967,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.571008e-07,-1.571008e-07,-1.571008e-07,-1.571008e-07
715,2020-03-31,25,1.039549,0.491956,-0.263476,-1.268029,1.209306,0.278865,-0.324272,-1.163899,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.571008e-07,-1.571008e-07,-1.571008e-07,-1.571008e-07
716,2020-04-01,25,1.082434,0.4386,-0.271031,-1.250003,1.151138,0.451067,-0.494599,-1.107606,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.571008e-07,-1.571008e-07,-1.571008e-07,-1.571008e-07


In [30]:
valid_wuhua_scaled = valid_wuhua[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_wuhua.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_wuhua_scaled = pd.concat([valid_wuhua_scaled, temp], axis=1)
    
# Now the entire valid_wuhua set is scaled
valid_wuhua_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Case fatality rate of COVID-19 (%)_lag_1,Case fatality rate of COVID-19 (%)_lag_2,Case fatality rate of COVID-19 (%)_lag_3,Case fatality rate of COVID-19 (%)_lag_4,...,Population density (people per km²)_lag_3,Population density (people per km²)_lag_4,Lockdown Level_lag_1,Lockdown Level_lag_2,Lockdown Level_lag_3,Lockdown Level_lag_4,Country Temperature ºC_lag_1,Country Temperature ºC_lag_2,Country Temperature ºC_lag_3,Country Temperature ºC_lag_4
1437,2020-03-28,53,0.0,0.0,0.0,0.0,1.200961,0.40032,-0.560449,-1.040833,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.060366e-07,-1.060366e-07,-1.060366e-07,-1.060366e-07
1438,2020-03-29,53,0.0,0.0,0.0,0.0,1.019489,0.530134,-0.285457,-1.264166,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.060366e-07,-1.060366e-07,-1.060366e-07,-1.060366e-07
1439,2020-03-30,53,0.0,0.0,0.0,0.0,1.193381,0.275396,-0.275396,-1.193381,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.060366e-07,-1.060366e-07,-1.060366e-07,-1.060366e-07
1440,2020-03-31,53,0.0,0.0,0.0,0.0,1.175901,0.423324,-0.517396,-1.081829,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.060366e-07,-1.060366e-07,-1.060366e-07,-1.060366e-07
1441,2020-04-01,53,0.0,0.0,0.0,0.0,0.879883,0.659912,-0.219971,-1.319824,...,0.0,0.0,0.0,0.0,0.0,0.0,-1.060366e-07,-1.060366e-07,-1.060366e-07,-1.060366e-07


# Split into X and y

In [31]:
features = ['country_code']
for i in range(1,N+1):
    for col in lag_cols:
        features.append(col + "_lag_" + str(i))
    
target = "Confirmed"


# Split into X and y
X_train = train[features]
y_train = train[target]
X_valid = valid[features]
y_valid = valid[target]
X_valid_br = valid_br[features]
y_valid_br = valid_br[target]
X_valid_sk = valid_sk[features]
y_valid_sk = valid_sk[target]
X_valid_sp = valid_sp[features]
y_valid_sp = valid_sp[target]
X_valid_nyc = valid_nyc[features]
y_valid_nyc = valid_nyc[target]
X_valid_lombardia = valid_lombardia[features]
y_valid_lombardia = valid_lombardia[target]
X_valid_wuhua = valid_wuhua[features]
y_valid_wuhua = valid_wuhua[target]

print("X_train.shape = " + str(X_train.shape))
print("y_train.shape = " + str(y_train.shape))
print("X_valid.shape = " + str(X_valid.shape))
print("y_valid_br.shape = " + str(y_valid_br.shape))
print("X_valid_br.shape = " + str(X_valid_br.shape))
print("y_valid.shape = " + str(y_valid.shape))

X_train.shape = (1175, 81)
y_train.shape = (1175,)
X_valid.shape = (268, 81)
y_valid_br.shape = (5,)
X_valid_br.shape = (5, 81)
y_valid.shape = (268,)


In [32]:
# Split into X and y
X_train_scaled = train_scaled[features]
y_train_scaled = train_scaled[target]
X_valid_scaled = valid_scaled[features]
X_valid_br_scaled = valid_br_scaled[features]
X_valid_sk_scaled = valid_sk_scaled[features]
X_valid_sp_scaled = valid_sp_scaled[features]
X_valid_nyc_scaled = valid_nyc_scaled[features]
X_valid_lombardia_scaled = valid_lombardia_scaled[features]
X_valid_wuhua_scaled = valid_wuhua_scaled[features]


print("X_train_scaled.shape = " + str(X_train_scaled.shape))
print("y_train_scaled.shape = " + str(y_train_scaled.shape))
print("X_valid_scaled.shape = " + str(X_valid_scaled.shape))
print("X_valid_br_scaled.shape = " + str(X_valid_br_scaled.shape))
print("X_valid_sk_scaled.shape = " + str(X_valid_sk_scaled.shape))
print("X_valid_sp_scaled.shape = " + str(X_valid_sp_scaled.shape))
print("X_valid_nyc_scaled.shape = " + str(X_valid_nyc_scaled.shape))
print("X_valid_lombardia_scaled.shape = " + str(X_valid_lombardia_scaled.shape))
print("X_valid_wuhua_scaled.shape = " + str(X_valid_wuhua_scaled.shape))

X_train_scaled.shape = (1175, 81)
y_train_scaled.shape = (1175,)
X_valid_scaled.shape = (268, 81)
X_valid_br_scaled.shape = (5, 81)
X_valid_sk_scaled.shape = (5, 81)
X_valid_sp_scaled.shape = (4, 81)
X_valid_nyc_scaled.shape = (3, 81)
X_valid_lombardia_scaled.shape = (5, 81)
X_valid_wuhua_scaled.shape = (6, 81)


# EDA

In [33]:
mask_train = (train['Country'] == 'Brazil')
mask_valid = (valid['Country'] == 'Brazil')

ax = train.loc[mask_train].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid.loc[mask_valid].plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax.legend(['train', 'validation'])
ax.set_xlabel("date")
ax.set_ylabel("Confirmed")
ax.set_title("Without scaling")

Text(0.5, 1.0, 'Without scaling')

In [34]:
code = valid[valid['Country'] == 'Brazil']['country_code'].unique()
ax = train_scaled[train_scaled['country_code'] ==  code[0]].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax.legend(['train_scaled'])
ax.set_xlabel("date")
ax.set_ylabel("Confirmed (scaled)")
ax.set_title("With scaling")

Text(0.5, 1.0, 'With scaling')

# Train the model using XGBoost


In [35]:
n_estimators = 100             # Number of boosted trees to fit. default = 100
max_depth = 3                  # Maximum tree depth for base learners. default = 3
learning_rate = 0.1            # Boosting learning rate (xgb’s “eta”). default = 0.1
min_child_weight = 1           # Minimum sum of instance weight(hessian) needed in a child. default = 1
subsample = 1                  # Subsample ratio of the training instance. default = 1
colsample_bytree = 1           # Subsample ratio of columns when constructing each tree. default = 1
colsample_bylevel = 1          # Subsample ratio of columns for each split, in each level. default = 1
gamma = 0  
model_seed = 100


In [36]:
X_train_scaled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175 entries, 0 to 1174
Data columns (total 81 columns):
 #   Column                                                               Non-Null Count  Dtype  
---  ------                                                               --------------  -----  
 0   country_code                                                         1175 non-null   int64  
 1   Confirmed_lag_1                                                      1121 non-null   float64
 2   Case fatality rate of COVID-19 (%)_lag_1                             951 non-null    float64
 3   Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_1  1121 non-null   float64
 4   Age 0-9_lag_1                                                        995 non-null    float64
 5   Age 10-19_lag_1                                                      995 non-null    float64
 6   Age 20-29_lag_1                                                      995 non-null    float64
 7   Age 30

In [37]:
# Create the model
model = XGBRegressor(seed=model_seed,
                     n_estimators=n_estimators,
                     max_depth=max_depth,
                     learning_rate=learning_rate,
                     min_child_weight=min_child_weight,
                     subsample=subsample,
                     colsample_bytree=colsample_bytree,
                     colsample_bylevel=colsample_bylevel,
                     gamma=gamma)

# Train the regressor
model.fit(X_train_scaled, y_train_scaled)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=100, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=100, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=None)

# Predict on train set

In [38]:
def get_mape(y_true, y_pred): 
    """
    Compute mean absolute percentage error (MAPE)
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [39]:
# Do prediction on train set
est_scaled = model.predict(X_train_scaled)
est = est_scaled * math.sqrt(scaler.var_[0]) + scaler.mean_[0]

# Calculate RMSE
print("RMSE on train set = %0.3f" % math.sqrt(mean_squared_error(y_train, est)))

# Calculate MAPE
print("MAPE on train set = %0.3f%%" % get_mape(y_train, est))

RMSE on train set = 300.518
MAPE on train set = 16.934%


In [40]:
est_df = pd.DataFrame({'est': est, 
                       'Date': train['Date'], 
                       'Country': train['Country']})

country = 'Brazil'
mask_train = (train['Country'] == country)
mask_valid = (valid['Country'] == country)
mask_est = (est_df['Country'] == country)

ax = train.loc[mask_train].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid.loc[mask_valid].plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = est_df.loc[mask_est].plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

# Predict on valid set brasil

In [41]:
# Do prediction on test set
est_scaled_br = model.predict(X_valid_br_scaled)
valid_br['est_scaled_br'] = est_scaled_br
valid_br['est'] = valid_br['est_scaled_br'] * valid_br['Confirmed_std'] + valid_br['Confirmed_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_br, valid_br['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_br, valid_br['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

RMSE on dev set = 781.893
MAPE on dev set = 12.374%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [42]:
est_df = pd.DataFrame({'est': valid_br['est'], 
                       'y_valid_br': y_valid_br,
                       'Date': valid_br['Date']})

country = 'Brazil'
mask_train = (train['Country'] == country)

ax = train.loc[mask_train].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid_br.plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

In [43]:
est_df = pd.DataFrame({'est': valid_br['est'], 
                       'y_valid_br': y_valid_br,
                       'Date': valid_br['Date']})

ax = valid_br.plot(x='Date', y='Confirmed', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")
ax.set_title("Brazil")

Text(0.5, 1.0, 'Brazil')

# Predict on valid set Koreia do Sul

In [44]:
# Do prediction on test set
est_scaled_sk = model.predict(X_valid_sk_scaled)
valid_sk['est_scaled_sk'] = est_scaled_sk
valid_sk['est'] = valid_sk['est_scaled_sk'] * valid_sk['Confirmed_std'] + valid_sk['Confirmed_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_sk, valid_sk['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_sk, valid_sk['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

RMSE on dev set = 102.078
MAPE on dev set = 1.016%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [45]:
est_df = pd.DataFrame({'est': valid_sk['est'], 
                       'y_valid_sk': y_valid_sk,
                       'Date': valid_sk['Date']})

country = 'South Korea'
mask_train = (train['Country'] == country)

ax = train.loc[mask_train].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid_sk.plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

In [46]:
est_df = pd.DataFrame({'est': valid_sk['est'], 
                       'y_valid_sk': y_valid_sk,
                       'Date': valid_sk['Date']})

ax = valid_sk.plot(x='Date', y='Confirmed', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")
ax.set_title("South Korea")

Text(0.5, 1.0, 'South Korea')

## Predict on valid set NYC

In [47]:
# Do prediction on test set
est_scaled_nyc = model.predict(X_valid_nyc_scaled)
valid_nyc['est_scaled_nyc'] = est_scaled_nyc
valid_nyc['est'] = valid_nyc['est_scaled_nyc'] * valid_nyc['Confirmed_std'] + valid_nyc['Confirmed_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_nyc, valid_nyc['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_nyc, valid_nyc['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

RMSE on dev set = 3447.011
MAPE on dev set = 10.373%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [48]:
est_df = pd.DataFrame({'est': valid_nyc['est'], 
                       'y_valid_nyc': y_valid_nyc,
                       'Date': valid_nyc['Date']})

country = 'New York'
mask_nyc_train = (train['Country'] == country)

ax = train.loc[mask_nyc_train].plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid_nyc.plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

In [49]:
est_df = pd.DataFrame({'est': valid_nyc['est'], 
                       'y_valid_nyc': y_valid_nyc,
                       'Date': valid_nyc['Date']})

ax = valid_nyc.plot(x='Date', y='Confirmed', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")
ax.set_title("New York")

Text(0.5, 1.0, 'New York')

# Predict on valid set Lombardia

In [50]:
# Do prediction on test set
est_scaled_lombardia = model.predict(X_valid_lombardia_scaled)
valid_lombardia['est_scaled_lombardia'] = est_scaled_lombardia
valid_lombardia['est'] = valid_lombardia['est_scaled_lombardia'] * valid_lombardia['Confirmed_std'] + valid_lombardia['Confirmed_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_lombardia, valid_lombardia['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_lombardia, valid_lombardia['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

est_df = pd.DataFrame({'est': valid_lombardia['est'], 
                       'y_valid_lombardia': y_valid_lombardia,
                       'Date': valid_lombardia['Date']})

ax = valid_lombardia.plot(x='Date', y='Confirmed', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")
ax.set_title("Lombardia")

RMSE on dev set = 1351.584
MAPE on dev set = 3.120%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Text(0.5, 1.0, 'Lombardia')

# Predict Wuhan

In [51]:
# Do prediction on test set
est_scaled_wuhua = model.predict(X_valid_wuhua_scaled)
valid_wuhua['est_scaled_wuhua'] = est_scaled_wuhua
valid_wuhua['est'] = valid_wuhua['est_scaled_wuhua'] * valid_wuhua['Confirmed_std'] + valid_wuhua['Confirmed_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_wuhua, valid_wuhua['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_wuhua, valid_wuhua['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

est_df = pd.DataFrame({'est': valid_wuhua['est'], 
                       'y_valid_wuhua': y_valid_wuhua,
                       'Date': valid_wuhua['Date']})

ax = valid_wuhua.plot(x='Date', y='Confirmed', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")
ax.set_title("Wuhan")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


RMSE on dev set = 0.408
MAPE on dev set = 0.000%


Text(0.5, 1.0, 'Wuhan')

In [52]:
_ = plot_importance(model, height=0.7)

# Predicting Deats

## Feature Engeneering

In [53]:
# Add a column 'order_day' to indicate the order of the rows by date
for country in df_deaths['Country'].unique():
    df_deaths.loc[df_deaths['Country'] == country, 'order_day'] = [x for x in list(range(len( df_deaths.loc[df_deaths['Country'] == country, :])))]

# merging_keys
merging_keys = ['Country', 'country_code','order_day']

# List of columns that we will use to create lags
lag_cols = df_deaths.columns.values.tolist()[2:22]
lag_cols

['Confirmed',
 'Deaths',
 'Case fatality rate of COVID-19 (%)',
 'Daily new confirmed cases of COVID-19 (rolling 3-day average)',
 'Daily new confirmed deaths due to COVID-19 (rolling 3-day average)',
 'Age 0-9',
 'Age 10-19',
 'Age 20-29',
 'Age 30-39',
 'Age 40-49',
 'Age 50-59',
 'Age 60-69',
 'Age 70-79',
 'Age >80',
 '# People',
 'GDP',
 'GDP per capta',
 'Life expectancy',
 '# Flight Passengers',
 'Hospital beds (per 1,000 people)']

In [54]:
N = 4
shift_range = [x+1 for x in range(N)]

for shift in tqdm_notebook(shift_range):
    train_shift = df_deaths[merging_keys + lag_cols].copy()
    
    for country in df_deaths['Country'].unique():    
        # E.g. order_day of 0 becomes 1, for shift = 1.
        # So when this is merged with order_day of 1 in df_deaths, this will represent lag of 1.

        train_shift.loc[train_shift['Country'] == country, 'order_day'] =train_shift.loc[train_shift['Country'] == country, 'order_day'] + shift

    foo = lambda x: '{}_lag_{}'.format(x, shift) if x in lag_cols else x
    train_shift = train_shift.rename(columns=foo)

    df_deaths = pd.merge(df_deaths, train_shift, on=merging_keys, how='left')
    
del train_shift

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))




# Get mean and std dev at timestamp t using values from t-1, ..., t-N

In [55]:
cols_list = lag_cols

for col in cols_list:
    df_deaths = get_mov_avg_std(df_deaths, col, N)

# Split into train, validation and test set

In [56]:
valid_date = dt.datetime.today() - dt.timedelta(days=7)

mask_valid = (df_deaths['Date'] >= valid_date )
mask_valid_br = ((df_deaths['Date'] >= valid_date ) & (df_deaths['Country'] == 'Brazil'))
mask_valid_sp = ((df_deaths['Date'] >= valid_date ) & (df_deaths['Country'] == 'São Paulo'))
mask_valid_nyc = ((df_deaths['Date'] >= valid_date ) & (df_deaths['Country'] == 'New York'))
mask_valid_sk = ((df_deaths['Date'] >= valid_date ) & (df_deaths['Country'] == 'South Korea'))
mask_valid_lombardia = ((df_deaths['Date'] >= valid_date ) & (df_deaths['Country'] == 'Lombardia'))
mask_valid_wuhua = ((df_deaths['Date'] >= valid_date ) & (df_deaths['Country'] == 'Wuhan'))


mask_train = (df_deaths['Date'] < valid_date )

# Split into train, valid, and test
train = df_deaths.loc[mask_train]
valid = df_deaths.loc[mask_valid]
valid_br = df_deaths.loc[mask_valid_br]
valid_sp = df_deaths.loc[mask_valid_sp]
valid_nyc = df_deaths.loc[mask_valid_nyc]
valid_sk = df_deaths.loc[mask_valid_sk]
valid_lombardia = df_deaths.loc[mask_valid_lombardia]
valid_wuhua = df_deaths.loc[mask_valid_wuhua]

print("train.shape = " + str(train.shape))
print("valid.shape = " + str(valid.shape))
print("valid_br.shape = " + str(valid_br.shape))
print("valid_sp.shape = " + str(valid_sp.shape))
print("valid_nyc.shape = " + str(valid_nyc.shape))
print("valid_sk.shape = " + str(valid_sk.shape))
print("valid_lombardia.shape = " + str(valid_lombardia.shape))
print("valid_wuhua.shape = " + str(valid_wuhua.shape))


train.shape = (1175, 147)
valid.shape = (268, 147)
valid_br.shape = (5, 147)
valid_sp.shape = (4, 147)
valid_nyc.shape = (3, 147)
valid_sk.shape = (5, 147)
valid_lombardia.shape = (5, 147)
valid_wuhua.shape = (6, 147)


# Scale the train, validation and test set

In [57]:
cols_to_scale = ['Deaths']

for i in range(1,N+1):
    for col in lag_cols:
        print(col + "_lag_"+ str(i))
        cols_to_scale.append(col + "_lag_" + str(i))

# Do scaling for train set
# Here we only scale the train dataset, and not the entire dataset to prevent information leak
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[cols_to_scale])
print("scaler.mean_ = " + str(scaler.mean_))
print("scaler.var_ = " + str(scaler.var_))
print("train_scaled.shape = " + str(train_scaled.shape))

# Convert the numpy array back into pandas dataframe
train_scaled = pd.DataFrame(train_scaled, columns=cols_to_scale)
train_scaled[['Date', 'country_code']] = train.reset_index()[['Date', 'country_code']]
print("train_scaled.shape = " + str(train_scaled.shape))
train_scaled.head()

Confirmed_lag_1
Deaths_lag_1
Case fatality rate of COVID-19 (%)_lag_1
Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_1
Daily new confirmed deaths due to COVID-19 (rolling 3-day average)_lag_1
Age 0-9_lag_1
Age 10-19_lag_1
Age 20-29_lag_1
Age 30-39_lag_1
Age 40-49_lag_1
Age 50-59_lag_1
Age 60-69_lag_1
Age 70-79_lag_1
Age >80_lag_1
# People_lag_1
GDP_lag_1
GDP per capta_lag_1
Life expectancy_lag_1
# Flight Passengers_lag_1
Hospital beds (per 1,000 people)_lag_1
Confirmed_lag_2
Deaths_lag_2
Case fatality rate of COVID-19 (%)_lag_2
Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_2
Daily new confirmed deaths due to COVID-19 (rolling 3-day average)_lag_2
Age 0-9_lag_2
Age 10-19_lag_2
Age 20-29_lag_2
Age 30-39_lag_2
Age 40-49_lag_2
Age 50-59_lag_2
Age 60-69_lag_2
Age 70-79_lag_2
Age >80_lag_2
# People_lag_2
GDP_lag_2
GDP per capta_lag_2
Life expectancy_lag_2
# Flight Passengers_lag_2
Hospital beds (per 1,000 people)_lag_2
Confirmed_lag_3
Deaths_lag_3
Case 

Unnamed: 0,Deaths,Confirmed_lag_1,Deaths_lag_1,Case fatality rate of COVID-19 (%)_lag_1,Daily new confirmed cases of COVID-19 (rolling 3-day average)_lag_1,Daily new confirmed deaths due to COVID-19 (rolling 3-day average)_lag_1,Age 0-9_lag_1,Age 10-19_lag_1,Age 20-29_lag_1,Age 30-39_lag_1,...,Age 70-79_lag_4,Age >80_lag_4,# People_lag_4,GDP_lag_4,GDP per capta_lag_4,Life expectancy_lag_4,# Flight Passengers_lag_4,"Hospital beds (per 1,000 people)_lag_4",Date,country_code
0,-0.385293,,,,,,,,,,...,,,,,,,,,2020-03-16,0
1,-0.385293,-0.451783,-0.389299,,-0.356664,-0.234494,-0.297826,-0.300435,-0.314661,-0.329422,...,,,,,,,,,2020-03-17,0
2,-0.385293,-0.451195,-0.389299,,-0.35578,-0.234494,-0.297826,-0.300435,-0.314661,-0.329422,...,,,,,,,,,2020-03-18,0
3,-0.384308,-0.450656,-0.389299,,-0.35372,-0.234494,-0.297826,-0.300435,-0.314661,-0.329422,...,,,,,,,,,2020-03-19,0
4,-0.384308,-0.449775,-0.388269,,-0.347243,-0.233866,-0.297826,-0.300435,-0.314661,-0.329422,...,-0.389278,-0.439566,-0.346331,-0.410222,-1.001727,-0.758097,-0.527613,-0.32427,2020-03-20,0


In [58]:
valid_scaled = valid[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_scaled = pd.concat([valid_scaled, temp], axis=1)
    
# Now the entire valid set is scaled
valid_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Deaths_lag_1,Deaths_lag_2,Deaths_lag_3,Deaths_lag_4,...,Life expectancy_lag_3,Life expectancy_lag_4,# Flight Passengers_lag_1,# Flight Passengers_lag_2,# Flight Passengers_lag_3,# Flight Passengers_lag_4,"Hospital beds (per 1,000 people)_lag_1","Hospital beds (per 1,000 people)_lag_2","Hospital beds (per 1,000 people)_lag_3","Hospital beds (per 1,000 people)_lag_4"
12,2020-03-28,0,1.250497,0.364198,-0.807347,-0.807347,1.358732,0.0,-0.339683,-1.019049,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,2020-03-29,0,1.149419,0.365018,-0.310654,-1.203783,1.319824,0.219971,-0.659912,-0.879883,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,2020-03-30,0,1.052597,0.542528,-0.394144,-1.20098,0.914807,0.699559,-0.376685,-1.237681,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,2020-03-31,0,1.121034,0.34968,-0.215979,-1.254735,1.154878,0.182349,-0.060783,-1.276444,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,2020-04-01,0,1.414696,-0.045233,-0.513159,-0.856304,1.276444,0.303915,-0.668614,-0.911746,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [59]:
valid_br_scaled = valid_br[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_br.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_br_scaled = pd.concat([valid_br_scaled, temp], axis=1)
    
# Now the entire valid_br set is scaled
valid_br_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Deaths_lag_1,Deaths_lag_2,Deaths_lag_3,Deaths_lag_4,...,Life expectancy_lag_3,Life expectancy_lag_4,# Flight Passengers_lag_1,# Flight Passengers_lag_2,# Flight Passengers_lag_3,# Flight Passengers_lag_4,"Hospital beds (per 1,000 people)_lag_1","Hospital beds (per 1,000 people)_lag_2","Hospital beds (per 1,000 people)_lag_3","Hospital beds (per 1,000 people)_lag_4"
116,2020-03-28,4,1.207609,0.361058,-0.483533,-1.085133,1.164852,0.421329,-0.470898,-1.115284,...,2.676777e-08,2.676777e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
117,2020-03-29,4,1.190206,0.348943,-0.397311,-1.141838,1.187977,0.328108,-0.350736,-1.165349,...,2.676777e-08,2.676777e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
118,2020-03-30,4,1.106718,0.473794,-0.401871,-1.178641,1.256433,0.274845,-0.471162,-1.060115,...,2.676777e-08,2.676777e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
119,2020-03-31,4,1.084493,0.435806,-0.271123,-1.249176,1.180796,0.393599,-0.462051,-1.112344,...,2.676777e-08,2.676777e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
120,2020-04-01,4,1.404555,-0.044569,-0.455876,-0.904111,1.287939,0.189595,-0.411879,-1.065655,...,2.676777e-08,2.676777e-08,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
valid_sp_scaled = valid_sp[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_sp.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_sp_scaled = pd.concat([valid_sp_scaled, temp], axis=1)
    
# Now the entire valid_sp set is scaled
valid_sp_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Deaths_lag_1,Deaths_lag_2,Deaths_lag_3,Deaths_lag_4,...,Life expectancy_lag_3,Life expectancy_lag_4,# Flight Passengers_lag_1,# Flight Passengers_lag_2,# Flight Passengers_lag_3,# Flight Passengers_lag_4,"Hospital beds (per 1,000 people)_lag_1","Hospital beds (per 1,000 people)_lag_2","Hospital beds (per 1,000 people)_lag_3","Hospital beds (per 1,000 people)_lag_4"
1261,2020-03-28,48,0.942716,0.488961,-0.064934,-1.366743,1.179235,0.378245,-0.422744,-1.134735,...,,,,,,,,,,
1262,2020-03-29,48,0.763291,0.763291,-0.184693,-1.341888,0.783349,0.783349,-0.261116,-1.305582,...,,,,,,,,,,
1263,2020-03-30,48,0.5,0.5,0.5,-1.5,0.5,0.5,0.5,-1.5,...,,,,,,,,,,
1264,2020-03-31,48,1.5,-0.5,-0.5,-0.5,1.5,-0.5,-0.5,-0.5,...,,,,,,,,,,


In [61]:
valid_nyc_scaled = valid_nyc[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_nyc.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_nyc_scaled = pd.concat([valid_nyc_scaled, temp], axis=1)
    
# Now the entire valid_nyc set is scaled
valid_nyc_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Deaths_lag_1,Deaths_lag_2,Deaths_lag_3,Deaths_lag_4,...,Life expectancy_lag_3,Life expectancy_lag_4,# Flight Passengers_lag_1,# Flight Passengers_lag_2,# Flight Passengers_lag_3,# Flight Passengers_lag_4,"Hospital beds (per 1,000 people)_lag_1","Hospital beds (per 1,000 people)_lag_2","Hospital beds (per 1,000 people)_lag_3","Hospital beds (per 1,000 people)_lag_4"
825,2020-03-28,30,1.185099,0.390955,-0.471228,-1.104826,1.1968,0.361252,-0.444807,-1.113245,...,,,,,,,,,,
826,2020-03-29,30,1.178751,0.35393,-0.37218,-1.1605,1.297398,0.185868,-0.439827,-1.043439,...,,,,,,,,,,
827,2020-03-30,30,1.168795,0.393804,-0.422148,-1.140451,1.244389,0.323255,-0.540665,-1.026978,...,,,,,,,,,,


In [62]:
valid_sk_scaled = valid_sk[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_sk.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_sk_scaled = pd.concat([valid_sk_scaled, temp], axis=1)
    
# Now the entire valid_sk set is scaled
valid_sk_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Deaths_lag_1,Deaths_lag_2,Deaths_lag_3,Deaths_lag_4,...,Life expectancy_lag_3,Life expectancy_lag_4,# Flight Passengers_lag_1,# Flight Passengers_lag_2,# Flight Passengers_lag_3,# Flight Passengers_lag_4,"Hospital beds (per 1,000 people)_lag_1","Hospital beds (per 1,000 people)_lag_2","Hospital beds (per 1,000 people)_lag_3","Hospital beds (per 1,000 people)_lag_4"
1152,2020-03-28,44,1.137212,0.424742,-0.38951,-1.172444,1.24354,0.248708,-0.373062,-1.119186,...,0.0,0.0,-6e-05,-6e-05,-6e-05,-6e-05,-4.470348e-08,-4.470348e-08,-4.470348e-08,-4.470348e-08
1153,2020-03-29,44,1.251784,0.242058,-0.387292,-1.106549,1.119186,0.497416,-0.497416,-1.119186,...,0.0,0.0,-6e-05,-6e-05,-6e-05,-6e-05,-4.470348e-08,-4.470348e-08,-4.470348e-08,-4.470348e-08
1154,2020-03-30,44,1.149046,0.457643,-0.503737,-1.102953,1.191439,0.283676,-0.283676,-1.191439,...,0.0,0.0,-6e-05,-6e-05,-6e-05,-6e-05,-4.470348e-08,-4.470348e-08,-4.470348e-08,-4.470348e-08
1155,2020-03-31,44,1.036192,0.48824,-0.249389,-1.275043,1.157792,0.445305,-0.504678,-1.098418,...,0.0,0.0,-6e-05,-6e-05,-6e-05,-6e-05,-4.470348e-08,-4.470348e-08,-4.470348e-08,-4.470348e-08
1156,2020-04-01,44,1.2246,0.261864,-0.338883,-1.147581,1.021508,0.510754,-0.255377,-1.276885,...,0.0,0.0,-6e-05,-6e-05,-6e-05,-6e-05,-4.470348e-08,-4.470348e-08,-4.470348e-08,-4.470348e-08


In [63]:
valid_lombardia_scaled = valid_lombardia[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_lombardia.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_lombardia_scaled = pd.concat([valid_lombardia_scaled, temp], axis=1)
    
# Now the entire valid_lombardia set is scaled
valid_lombardia_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Deaths_lag_1,Deaths_lag_2,Deaths_lag_3,Deaths_lag_4,...,Life expectancy_lag_3,Life expectancy_lag_4,# Flight Passengers_lag_1,# Flight Passengers_lag_2,# Flight Passengers_lag_3,# Flight Passengers_lag_4,"Hospital beds (per 1,000 people)_lag_1","Hospital beds (per 1,000 people)_lag_2","Hospital beds (per 1,000 people)_lag_3","Hospital beds (per 1,000 people)_lag_4"
712,2020-03-28,25,1.205564,0.373175,-0.505514,-1.073225,1.273097,0.250081,-0.481725,-1.041453,...,,,,,,,,,,
713,2020-03-29,25,1.123434,0.429645,-0.35984,-1.193239,1.207023,0.361522,-0.482419,-1.086126,...,,,,,,,,,,
714,2020-03-30,25,1.075856,0.475887,-0.321937,-1.229806,1.102053,0.46376,-0.367862,-1.19795,...,,,,,,,,,,
715,2020-03-31,25,1.039549,0.491956,-0.263476,-1.268029,1.139307,0.379769,-0.310117,-1.208959,...,,,,,,,,,,
716,2020-04-01,25,1.082434,0.4386,-0.271031,-1.250003,1.134287,0.435841,-0.40376,-1.166368,...,,,,,,,,,,


In [64]:
valid_wuhua_scaled = valid_wuhua[['Date', 'country_code']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid_wuhua.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_wuhua_scaled = pd.concat([valid_wuhua_scaled, temp], axis=1)
    
# Now the entire valid_wuhua set is scaled
valid_wuhua_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=20.0), HTML(value='')))




Unnamed: 0,Date,country_code,Confirmed_lag_1,Confirmed_lag_2,Confirmed_lag_3,Confirmed_lag_4,Deaths_lag_1,Deaths_lag_2,Deaths_lag_3,Deaths_lag_4,...,Life expectancy_lag_3,Life expectancy_lag_4,# Flight Passengers_lag_1,# Flight Passengers_lag_2,# Flight Passengers_lag_3,# Flight Passengers_lag_4,"Hospital beds (per 1,000 people)_lag_1","Hospital beds (per 1,000 people)_lag_2","Hospital beds (per 1,000 people)_lag_3","Hospital beds (per 1,000 people)_lag_4"
1437,2020-03-28,53,0.0,0.0,0.0,0.0,1.200961,0.40032,-0.560449,-1.040833,...,,,,,,,,,,
1438,2020-03-29,53,0.0,0.0,0.0,0.0,1.019489,0.530134,-0.285457,-1.264166,...,,,,,,,,,,
1439,2020-03-30,53,0.0,0.0,0.0,0.0,1.193381,0.275396,-0.275396,-1.193381,...,,,,,,,,,,
1440,2020-03-31,53,0.0,0.0,0.0,0.0,1.175901,0.423324,-0.517396,-1.081829,...,,,,,,,,,,
1441,2020-04-01,53,0.0,0.0,0.0,0.0,0.879883,0.659912,-0.219971,-1.319824,...,,,,,,,,,,


# Split into X and y

In [65]:
features = ['country_code']
for i in range(1,N+1):
    for col in lag_cols:
        features.append(col + "_lag_" + str(i))
    
target = "Deaths"


# Split into X and y
X_train = train[features]
y_train = train[target]
X_valid = valid[features]
y_valid = valid[target]
X_valid_br = valid_br[features]
y_valid_br = valid_br[target]
X_valid_sk = valid_sk[features]
y_valid_sk = valid_sk[target]
X_valid_sp = valid_sp[features]
y_valid_sp = valid_sp[target]
X_valid_nyc = valid_nyc[features]
y_valid_nyc = valid_nyc[target]
X_valid_lombardia = valid_lombardia[features]
y_valid_lombardia = valid_lombardia[target]
X_valid_wuhua = valid_wuhua[features]
y_valid_wuhua = valid_wuhua[target]

print("X_train.shape = " + str(X_train.shape))
print("y_train.shape = " + str(y_train.shape))
print("X_valid.shape = " + str(X_valid.shape))
print("y_valid_br.shape = " + str(y_valid_br.shape))
print("X_valid_br.shape = " + str(X_valid_br.shape))
print("y_valid.shape = " + str(y_valid.shape))

X_train.shape = (1175, 81)
y_train.shape = (1175,)
X_valid.shape = (268, 81)
y_valid_br.shape = (5,)
X_valid_br.shape = (5, 81)
y_valid.shape = (268,)


In [66]:
# Split into X and y
X_train_scaled = train_scaled[features]
y_train_scaled = train_scaled[target]
X_valid_scaled = valid_scaled[features]
X_valid_br_scaled = valid_br_scaled[features]
X_valid_sk_scaled = valid_sk_scaled[features]
X_valid_sp_scaled = valid_sp_scaled[features]
X_valid_nyc_scaled = valid_nyc_scaled[features]
X_valid_lombardia_scaled = valid_lombardia_scaled[features]
X_valid_wuhua_scaled = valid_wuhua_scaled[features]


print("X_train_scaled.shape = " + str(X_train_scaled.shape))
print("y_train_scaled.shape = " + str(y_train_scaled.shape))
print("X_valid_scaled.shape = " + str(X_valid_scaled.shape))
print("X_valid_br_scaled.shape = " + str(X_valid_br_scaled.shape))
print("X_valid_sk_scaled.shape = " + str(X_valid_sk_scaled.shape))
print("X_valid_sp_scaled.shape = " + str(X_valid_sp_scaled.shape))
print("X_valid_nyc_scaled.shape = " + str(X_valid_nyc_scaled.shape))
print("X_valid_lombardia_scaled.shape = " + str(X_valid_lombardia_scaled.shape))
print("X_valid_wuhua_scaled.shape = " + str(X_valid_wuhua_scaled.shape))

X_train_scaled.shape = (1175, 81)
y_train_scaled.shape = (1175,)
X_valid_scaled.shape = (268, 81)
X_valid_br_scaled.shape = (5, 81)
X_valid_sk_scaled.shape = (5, 81)
X_valid_sp_scaled.shape = (4, 81)
X_valid_nyc_scaled.shape = (3, 81)
X_valid_lombardia_scaled.shape = (5, 81)
X_valid_wuhua_scaled.shape = (6, 81)


# Train the model using XGBoost

In [67]:
# Create the model
model = XGBRegressor(seed=model_seed,
                     n_estimators=n_estimators,
                     max_depth=max_depth,
                     learning_rate=learning_rate,
                     min_child_weight=min_child_weight,
                     subsample=subsample,
                     colsample_bytree=colsample_bytree,
                     colsample_bylevel=colsample_bylevel,
                     gamma=gamma)

# Train the regressor
model.fit(X_train_scaled, y_train_scaled)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=100, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=100, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=None)

# Predict on train set

In [68]:
# Do prediction on train set
est_scaled = model.predict(X_train_scaled)
est = est_scaled * math.sqrt(scaler.var_[0]) + scaler.mean_[0]

# Calculate RMSE
print("RMSE on train set = %0.3f" % math.sqrt(mean_squared_error(y_train, est)))

# Calculate MAPE
print("MAPE on train set = %0.3f%%" % get_mape(y_train, est))

est_df = pd.DataFrame({'est': est, 
                       'Date': train['Date'], 
                       'Country': train['Country']})

country = 'Brazil'
mask_train = (train['Country'] == country)
mask_valid = (valid['Country'] == country)
mask_est = (est_df['Country'] == country)

ax = train.loc[mask_train].plot(x='Date', y='Deaths', style='b-', grid=True)
ax = valid.loc[mask_valid].plot(x='Date', y='Deaths', style='y-', grid=True, ax=ax)
ax = est_df.loc[mask_est].plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Deaths")

RMSE on train set = 15.038
MAPE on train set = inf%


  


Text(0, 0.5, 'Deaths')

# Predict on valid set brasil

In [69]:
# Do prediction on test set
est_scaled_br = model.predict(X_valid_br_scaled)
valid_br['est_scaled_br'] = est_scaled_br
valid_br['est'] = valid_br['est_scaled_br'] * valid_br['Deaths_std'] + valid_br['Deaths_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_br, valid_br['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_br, valid_br['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

est_df = pd.DataFrame({'est': valid_br['est'], 
                       'y_valid_br': y_valid_br,
                       'Date': valid_br['Date']})

ax = valid_br.plot(x='Date', y='Deaths', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Deaths")
ax.set_title("Brazil")

RMSE on dev set = 31.402
MAPE on dev set = 17.644%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


Text(0.5, 1.0, 'Brazil')

# Predict on valid set Koreia do Sul

In [70]:
# Do prediction on test set
est_scaled_sk = model.predict(X_valid_sk_scaled)
valid_sk['est_scaled_sk'] = est_scaled_sk
valid_sk['est'] = valid_sk['est_scaled_sk'] * valid_sk['Deaths_std'] + valid_sk['Deaths_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_sk, valid_sk['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_sk, valid_sk['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

est_df = pd.DataFrame({'est': valid_sk['est'], 
                       'y_valid_sk': y_valid_sk,
                       'Date': valid_sk['Date']})

ax = valid_sk.plot(x='Date', y='Deaths', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Deaths")
ax.set_title("South Korea")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


RMSE on dev set = 5.288
MAPE on dev set = 3.197%


Text(0.5, 1.0, 'South Korea')

# Predict New York

In [71]:
# Do prediction on test set
est_scaled_nyc = model.predict(X_valid_nyc_scaled)
valid_nyc['est_scaled_nyc'] = est_scaled_nyc
valid_nyc['est'] = valid_nyc['est_scaled_nyc'] * valid_nyc['Deaths_std'] + valid_nyc['Deaths_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_nyc, valid_nyc['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_nyc, valid_nyc['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

est_df = pd.DataFrame({'est': valid_nyc['est'], 
                       'y_valid_nyc': y_valid_nyc,
                       'Date': valid_nyc['Date']})

ax = valid_nyc.plot(x='Date', y='Deaths', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Deaths")
ax.set_title("New York")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


RMSE on dev set = 145.183
MAPE on dev set = 22.670%


Text(0.5, 1.0, 'New York')

# Predict on valid set Lombardia

In [72]:
# Do prediction on test set
est_scaled_lombardia = model.predict(X_valid_lombardia_scaled)
valid_lombardia['est_scaled_lombardia'] = est_scaled_lombardia
valid_lombardia['est'] = valid_lombardia['est_scaled_lombardia'] * valid_lombardia['Deaths_std'] + valid_lombardia['Deaths_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_lombardia, valid_lombardia['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_lombardia, valid_lombardia['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

est_df = pd.DataFrame({'est': valid_lombardia['est'], 
                       'y_valid_lombardia': y_valid_lombardia,
                       'Date': valid_lombardia['Date']})

ax = valid_lombardia.plot(x='Date', y='Deaths', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Deaths")
ax.set_title("Lombardia")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


RMSE on dev set = 433.086
MAPE on dev set = 6.379%


Text(0.5, 1.0, 'Lombardia')

# Predict Wuhan

In [73]:
# Do prediction on test set
est_scaled_wuhua = model.predict(X_valid_wuhua_scaled)
valid_wuhua['est_scaled_wuhua'] = est_scaled_wuhua
valid_wuhua['est'] = valid_wuhua['est_scaled_wuhua'] * valid_wuhua['Deaths_std'] + valid_wuhua['Deaths_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid_wuhua, valid_wuhua['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid_wuhua, valid_wuhua['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

est_df = pd.DataFrame({'est': valid_wuhua['est'], 
                       'y_valid_wuhua': y_valid_wuhua,
                       'Date': valid_wuhua['Date']})

ax = valid_wuhua.plot(x='Date', y='Deaths', style='y-', grid=True)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['validation', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Deaths")
ax.set_title("Wuhan")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


RMSE on dev set = 4.373
MAPE on dev set = 0.126%


Text(0.5, 1.0, 'Wuhan')