In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from xgboost import XGBRegressor, plot_importance, plot_tree
from tqdm import tqdm_notebook
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
import math

%matplotlib qt5

In [2]:
def get_mov_avg_std(df, col, N):
    """
    Given a dataframe, get mean and std dev at timestep t using values from t-1, t-2, ..., t-N.
    Inputs
        df         : dataframe. Can be of any length.
        col        : name of the column you want to calculate mean and std dev
        N          : get mean and std dev at timestep t using values from t-1, t-2, ..., t-N
    Outputs
        df_out     : same as df but with additional column containing mean and std dev
    """
    mean_list = df[col].rolling(window = N, min_periods=1).mean() # len(mean_list) = len(df)
    std_list = df[col].rolling(window = N, min_periods=1).std()   # first value will be NaN, because normalized by N-1
    
    # Add one timestep to the predictions
    mean_list = np.concatenate((np.array([np.nan]), np.array(mean_list[:-1])))
    std_list = np.concatenate((np.array([np.nan]), np.array(std_list[:-1])))
    
    # Append mean_list to df
    df_out = df.copy()
    df_out[col + '_mean'] = mean_list
    df_out[col + '_std'] = std_list
    
    return df_out

def scale_row(row, feat_mean, feat_std):
    """
    Given a pandas series in row, scale it to have 0 mean and var 1 using feat_mean and feat_std
    Inputs
        row      : pandas series. Need to scale this.
        feat_mean: mean  
        feat_std : standard deviation
    Outputs
        row_scaled : pandas series with same length as row, but scaled
    """
    # If feat_std = 0 (this happens if adj_close doesn't change over N days), 
    # set it to a small number to avoid division by zero
    feat_std = 0.001 if feat_std == 0 else feat_std
    
    row_scaled = (row-feat_mean) / feat_std
    
    return row_scaled

In [3]:
df_covid = pd.read_csv("./input/covid_19_clear.csv", parse_dates=['Date'], infer_datetime_format=True)


In [4]:
df_covid.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11346 entries, 0 to 11345
Data columns (total 5 columns):
 #   Column     Non-Null Count  Dtype         
---  ------     --------------  -----         
 0   Country    11346 non-null  object        
 1   Date       11346 non-null  datetime64[ns]
 2   Confirmed  11346 non-null  float64       
 3   Deaths     11346 non-null  float64       
 4   Recovered  11346 non-null  float64       
dtypes: datetime64[ns](1), float64(3), object(1)
memory usage: 443.3+ KB


In [5]:
df_covid
df_covid_world = df_covid.groupby(['Date']).sum()[['Confirmed']].reset_index()

In [6]:
df_covid_world

Unnamed: 0,Date,Confirmed
0,2020-01-22,555.0
1,2020-01-23,653.0
2,2020-01-24,941.0
3,2020-01-25,1434.0
4,2020-01-26,2118.0
...,...,...
57,2020-03-19,242708.0
58,2020-03-20,272166.0
59,2020-03-21,304524.0
60,2020-03-22,335955.0


In [7]:
ax = df_covid_world.plot(x='Date', y='Confirmed', style='b-', grid=True)
ax.set_xlabel("date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

# Feature Engeneering 

In [8]:
# Add a column 'order_day' to indicate the order of the rows by date
df_covid_world['order_day'] = [x for x in list(range(len(df_covid_world)))]

# merging_keys
merging_keys = ['order_day']

# List of columns that we will use to create lags
lag_cols = ['Confirmed']
lag_cols

['Confirmed']

In [9]:
N = 2
shift_range = [x+1 for x in range(N)]

for shift in tqdm_notebook(shift_range):
    train_shift = df_covid_world[merging_keys + lag_cols].copy()
    
    # E.g. order_day of 0 becomes 1, for shift = 1.
    # So when this is merged with order_day of 1 in df_covid_world, this will represent lag of 1.
    train_shift['order_day'] = train_shift['order_day'] + shift
    
    foo = lambda x: '{}_lag_{}'.format(x, shift) if x in lag_cols else x
    train_shift = train_shift.rename(columns=foo)

    df_covid_world = pd.merge(df_covid_world, train_shift, on=merging_keys, how='left') #.fillna(0)
    
del train_shift

# Remove the first N rows which contain NaNs
df_covid_world = df_covid_world[N:]
    
df_covid_world.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  after removing the cwd from sys.path.


HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




Unnamed: 0,Date,Confirmed,order_day,Confirmed_lag_1,Confirmed_lag_2
2,2020-01-24,941.0,2,653.0,555.0
3,2020-01-25,1434.0,3,941.0,653.0
4,2020-01-26,2118.0,4,1434.0,941.0
5,2020-01-27,2927.0,5,2118.0,1434.0
6,2020-01-28,5578.0,6,2927.0,2118.0


In [10]:
df_covid_world.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 60 entries, 2 to 61
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   Date             60 non-null     datetime64[ns]
 1   Confirmed        60 non-null     float64       
 2   order_day        60 non-null     int64         
 3   Confirmed_lag_1  60 non-null     float64       
 4   Confirmed_lag_2  60 non-null     float64       
dtypes: datetime64[ns](1), float64(3), int64(1)
memory usage: 2.8 KB


# Get mean and std dev at timestamp t using values from t-1, ..., t-N

In [11]:
cols_list = [
"Confirmed"
]

for col in cols_list:
    df_covid_world = get_mov_avg_std(df_covid_world, col, N)
df_covid_world.head()

Unnamed: 0,Date,Confirmed,order_day,Confirmed_lag_1,Confirmed_lag_2,Confirmed_mean,Confirmed_std
2,2020-01-24,941.0,2,653.0,555.0,,
3,2020-01-25,1434.0,3,941.0,653.0,941.0,
4,2020-01-26,2118.0,4,1434.0,941.0,1187.5,348.603643
5,2020-01-27,2927.0,5,2118.0,1434.0,1776.0,483.661038
6,2020-01-28,5578.0,6,2927.0,2118.0,2522.5,572.049386


# Split into train, validation and test set

In [12]:
valid_size = 0.2
test_size = 0.2

# Get sizes of each of the datasets
num_valid = int(valid_size*len(df_covid_world))
num_test = int(test_size*len(df_covid_world))
num_train = len(df_covid_world)-num_valid-num_test
print("num_train = ", num_train)
print("num_valid = ", num_valid)
print("num_test = ", num_test)

# Split into train, valid, and test
train = df_covid_world[:num_train]
valid = df_covid_world[num_train:num_train+num_valid]
train_valid = df_covid_world[:num_train+num_valid]
test = df_covid_world[num_train+num_valid:]
print("train.shape = " + str(train.shape))
print("valid.shape = " + str(valid.shape))
print("train_valid.shape = " + str(train_valid.shape))
print("test.shape = " + str(test.shape))

num_train =  36
num_valid =  12
num_test =  12
train.shape = (36, 7)
valid.shape = (12, 7)
train_valid.shape = (48, 7)
test.shape = (12, 7)


# Scale the train, validation and test set¶

In [13]:
cols_to_scale = [
"Confirmed"
]

for i in range(1,N+1):
    cols_to_scale.append("Confirmed_lag_"+str(i))

# Do scaling for train set
# Here we only scale the train dataset, and not the entire dataset to prevent information leak
scaler = StandardScaler()
train_scaled = scaler.fit_transform(train[cols_to_scale])
print("scaler.mean_ = " + str(scaler.mean_))
print("scaler.var_ = " + str(scaler.var_))
print("train_scaled.shape = " + str(train_scaled.shape))

# Convert the numpy array back into pandas dataframe
train_scaled = pd.DataFrame(train_scaled, columns=cols_to_scale)
train_scaled[['Date']] = train.reset_index()[['Date']]
print("train_scaled.shape = " + str(train_scaled.shape))
train_scaled.head()

scaler.mean_ = [45088.19444444 42769.66666667 40486.36111111]
scaler.var_ = [8.99419013e+08 9.06571349e+08 9.06450397e+08]
train_scaled.shape = (36, 3)
train_scaled.shape = (36, 4)


Unnamed: 0,Confirmed,Confirmed_lag_1,Confirmed_lag_2,Date
0,-1.472048,-1.398792,-1.326301,2020-01-24
1,-1.45561,-1.389226,-1.323046,2020-01-25
2,-1.432802,-1.372853,-1.31348,2020-01-26
3,-1.405827,-1.350136,-1.297105,2020-01-27
4,-1.317432,-1.323267,-1.274387,2020-01-28


In [14]:
# Do scaling for train+valid set
scaler_train_valid = StandardScaler()
train_valid_scaled = scaler_train_valid.fit_transform(train_valid[cols_to_scale])
print("scaler_train_valid.mean_ = " + str(scaler_train_valid.mean_))
print("scaler_train_valid.var_ = " + str(scaler_train_valid.var_))
print("train_valid_scaled.shape = " + str(train_valid_scaled.shape))

# Convert the numpy array back into pandas dataframe
train_valid_scaled = pd.DataFrame(train_valid_scaled, columns=cols_to_scale)
train_valid_scaled[['Date']] = train_valid.reset_index()[['Date']]
print("train_valid_scaled.shape = " + str(train_valid_scaled.shape))
train_valid_scaled.head()

scaler_train_valid.mean_ = [59356.6875     56748.10416667 54289.        ]
scaler_train_valid.var_ = [1.32241691e+09 1.29525319e+09 1.27531011e+09]
train_valid_scaled.shape = (48, 3)
train_valid_scaled.shape = (48, 4)


Unnamed: 0,Confirmed,Confirmed_lag_1,Confirmed_lag_2,Date
0,-1.606369,-1.558646,-1.50467,2020-01-24
1,-1.592812,-1.550644,-1.501926,2020-01-25
2,-1.574003,-1.536946,-1.493861,2020-01-26
3,-1.551756,-1.51794,-1.480056,2020-01-27
4,-1.478856,-1.495462,-1.460903,2020-01-28


In [15]:
valid_scaled = valid[['Date']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = valid.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    valid_scaled = pd.concat([valid_scaled, temp], axis=1)
    
# Now the entire valid set is scaled
valid_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,Date,Confirmed_lag_1,Confirmed_lag_2
38,2020-02-29,0.707107,-0.707107
39,2020-03-01,0.707107,-0.707107
40,2020-03-02,0.707107,-0.707107
41,2020-03-03,0.707107,-0.707107
42,2020-03-04,0.707107,-0.707107


In [16]:
# Do scaling for test set
test_scaled = test[['Date']]
for col in tqdm_notebook(cols_list):
    feat_list = [col + '_lag_' + str(shift) for shift in range(1, N+1)]
    temp = test.apply(lambda row: scale_row(row[feat_list], row[col+'_mean'], row[col+'_std']), axis=1)
    test_scaled = pd.concat([test_scaled, temp], axis=1)
    
# Now the entire test set is scaled
test_scaled.head()

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  This is separate from the ipykernel package so we can avoid doing imports until


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))




Unnamed: 0,Date,Confirmed_lag_1,Confirmed_lag_2
50,2020-03-12,0.707107,-0.707107
51,2020-03-13,0.707107,-0.707107
52,2020-03-14,0.707107,-0.707107
53,2020-03-15,0.707107,-0.707107
54,2020-03-16,0.707107,-0.707107


# Split into X and y

In [17]:
features = []
for i in range(1,N+1):
    features.append("Confirmed_lag_"+str(i))


target = "Confirmed"


# Split into X and y
X_train = train[features]
y_train = train[target]
X_valid = valid[features]
y_valid = valid[target]
X_train_valid = train_valid[features]
y_train_valid = train_valid[target]
X_sample = test[features]
y_sample = test[target]
print("X_train.shape = " + str(X_train.shape))
print("y_train.shape = " + str(y_train.shape))
print("X_valid.shape = " + str(X_valid.shape))
print("y_valid.shape = " + str(y_valid.shape))
print("X_train_valid.shape = " + str(X_train_valid.shape))
print("y_train_valid.shape = " + str(y_train_valid.shape))
print("X_sample.shape = " + str(X_sample.shape))
print("y_sample.shape = " + str(y_sample.shape))

X_train.shape = (36, 2)
y_train.shape = (36,)
X_valid.shape = (12, 2)
y_valid.shape = (12,)
X_train_valid.shape = (48, 2)
y_train_valid.shape = (48,)
X_sample.shape = (12, 2)
y_sample.shape = (12,)


In [18]:
# Split into X and y
X_train_scaled = train_scaled[features]
y_train_scaled = train_scaled[target]
X_valid_scaled = valid_scaled[features]
X_train_valid_scaled = train_valid_scaled[features]
y_train_valid_scaled = train_valid_scaled[target]
X_sample_scaled = test_scaled[features]
print("X_train_scaled.shape = " + str(X_train_scaled.shape))
print("y_train_scaled.shape = " + str(y_train_scaled.shape))
print("X_valid_scaled.shape = " + str(X_valid_scaled.shape))
print("X_train_valid_scaled.shape = " + str(X_train_valid_scaled.shape))
print("y_train_valid_scaled.shape = " + str(y_train_valid_scaled.shape))
print("X_sample_scaled.shape = " + str(X_sample_scaled.shape))

X_train_scaled.shape = (36, 2)
y_train_scaled.shape = (36,)
X_valid_scaled.shape = (12, 2)
X_train_valid_scaled.shape = (48, 2)
y_train_valid_scaled.shape = (48,)
X_sample_scaled.shape = (12, 2)


# EDA

In [19]:
ax = train.plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid.plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = test.plot(x='Date', y='Confirmed', style='g-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'test'])
ax.set_xlabel("date")
ax.set_ylabel("Confirmed")
ax.set_title("Without scaling")

Text(0.5, 1.0, 'Without scaling')

In [20]:
ax = train_scaled.plot(x='Date', y='Confirmed', style='b-', grid=True)
ax.legend(['train_scaled'])
ax.set_xlabel("date")
ax.set_ylabel("Confirmed (scaled)")
ax.set_title("With scaling")

Text(0.5, 1.0, 'With scaling')

# Train the model using XGBoost


In [21]:
n_estimators = 100             # Number of boosted trees to fit. default = 100
max_depth = 3                  # Maximum tree depth for base learners. default = 3
learning_rate = 0.1            # Boosting learning rate (xgb’s “eta”). default = 0.1
min_child_weight = 1           # Minimum sum of instance weight(hessian) needed in a child. default = 1
subsample = 1                  # Subsample ratio of the training instance. default = 1
colsample_bytree = 1           # Subsample ratio of columns when constructing each tree. default = 1
colsample_bylevel = 1          # Subsample ratio of columns for each split, in each level. default = 1
gamma = 0  
model_seed = 100


In [22]:
# Create the model
model = XGBRegressor(seed=model_seed,
                     n_estimators=n_estimators,
                     max_depth=max_depth,
                     learning_rate=learning_rate,
                     min_child_weight=min_child_weight,
                     subsample=subsample,
                     colsample_bytree=colsample_bytree,
                     colsample_bylevel=colsample_bylevel,
                     gamma=gamma)

# Train the regressor
model.fit(X_train_scaled, y_train_scaled)

XGBRegressor(base_score=None, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.1, max_delta_step=0, max_depth=3,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=100, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, seed=100, subsample=1,
             tree_method=None, validate_parameters=False, verbosity=None)

# Predict on train set

In [23]:
def get_mape(y_true, y_pred): 
    """
    Compute mean absolute percentage error (MAPE)
    """
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return np.mean(np.abs((y_true - y_pred) / y_true)) * 100

In [24]:
# Do prediction on train set
est_scaled = model.predict(X_train_scaled)
est = est_scaled * math.sqrt(scaler.var_[0]) + scaler.mean_[0]

# Calculate RMSE
print("RMSE on train set = %0.3f" % math.sqrt(mean_squared_error(y_train, est)))

# Calculate MAPE
print("MAPE on train set = %0.3f%%" % get_mape(y_train, est))

RMSE on train set = 32.665
MAPE on train set = 0.543%


In [25]:
est_df = pd.DataFrame({'est': est, 
                       'Date': train['Date']})

ax = train.plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid.plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = test.plot(x='Date', y='Confirmed', style='g-', grid=True, ax=ax)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'dev', 'test', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("USD")
ax.set_title('Without scaling')

Text(0.5, 1.0, 'Without scaling')

# Predict on dev set

In [26]:
valid

Unnamed: 0,Date,Confirmed,order_day,Confirmed_lag_1,Confirmed_lag_2,Confirmed_mean,Confirmed_std
38,2020-02-29,86011.0,38,84120.0,82754.0,83437.0,965.907863
39,2020-03-01,88369.0,39,86011.0,84120.0,85065.5,1337.138923
40,2020-03-02,90306.0,40,88369.0,86011.0,87190.0,1667.35779
41,2020-03-03,92840.0,41,90306.0,88369.0,89337.5,1369.665835
42,2020-03-04,95120.0,42,92840.0,90306.0,91573.0,1791.808584
43,2020-03-05,97882.0,43,95120.0,92840.0,93980.0,1612.203461
44,2020-03-06,101784.0,44,97882.0,95120.0,96501.0,1953.02893
45,2020-03-07,105821.0,45,101784.0,97882.0,99833.0,2759.13066
46,2020-03-08,109795.0,46,105821.0,101784.0,103802.5,2854.590076
47,2020-03-09,113561.0,47,109795.0,105821.0,107808.0,2810.042348


In [27]:
# Do prediction on test set
est_scaled = model.predict(X_valid_scaled)
valid['est_scaled'] = est_scaled
valid['est'] = valid['est_scaled'] * valid['Confirmed_std'] + valid['Confirmed_mean']

# Calculate RMSE
rmse_bef_tuning = math.sqrt(mean_squared_error(y_valid, valid['est']))
print("RMSE on dev set = %0.3f" % rmse_bef_tuning)

# Calculate MAPE
mape_bef_tuning = get_mape(y_valid, valid['est'])
print("MAPE on dev set = %0.3f%%" % mape_bef_tuning)

RMSE on dev set = 3583.329
MAPE on dev set = 3.114%


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [28]:
est_df = pd.DataFrame({'est': valid['est'], 
                       'y_valid': y_valid,
                       'Date': valid['Date']})

ax = train.plot(x='Date', y='Confirmed', style='b-', grid=True)
ax = valid.plot(x='Date', y='Confirmed', style='y-', grid=True, ax=ax)
ax = test.plot(x='Date', y='Confirmed', style='g-', grid=True, ax=ax)
ax = est_df.plot(x='Date', y='est', style='r-', grid=True, ax=ax)
ax.legend(['train', 'validation', 'test', 'predictions'])
ax.set_xlabel("Date")
ax.set_ylabel("Confirmed")

Text(0, 0.5, 'Confirmed')

In [29]:
_ = plot_importance(model, height=0.999)