In [24]:
import numpy as np
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
import os
import tensorflow as tf
from tqdm import tqdm
from sklearn.utils import shuffle
from sklearn.metrics import mean_squared_log_error

from datetime import datetime
from datetime import timedelta

from keras import layers
from keras import Input
from keras.models import Model
from keras.callbacks import ModelCheckpoint, ReduceLROnPlateau, EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [25]:
train_df = gpd.read_file("train.csv")
train_df["ConfirmedCases"] = train_df["ConfirmedCases"].astype("float")
train_df["Fatalities"] = train_df["Fatalities"].astype("float")
#The country_region got modified in the enriched dataset by @optimo, 
# so we have to apply the same change to this Dataframe to facilitate the merge.
train_df["Country_Region"] = [ row.Country_Region.replace("'","").strip(" ") if row.Province_State=="" else str(row.Country_Region+"_"+row.Province_State).replace("'","").strip(" ") for idx,row in train_df.iterrows()]

In [26]:
train_df.head()

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,geometry
0,1,,Afghanistan,2020-01-22,0.0,0.0,
1,2,,Afghanistan,2020-01-23,0.0,0.0,
2,3,,Afghanistan,2020-01-24,0.0,0.0,
3,4,,Afghanistan,2020-01-25,0.0,0.0,
4,5,,Afghanistan,2020-01-26,0.0,0.0,


In [27]:
#Still using the enriched data from week 2 as there is everything required for the model's training
extra_data_df = gpd.read_file("../country_info.csv")
extra_data_df["Country_Region"] = [country_name.replace("'","") for country_name in extra_data_df["country"]]
extra_data_df.head()

Unnamed: 0,field_1,Timestamp,country,start_date,end_date,lockdown_severity,leasure_shopping_prohibited,maximum_together,mentality_grade,government_performance,...,jun_temp,july_temp,aug_temp,sept_temp,oct_temp,nov_temp,dec_temp,annual_temp,geometry,Country_Region
0,0,4/10/2020 14:03:47,Belgium,3/18/2020,,5.0,1,2.0,7.0,6.0,...,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412,,Belgium
1,1,4/15/2020 12:15:09,Croatia,3/18/2020,,8.0,1,,8.0,9.0,...,17.9920712546,20.0398930136,19.6672854418,16.355071476400006,11.3496107036,5.95976431786,1.52857298393,10.4709106861,,Croatia
2,2,4/24/2020 8:17:43,Denmark,3/11/2020,,5.0,1,10.0,3.0,5.0,...,14.4017857146,15.8914285386,15.9023573054,13.0076785775,9.35905722214,4.94214644714,1.64559750071,7.81897849714,,Denmark
3,3,4/10/2020 14:02:19,France,3/16/2020,,7.1,1,2.0,7.4,7.4,...,16.0979416867,18.6038650936,18.2286943859,15.7827178334,11.7466920652,6.69005464548,3.86573959333,10.5437222252,,France
4,4,4/15/2020 12:02:17,Germany,3/20/2020,,6.0,1,2.0,7.0,8.0,...,15.4968931021,17.188080171099998,16.96864174,13.9023048175,9.43178396455,4.27051657064,0.880767919412,8.504451891819999,,Germany


In [28]:
extra_data_df.head()

Unnamed: 0,field_1,Timestamp,country,start_date,end_date,lockdown_severity,leasure_shopping_prohibited,maximum_together,mentality_grade,government_performance,...,jun_temp,july_temp,aug_temp,sept_temp,oct_temp,nov_temp,dec_temp,annual_temp,geometry,Country_Region
0,0,4/10/2020 14:03:47,Belgium,3/18/2020,,5.0,1,2.0,7.0,6.0,...,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412,,Belgium
1,1,4/15/2020 12:15:09,Croatia,3/18/2020,,8.0,1,,8.0,9.0,...,17.9920712546,20.0398930136,19.6672854418,16.355071476400006,11.3496107036,5.95976431786,1.52857298393,10.4709106861,,Croatia
2,2,4/24/2020 8:17:43,Denmark,3/11/2020,,5.0,1,10.0,3.0,5.0,...,14.4017857146,15.8914285386,15.9023573054,13.0076785775,9.35905722214,4.94214644714,1.64559750071,7.81897849714,,Denmark
3,3,4/10/2020 14:02:19,France,3/16/2020,,7.1,1,2.0,7.4,7.4,...,16.0979416867,18.6038650936,18.2286943859,15.7827178334,11.7466920652,6.69005464548,3.86573959333,10.5437222252,,France
4,4,4/15/2020 12:02:17,Germany,3/20/2020,,6.0,1,2.0,7.0,8.0,...,15.4968931021,17.188080171099998,16.96864174,13.9023048175,9.43178396455,4.27051657064,0.880767919412,8.504451891819999,,Germany


In [47]:
train_df = train_df.drop('geometry',axis=1)
extra_data_df = extra_data_df.drop('geometry',axis=1)

In [50]:
train_df = train_df.merge(extra_data_df,on='Country_Region')

Unnamed: 0,Id,Province_State,Country_Region,Date,ConfirmedCases,Fatalities,field_1,Timestamp,country,start_date,...,apr_temp,may_temp,jun_temp,july_temp,aug_temp,sept_temp,oct_temp,nov_temp,dec_temp,annual_temp
0,2623,,Belgium,2020-01-22,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
1,2624,,Belgium,2020-01-23,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
2,2625,,Belgium,2020-01-24,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
3,2626,,Belgium,2020-01-25,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
4,2627,,Belgium,2020-01-26,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
5,2628,,Belgium,2020-01-27,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
6,2629,,Belgium,2020-01-28,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
7,2630,,Belgium,2020-01-29,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
8,2631,,Belgium,2020-01-30,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412
9,2632,,Belgium,2020-01-31,0.0,0.0,0,4/10/2020 14:03:47,Belgium,3/18/2020,...,8.14548228765,12.3979999324,15.4345294171,17.2385293171,17.1832941847,14.4274116965,10.6783882024,5.76197057647,3.11392355706,9.51424127412


In [54]:
#Just getting rid of the first days to have a multiple of 7
#Makes it easier to generate the sequences
train_df = train_df.query("Date>'2020-01-22'and Date<'2020-04-01'")
days_in_sequence = 21

trend_list = []

with tqdm(total=len(list(train_df.Country_Region.unique()))) as pbar:
    for country in train_df.Country_Region.unique():
        for province in train_df.query(f"Country_Region=='{country}'").Province_State.unique():
            province_df = train_df.query(f"Country_Region=='{country}' and Province_State=='{province}'")
            
            #I added a quick hack to double the number of sequences
            #Warning: This will later create a minor leakage from the 
            # training set into the validation set.
            for i in range(0,len(province_df),int(days_in_sequence/3)):
                if i+days_in_sequence<=len(province_df):
                    #prepare all the temporal inputs
                    infection_trend = [float(x) for x in province_df[i:i+days_in_sequence-1].ConfirmedCases.values]
                    fatality_trend = [float(x) for x in province_df[i:i+days_in_sequence-1].Fatalities.values]

                    expected_cases = float(province_df.iloc[i+days_in_sequence-1].ConfirmedCases)
                    expected_fatalities = float(province_df.iloc[i+days_in_sequence-1].Fatalities)

                    trend_list.append({"infection_trend":infection_trend,
                                     "fatality_trend":fatality_trend,
                            
                                     "expected_cases":expected_cases,
                                     "expected_fatalities":expected_fatalities})
        pbar.update(1)
trend_df = pd.DataFrame(trend_list)

100%|██████████| 15/15 [00:00<00:00, 106.06it/s]


In [56]:
trend_df["temporal_inputs"] = [np.asarray([trends["infection_trend"],trends["fatality_trend"]]) for idx,trends in trend_df.iterrows()]

trend_df = shuffle(trend_df)

In [57]:
trend_df.head()

Unnamed: 0,expected_cases,expected_fatalities,fatality_trend,infection_trend,temporal_inputs
78,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
89,3028.0,28.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[8.0, 8.0, 18.0, 27.0, 42.0, 56.0, 90.0, 114.0...","[[8.0, 8.0, 18.0, 27.0, 42.0, 56.0, 90.0, 114...."
38,6.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
33,12327.0,28.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[46.0, 48.0, 79.0, 130.0, 159.0, 196.0, 262.0,...","[[46.0, 48.0, 79.0, 130.0, 159.0, 196.0, 262.0..."
7,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [58]:
i=0
temp_df = pd.DataFrame()
for idx,row in trend_df.iterrows():
    if sum(row.infection_trend)>0:
        temp_df = temp_df.append(row)
    else:
        if i<25:
            temp_df = temp_df.append(row)
            i+=1
trend_df = temp_df

In [59]:
trend_df.head()

Unnamed: 0,expected_cases,expected_fatalities,fatality_trend,infection_trend,temporal_inputs
78,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
89,3028.0,28.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, ...","[8.0, 8.0, 18.0, 27.0, 42.0, 56.0, 90.0, 114.0...","[[8.0, 8.0, 18.0, 27.0, 42.0, 56.0, 90.0, 114...."
38,6.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
33,12327.0,28.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[46.0, 48.0, 79.0, 130.0, 159.0, 196.0, 262.0,...","[[46.0, 48.0, 79.0, 130.0, 159.0, 196.0, 262.0..."
7,0.0,0.0,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [79]:
sequence_length = 30
training_percentage = 0.9

In [80]:
training_item_count = int(len(trend_df)*training_percentage)
validation_item_count = len(trend_df)-int(len(trend_df)*training_percentage)
training_df = trend_df[:training_item_count]
validation_df = trend_df[training_item_count:]

In [81]:
X_temporal_train = np.asarray(np.transpose(np.reshape(np.asarray([np.asarray(x) for x in training_df["temporal_inputs"].values]),(training_item_count,5,sequence_length)),(0,2,1) )).astype(np.float32)
Y_cases_train = np.asarray([np.asarray(x) for x in training_df["expected_cases"]]).astype(np.float32)
Y_fatalities_train = np.asarray([np.asarray(x) for x in training_df["expected_fatalities"]]).astype(np.float32)

ValueError: cannot reshape array of size 3760 into shape (94,5,30)