In [114]:
import datetime
import os

import numpy as np
import pandas as pd
import torch

# Construct the dataset

In [115]:
def downloadData(first_date=datetime.date(2020, 2, 24),
                 last_date=datetime.date(2022, 8, 4),
                 output_dir='date_datasets'):
    """Download the daily per-region CSV snapshots and store them locally.

    One file per calendar day is fetched from the ``dati-regioni`` folder of
    the pcm-dpc/COVID-19 GitHub repository and written, unchanged apart from
    the pandas round trip, to ``<output_dir>/<YYYYMMDD>.csv``.

    Parameters
    ----------
    first_date : datetime.date
        First day to download (inclusive).
    last_date : datetime.date
        Last day to download (inclusive).
    output_dir : str
        Folder that receives the files; it is created when missing.

    Notes
    -----
    Any error raised by pandas while fetching or parsing a day (for example
    when the remote file does not exist) propagates to the caller, and the
    days already written stay on disk.
    """
    base_url = ('https://raw.githubusercontent.com/pcm-dpc/COVID-19/master/'
                'dati-regioni/dpc-covid19-ita-regioni-')

    # to_csv does not create missing folders, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    n_days = (last_date - first_date).days
    # range(n_days + 1) covers both endpoints; the previous range(1, n_days)
    # silently dropped first_date as well as last_date.
    for delta in range(n_days + 1):
        day = first_date + datetime.timedelta(days=delta)
        stamp = day.strftime("%Y%m%d")
        snapshot = pd.read_csv(base_url + stamp + '.csv')
        snapshot.to_csv(os.path.join(output_dir, stamp + '.csv'), index=False)

In [116]:
#downloadData()

In [117]:
# Region names used to select rows from the daily snapshots.
regioni = [
    "Abruzzo",
    "Basilicata",
    "Calabria",
    "Campania",
    "Emilia-Romagna",
    "Friuli-Venezia Giulia",
    "Lazio",
    "Liguria",
    "Lombardia",
    "Marche",
    "Molise",
    "P.A. Bolzano",
    "P.A. Trento",
    "Piemonte",
    "Puglia",
    "Sardegna",
    "Sicilia",
    "Toscana",
    "Umbria",
    "Valle d'Aosta",
    "Veneto",
]
# Same regions, in the same order, with Friuli-Venezia Giulia left out.
regioni_no_friuli = [nome for nome in regioni if nome != "Friuli-Venezia Giulia"]

# Start in September 2020 because the data from the early period were unreliable.
first_date = datetime.date(2020, 9, 1)
last_date = datetime.date(2022, 8, 4)

In [118]:
def dateToRegion(regioni, first_date, last_date):
    """Turn the per-day snapshots into one time-series CSV per region.

    For every region in ``regioni`` and every day ``d`` with
    ``first_date < d < last_date`` one row is produced:

    * ``perc_story``: ``nuovi_positivi`` of day ``d`` divided by the swabs
      performed on that day (difference of the cumulative ``tamponi`` column
      between ``d`` and the day before), as a percentage rounded to two
      decimals;
    * ``ospedalizzati_story``: ``totale_ospedalizzati`` of day ``d``.

    The snapshots are read from ``date_datasets/<YYYYMMDD>.csv`` (files for
    ``first_date`` up to the day before ``last_date`` are needed) and the
    result is written to ``region_datasets/<region>.csv``.

    Parameters
    ----------
    regioni : iterable of str
        Region names, matched against the ``denominazione_regione`` column.
    first_date, last_date : datetime.date
        Exclusive bounds of the days that end up in the output.

    Raises
    ------
    ValueError
        If a region has no row in one of the required snapshots.
    """
    columns = ["perc_story", "ospedalizzati_story"]
    snapshots = {}

    def load_snapshot(day):
        # Every file is needed as "today" and as "yesterday" for each region,
        # so parse it once and reuse the frame.
        if day not in snapshots:
            snapshots[day] = pd.read_csv('date_datasets/' + day.strftime("%Y%m%d") + '.csv')
        return snapshots[day]

    def region_rows(day, regione):
        frame = load_snapshot(day)
        rows = frame[frame['denominazione_regione'] == regione]
        if rows.empty:
            # Fail loudly: carrying on would reuse values from another day.
            raise ValueError(f"Region {regione!r} not found in the snapshot of {day}")
        return rows

    os.makedirs("region_datasets", exist_ok=True)
    n_days = (last_date - first_date).days

    # Iterate over the argument, not over a module-level list.
    for regione in regioni:
        records = []

        for delta in range(1, n_days):
            today = first_date + datetime.timedelta(days=delta)
            yesterday = today - datetime.timedelta(days=1)
            regione_present = region_rows(today, regione)
            regione_past = region_rows(yesterday, regione)

            # 'tamponi' is cumulative: the daily count is the day-over-day difference.
            tamponi_oggi = regione_present['tamponi'].values[0] - regione_past['tamponi'].values[0]
            nuovi_positivi = regione_present['nuovi_positivi'].values[0]
            # NumPy scalar arithmetic: presumably a day with zero new swabs
            # gives inf/NaN plus a warning instead of raising (assumes numeric columns).
            percentuale = np.around(nuovi_positivi / tamponi_oggi * 100, 2)
            ospedalizzati_attuali = regione_present['totale_ospedalizzati'].values[0]

            records.append((percentuale, ospedalizzati_attuali))

        # Build the frame once instead of concatenating row by row.
        regione_csv = pd.DataFrame(records, columns=columns)
        regione_csv.to_csv(f"region_datasets/{regione}.csv", index=False)

In [119]:
#dateToRegion(regioni_no_friuli, first_date, last_date)

In [120]:
# Plot every column

# for regione in regioni_no_friuli:
#     df = pd.read_csv(f"region_datasets/{regione}.csv")
#     df.plot(y=['perc_story'], kind="line", figsize=(10, 10), color='red', title=regione)
#     df.plot(y=['ospedalizzati_story'], kind="line", figsize=(10, 10), color='orange', title=regione)

# Preprocess the datasets

## Merge all the datasets and window them

In [121]:
def preprocessDatasets(regions_array, window_length=14):
    """Cut the hospitalisation series of several regions into sliding windows.

    For each region the ``ospedalizzati_story`` column of
    ``region_datasets/<region>.csv`` is read and every run of
    ``window_length`` consecutive values (stride 1) becomes one row. The rows
    of all regions are stacked in the order given by ``regions_array``.

    Parameters
    ----------
    regions_array : iterable of str
        Region names; each must have a CSV in ``region_datasets/``.
    window_length : int, default 14
        Number of consecutive values per window.

    Returns
    -------
    numpy.ndarray
        Float array of shape ``(total_windows, window_length)``. A region
        whose series is shorter than ``window_length`` contributes no rows.

    Raises
    ------
    ValueError
        If ``window_length`` is smaller than 1.
    """
    if window_length < 1:
        raise ValueError("window_length must be a positive integer")

    offsets = np.arange(window_length)[None, :]
    # The empty float seed keeps the result float64 with the right number of
    # columns even when no region contributes a window.
    windows = [np.empty([0, window_length])]

    for regione in regions_array:
        series = pd.read_csv(f"region_datasets/{regione}.csv")['ospedalizzati_story'].values

        # Row i of the index matrix is [i, i+1, ..., i+window_length-1].
        n_windows = max(series.shape[0] - window_length + 1, 0)
        starts = np.arange(n_windows)[:, None]
        windows.append(series[starts + offsets])

    # Stack once at the end instead of re-copying the growing array per region.
    return np.vstack(windows)

In [122]:
# Regions used for training: neither Friuli-Venezia Giulia nor Lombardia is listed.
regioni_no_friuli_no_lomba = [
    "Abruzzo",
    "Basilicata",
    "Calabria",
    "Campania",
    "Emilia-Romagna",
    "Lazio",
    "Liguria",
    "Marche",
    "Molise",
    "P.A. Bolzano",
    "P.A. Trento",
    "Piemonte",
    "Puglia",
    "Sardegna",
    "Sicilia",
    "Toscana",
    "Umbria",
    "Valle d'Aosta",
    "Veneto",
]

In [123]:
# One row per sliding window, stacked over all training regions.
training_set = preprocessDatasets(regioni_no_friuli_no_lomba)

# Next-step setup: the input drops the last value of each window and the
# target is the same window shifted one position forward.
train_input = torch.from_numpy(training_set[:, :-1]).float()
train_target = torch.from_numpy(training_set[:, 1:]).float()

In [124]:
from numpy import dtype


# Lombardia is not among the training regions and serves as held-out data.
test_set = preprocessDatasets(["Lombardia"])

# Same input/target layout as the training tensors: target = input shifted by one step.
test_input = torch.from_numpy(test_set[:, :-1]).float()
test_target = torch.from_numpy(test_set[:, 1:]).float()

## The network

In [125]:
class LSTM(torch.nn.Module):
    """Two stacked LSTM cells followed by a linear read-out, one scalar per step.

    Parameters
    ----------
    hidden_layers : int, default 64
        Despite the name, this is the number of hidden units of each of the
        two LSTM cells (it is passed as their hidden size).
    """

    def __init__(self, hidden_layers=64):
        super().__init__()
        self.hidden_layers = hidden_layers
        # scalar input -> hidden -> hidden -> scalar output
        self.lstm1 = torch.nn.LSTMCell(1, self.hidden_layers)
        self.lstm2 = torch.nn.LSTMCell(self.hidden_layers, self.hidden_layers)
        self.linear = torch.nn.Linear(self.hidden_layers, 1)

    def forward(self, y, future_preds=0):
        """Run the network over a batch of sequences.

        Parameters
        ----------
        y : torch.Tensor
            Batch of sequences of shape ``(N, T)``; column ``t`` holds the
            scalar observed at step ``t``.
        future_preds : int, default 0
            Number of extra steps generated after the input is exhausted by
            feeding each prediction back in as the next input.

        Returns
        -------
        torch.Tensor
            Shape ``(N, T + future_preds)``; column ``t`` is the output
            emitted after consuming input step ``t``.
        """
        outputs, num_samples = [], y.size(0)

        # Allocate the zero states from the input so they share its device and
        # dtype; hard-coded CPU float32 states break once the model or the
        # data are moved elsewhere (e.g. .cuda() or .double()).
        h_t = y.new_zeros(num_samples, self.hidden_layers)
        c_t = y.new_zeros(num_samples, self.hidden_layers)
        h_t2 = y.new_zeros(num_samples, self.hidden_layers)
        c_t2 = y.new_zeros(num_samples, self.hidden_layers)

        for time_step in y.split(1, dim=1):
            # time_step has shape (N, 1)
            h_t, c_t = self.lstm1(time_step, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)

        for _ in range(future_preds):
            # Same step as above, driven by the previous prediction.
            h_t, c_t = self.lstm1(output, (h_t, c_t))
            h_t2, c_t2 = self.lstm2(h_t, (h_t2, c_t2))
            output = self.linear(h_t2)
            outputs.append(output)

        # list of (N, 1) tensors -> (N, T + future_preds)
        return torch.cat(outputs, dim=1)

## The fit

In [126]:
# Hyper-parameters of the run, named so they are easy to find and tune.
HIDDEN_UNITS = 64
LBFGS_LEARNING_RATE = 0.08

model = LSTM(HIDDEN_UNITS)
criterion = torch.nn.MSELoss()
optimiser = torch.optim.LBFGS(model.parameters(), lr=LBFGS_LEARNING_RATE)

In [127]:
def training_loop(n_epochs, model, optimiser, loss_fn, train_input, train_target, test_input, test_target):
    """Fit ``model`` on the training tensors and report the loss after every epoch.

    Each epoch performs one ``optimiser.step(closure)`` on the full training
    set, then evaluates the training and validation loss without tracking
    gradients and prints both.

    Parameters
    ----------
    n_epochs : int
        Number of optimiser steps to perform.
    model : torch.nn.Module
        Called as ``model(inputs)``.
    optimiser : torch.optim.Optimizer
        Optimiser whose ``step`` accepts a closure (required by LBFGS, which
        may evaluate the closure several times within a single step).
    loss_fn : callable
        ``loss_fn(prediction, target)`` returning a scalar tensor.
    train_input, train_target : torch.Tensor
        Training data.
    test_input, test_target : torch.Tensor
        Held-out data, used only for reporting.

    Returns
    -------
    list of (float, float)
        ``(training_loss, validation_loss)`` for every epoch, in order.
    """

    def closure():
        # Recompute loss and gradients from scratch on every call.
        optimiser.zero_grad()
        loss = loss_fn(model(train_input), train_target)
        loss.backward()
        return loss

    history = []

    for epoch in range(n_epochs):

        # Train
        optimiser.step(closure)

        # Report: both losses are only logged, so no autograd graph is needed.
        with torch.no_grad():
            loss_train = loss_fn(model(train_input), train_target).item()
            loss_val = loss_fn(model(test_input), test_target).item()

        history.append((loss_train, loss_val))
        print(f"Epoch {epoch+1}, Training loss {loss_train:.4f}, Validation loss {loss_val:.4f}")

    return history

In [128]:
training_loop(100, model, optimiser, criterion, train_input, train_target, test_input, test_target)

Epoch 1, Training loss 741280.0625, Validation loss 10244147.0000
Epoch 2, Training loss 544948.9375, Validation loss 8855690.0000
Epoch 3, Training loss 1017135.3125, Validation loss 7549261.0000
Epoch 4, Training loss 533017.8125, Validation loss 8681516.0000
Epoch 5, Training loss 515587.0312, Validation loss 8338853.5000
Epoch 6, Training loss 503522.0938, Validation loss 8383300.0000
Epoch 7, Training loss 581732.8750, Validation loss 8808748.0000
Epoch 8, Training loss 574452.6250, Validation loss 8877343.0000
Epoch 9, Training loss 568145.0000, Validation loss 8804914.0000
Epoch 10, Training loss 958178.3125, Validation loss 6834364.0000
Epoch 11, Training loss 256357696.0000, Validation loss 220921920.0000
Epoch 12, Training loss 11091334.0000, Validation loss 31382028.0000
Epoch 13, Training loss 1484733.0000, Validation loss 13015797.0000
Epoch 14, Training loss 577596.1875, Validation loss 8908980.0000
Epoch 15, Training loss 576960.6875, Validation loss 8887211.0000
Epoch 1