In [1]:
# importing the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
# reading in the data file
corona = pd.read_csv("./Data/covid19-in-india/covid_19_india.csv", parse_dates = [1], dayfirst = True)

# dropping the columns which don't add to the analysis
corona = corona.drop(["Sno", "Time", "State/UnionTerritory"], axis = 1)

# converting "Date" into Datetime format
corona["Date"] = pd.to_datetime(corona["Date"])

In [3]:
# grouping the cases by date for easier analysis
datewise = corona.groupby(["Date"]).agg({"ConfirmedIndianNational": 'sum', "ConfirmedForeignNational": 'sum', "Cured": "sum", "Deaths":'sum'})

# making the "Date" column part of the database
datewise["Date"] = datewise.index

# making the index column
ind = []
for i in range(len(datewise)):
    ind.append(i)
datewise["Index"] = ind
datewise.set_index("Index", inplace = True)

# resetting DataFrame with columns in desired order
datewise = datewise[["Date", "ConfirmedIndianNational", "ConfirmedForeignNational", "Cured", "Deaths"]]

In [4]:
# adding a new column to the database which counts the days since the first case was reported
datewise["Days Since"] = datewise["Date"] - datewise["Date"][0]
datewise["Days Since"] = datewise["Days Since"].dt.days

# removing the "Date" column
datewise = datewise.drop(["Date"], axis = 1)

# removing the rows with entries that are not integers
to_remove = []
for idx, i in datewise.iterrows():
    try:
        k = int(i["ConfirmedIndianNational"])
    except:
        to_remove.append(idx)
datewise = datewise.drop(to_remove)

# Hyper-parameter search

In [5]:
# importing the requisite libraries and making the train, validation data and an emoty list to store the model's MSE
from statsmodels.tsa.arima_model import ARIMA
from sklearn.metrics import mean_squared_error
import numpy as np

train_ml = datewise.iloc[: int(datewise.shape[0] * 0.85)]
valid_ml = datewise.iloc[int(datewise.shape[0] * 0.85):]
mse = []

In [6]:
def evaluate(p, d, q):
    # making and training the model
    log_series = np.log(train_ml["ConfirmedIndianNational"])
    model_arima = ARIMA(log_series, (p, d, q))
    model_arima_fit = model_arima.fit()
    # making the predictions
    prediction_arima = model_arima_fit.forecast(len(valid_ml))[0]
    # printing the root mean square error
    mse.append(np.sqrt(mean_squared_error(list(valid_ml["ConfirmedIndianNational"]), np.exp(prediction_arima))))

In [7]:
# evaluate parameters
p_values = [0, 1, 2, 4, 6, 8, 10]
d_values = range(0, 3)
q_values = range(0, 3)
msg = "Could not find the MSE for: "
arr = []
for i in p_values:
    for j in d_values:
        for k in q_values:
            arr.append(str(i) + "\t" + str(j) + "\t" + str(k))
            try:
                evaluate(i, j, k)
            except:
                print(msg, str(i), str(j), str(k))
                
least_mse = min(mse)

Could not find the MSE for:  0 0 2




Could not find the MSE for:  1 0 1
Could not find the MSE for:  1 0 2
Could not find the MSE for:  2 0 1
Could not find the MSE for:  2 0 2
Could not find the MSE for:  2 2 2
Could not find the MSE for:  4 0 0
Could not find the MSE for:  4 0 1
Could not find the MSE for:  4 0 2




Could not find the MSE for:  6 0 1
Could not find the MSE for:  6 0 2




Could not find the MSE for:  8 0 2




In [8]:
for idx, i in enumerate(mse):
    if i == least_mse:
        print(arr[idx])

2	0	1
