In [2]:
import json

import math
import pandas as pd 
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.notebook import trange

In [3]:
ori_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv')

In [4]:
df = ori_df[['location','total_deaths','date']].copy()

In [5]:
df["total_deaths"] = df["total_deaths"].fillna(0)
df.head

<bound method NDFrame.head of            location  total_deaths        date
0       Afghanistan           0.0  2020-01-05
1       Afghanistan           0.0  2020-01-06
2       Afghanistan           0.0  2020-01-07
3       Afghanistan           0.0  2020-01-08
4       Afghanistan           0.0  2020-01-09
...             ...           ...         ...
383883     Zimbabwe        5739.0  2024-02-28
383884     Zimbabwe        5739.0  2024-02-29
383885     Zimbabwe        5739.0  2024-03-01
383886     Zimbabwe        5739.0  2024-03-02
383887     Zimbabwe        5740.0  2024-03-03

[383888 rows x 3 columns]>

In [6]:
asia=df.loc[df['location'] == 'Asia']
africa=df.loc[df['location'] == 'Africa']
europe=df.loc[df['location'] == 'Europe']
north_america=df.loc[df['location'] == 'North America']
ocenania=df.loc[df['location'] == 'Ocenania']
south_America=df.loc[df['location'] == 'South America']

In [7]:
new_df=pd.concat([asia,africa,europe,north_america,ocenania,south_America]).reset_index().drop(columns='index')

In [8]:
new_df.head()

Unnamed: 0,location,total_deaths,date
0,Asia,0.0,2020-01-05
1,Asia,0.0,2020-01-06
2,Asia,0.0,2020-01-07
3,Asia,0.0,2020-01-08
4,Asia,0.0,2020-01-09


In [19]:
new_df['date']

object


In [10]:
new_df['year']=pd.to_datetime(new_df['date']).dt.year
new_df['month']=pd.to_datetime(new_df['date']).dt.month
new_df['day']=pd.to_datetime(new_df['date']).dt.day

In [11]:
asia.tail()

Unnamed: 0,location,total_deaths,date
19785,Asia,0.0,2024-03-16
19786,Asia,0.0,2024-03-17
19787,Asia,0.0,2024-03-18
19788,Asia,0.0,2024-03-19
19789,Asia,0.0,2024-03-20


In [12]:
history = asia['total_deaths']\
    .reset_index()\
    .drop(columns='index')

history.head()

Unnamed: 0,total_deaths
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## Start predicting next 5 days

In [13]:
period = 5

In [14]:
## 1. Train model up until most current day
model = ARIMA(history, order=(1,1,0))
model_fit = model.fit()

## 2. Predict in next 'period' days
yhat = model_fit.forecast(period)

### Add predicted values back to dataframe

In [15]:
last_date = asia['date'].tail(1).item()
last_date

'2024-03-20'

In [52]:
asia_predicted = [
    {
        'location': 'Asia',
        'total_deaths': x,
        'date': pd.Timedelta(i + 1, 'd') + pd.to_datetime(last_date)
    }
    for i, x in enumerate(yhat)
]
df_asia_pred = pd.DataFrame.from_records(asia_predicted)
df_asia_pred.head()

Unnamed: 0,location,total_deaths,date
0,Asia,8.645399e-173,2024-03-21
1,Asia,8.645263e-173,2024-03-22
2,Asia,8.645263e-173,2024-03-23
3,Asia,8.645263e-173,2024-03-24
4,Asia,8.645263e-173,2024-03-25


In [57]:
df_asia_new = pd.concat([asia, df_asia_pred])
df_asia_new["date"]=pd.to_datetime(df_asia_new["date"]).dt.to_period(freq="D")
df_asia_new.tail(10)

Unnamed: 0,location,total_deaths,date
19785,Asia,0.0,2024-03-16
19786,Asia,0.0,2024-03-17
19787,Asia,0.0,2024-03-18
19788,Asia,0.0,2024-03-19
19789,Asia,0.0,2024-03-20
0,Asia,8.645399e-173,2024-03-21
1,Asia,8.645263e-173,2024-03-22
2,Asia,8.645263e-173,2024-03-23
3,Asia,8.645263e-173,2024-03-24
4,Asia,8.645263e-173,2024-03-25


In [54]:
df_asia_new.index[-1]

4

In [56]:
import numpy as np
df_asia_new['total_deaths'].apply(np.int64) 

18253    0
18254    0
18255    0
18256    0
18257    0
        ..
0        0
1        0
2        0
3        0
4        0
Name: total_deaths, Length: 1542, dtype: int64