In [25]:
import json

import math
import pandas as pd 
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.notebook import trange

In [26]:
ori_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv',encoding='unicode-escape')

In [27]:
df = ori_df[['location','total_deaths','date']].copy()

In [28]:
df["total_deaths"] = df["total_deaths"].fillna(0)
df.head

<bound method NDFrame.head of            location  total_deaths        date
0       Afghanistan           0.0  2020-01-05
1       Afghanistan           0.0  2020-01-06
2       Afghanistan           0.0  2020-01-07
3       Afghanistan           0.0  2020-01-08
4       Afghanistan           0.0  2020-01-09
...             ...           ...         ...
401003     Zimbabwe        5740.0  2024-05-08
401004     Zimbabwe        5740.0  2024-05-09
401005     Zimbabwe        5740.0  2024-05-10
401006     Zimbabwe        5740.0  2024-05-11
401007     Zimbabwe        5740.0  2024-05-12

[401008 rows x 3 columns]>

In [29]:
asia=df.loc[df['location'] == 'Asia']
africa=df.loc[df['location'] == 'Africa']
europe=df.loc[df['location'] == 'Europe']
north_america=df.loc[df['location'] == 'North America']
oceania=df.loc[df['location'] == 'Oceania'].reset_index()
south_America=df.loc[df['location'] == 'South America']

In [30]:
new_df=pd.concat([asia,africa,europe,oceania,north_america,south_America]).reset_index().drop(columns='index')

In [31]:
new_df.tail()

Unnamed: 0,level_0,location,total_deaths,date
9575,334829,South America,1354147.0,2024-05-08
9576,334830,South America,1354147.0,2024-05-09
9577,334831,South America,1354147.0,2024-05-10
9578,334832,South America,1354147.0,2024-05-11
9579,334833,South America,1354153.0,2024-05-12


In [32]:
new_df['date']

0       2020-01-05
1       2020-01-06
2       2020-01-07
3       2020-01-08
4       2020-01-09
           ...    
9575    2024-05-08
9576    2024-05-09
9577    2024-05-10
9578    2024-05-11
9579    2024-05-12
Name: date, Length: 9580, dtype: object

In [33]:
new_df['year']=pd.to_datetime(new_df['date']).dt.year
new_df['month']=pd.to_datetime(new_df['date']).dt.month
new_df['day']=pd.to_datetime(new_df['date']).dt.day

In [34]:
oceania['date'] = pd.to_datetime(new_df['date'])
dt_start = pd.to_datetime('2020-01-05')
dt_end = pd.to_datetime('2024-04-28')
oceania=oceania[oceania['date'].between(dt_start, dt_end)]


In [35]:
oceania.head()

Unnamed: 0,index,location,total_deaths,date
0,266744,Oceania,0.0,2020-01-05
1,266745,Oceania,0.0,2020-01-06
2,266746,Oceania,0.0,2020-01-07
3,266747,Oceania,0.0,2020-01-08
4,266748,Oceania,0.0,2020-01-09


In [36]:
oceania.loc[:, 'total_deaths_processed'] = oceania['total_deaths'].diff(periods=1)

oceania.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,266744,Oceania,0.0,2020-01-05,
1,266745,Oceania,0.0,2020-01-06,0.0
2,266746,Oceania,0.0,2020-01-07,0.0
3,266747,Oceania,0.0,2020-01-08,0.0
4,266748,Oceania,0.0,2020-01-09,0.0


In [37]:
history = oceania.iloc[1:]['total_deaths_processed']\
    .reset_index()\
    .drop(columns='index')

history.head()

Unnamed: 0,total_deaths_processed
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## Start predicting 

In [38]:
period = 365

In [39]:
## 1. Train model up until most current day
model = ARIMA(history, order=(5,1,0))
model_fit = model.fit()

## 2. Predict in next 'period' days
yhat = model_fit.forecast(period)

yhat

1575     7.022841
1576     8.186763
1577     9.544970
1578    11.129195
1579    12.976478
          ...    
1935    12.012430
1936    12.012430
1937    12.012430
1938    12.012430
1939    12.012430
Name: predicted_mean, Length: 365, dtype: float64

### Add predicted values back to dataframe

In [40]:
last_date = oceania['date'].tail(1).item()
last_date

Timestamp('2024-04-28 00:00:00')

In [41]:
oceania_predicted = [
    {
        'location': 'Oceania',
        'total_deaths_processed': x,
        'date': pd.Timedelta(i + 1, 'd') + pd.to_datetime(last_date)
    }
    for i, x in enumerate(yhat)
]
df_oceania_pred = pd.DataFrame.from_records(oceania_predicted)
df_oceania_pred.head()

Unnamed: 0,location,total_deaths_processed,date
0,Oceania,7.022841,2024-04-29
1,Oceania,8.186763,2024-04-30
2,Oceania,9.54497,2024-05-01
3,Oceania,11.129195,2024-05-02
4,Oceania,12.976478,2024-05-03


In [42]:
df_oceania_new = pd.concat([oceania, df_oceania_pred])
df_oceania_new["date"]=pd.to_datetime(df_oceania_new["date"]).dt.to_period(freq="D")
df_oceania_new.tail(10)

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
355,,Oceania,,2025-04-19,12.01243
356,,Oceania,,2025-04-20,12.01243
357,,Oceania,,2025-04-21,12.01243
358,,Oceania,,2025-04-22,12.01243
359,,Oceania,,2025-04-23,12.01243
360,,Oceania,,2025-04-24,12.01243
361,,Oceania,,2025-04-25,12.01243
362,,Oceania,,2025-04-26,12.01243
363,,Oceania,,2025-04-27,12.01243
364,,Oceania,,2025-04-28,12.01243


In [43]:
total_death_begin = df_oceania_new.iloc[0]['total_deaths'].item()
idx_begin = df_oceania_new.index[0]

df_oceania_new.loc[idx_begin, 'total_deaths_processed'] = total_death_begin

In [44]:
df_oceania_new.loc[:, 'total_deaths'] = df_oceania_new['total_deaths_processed'].cumsum()
df_oceania_new['total_deaths'] = df_oceania_new['total_deaths'].astype(int)
df_oceania_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,266744.0,Oceania,0,2020-01-05,0.0
1,266745.0,Oceania,0,2020-01-06,0.0
2,266746.0,Oceania,0,2020-01-07,0.0
3,266747.0,Oceania,0,2020-01-08,0.0
4,266748.0,Oceania,0,2020-01-09,0.0


In [45]:
df_oceania_new['date'] = pd.to_datetime(df_oceania_new['date'].astype(str))
df_oceania_new['year'] = df_oceania_new['date'].dt.year
df_oceania_new['month'] = df_oceania_new['date'].dt.month
df_oceania_new['day'] = df_oceania_new['date'].dt.day
df_oceania_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed,year,month,day
0,266744.0,Oceania,0,2020-01-05,0.0,2020,1,5
1,266745.0,Oceania,0,2020-01-06,0.0,2020,1,6
2,266746.0,Oceania,0,2020-01-07,0.0,2020,1,7
3,266747.0,Oceania,0,2020-01-08,0.0,2020,1,8
4,266748.0,Oceania,0,2020-01-09,0.0,2020,1,9


In [46]:
df_oceania_new['total_deaths']=df_oceania_new['total_deaths'].astype('int') 
df_oceania_new['total_deaths']

0          0
1          0
2          0
3          0
4          0
       ...  
360    37427
361    37439
362    37451
363    37463
364    37475
Name: total_deaths, Length: 1941, dtype: int32

In [47]:
df_oceania_final = df_oceania_new[['location','total_deaths','date','year','month','day']].copy().reset_index()
df_oceania_final.head()

Unnamed: 0,index,location,total_deaths,date,year,month,day
0,0,Oceania,0,2020-01-05,2020,1,5
1,1,Oceania,0,2020-01-06,2020,1,6
2,2,Oceania,0,2020-01-07,2020,1,7
3,3,Oceania,0,2020-01-08,2020,1,8
4,4,Oceania,0,2020-01-09,2020,1,9


In [48]:
df_oceania_final.to_csv(r"C:\Users\DELL\OneDrive - VietNam National University - HCM INTERNATIONAL UNIVERSITY\Desktop\thesis\Covid-19-report\arima\data_training\oceania.csv", index=False)