In [26]:
import json

import math
import pandas as pd 
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.notebook import trange

In [27]:
ori_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv',encoding='unicode-escape')

In [28]:
df = ori_df[['location','total_deaths','date']].copy()

In [29]:
df["total_deaths"] = df["total_deaths"].fillna(0)
df.head

<bound method NDFrame.head of            location  total_deaths        date
0       Afghanistan           0.0  2020-01-05
1       Afghanistan           0.0  2020-01-06
2       Afghanistan           0.0  2020-01-07
3       Afghanistan           0.0  2020-01-08
4       Afghanistan           0.0  2020-01-09
...             ...           ...         ...
401003     Zimbabwe        5740.0  2024-05-08
401004     Zimbabwe        5740.0  2024-05-09
401005     Zimbabwe        5740.0  2024-05-10
401006     Zimbabwe        5740.0  2024-05-11
401007     Zimbabwe        5740.0  2024-05-12

[401008 rows x 3 columns]>

In [30]:
asia=df.loc[df['location'] == 'Asia']
africa=df.loc[df['location'] == 'Africa']
europe=df.loc[df['location'] == 'Europe']
north_america=df.loc[df['location'] == 'North America'].reset_index()
ocenania=df.loc[df['location'] == 'Ocenania']
south_America=df.loc[df['location'] == 'South America']

In [31]:
new_df=pd.concat([asia,africa,europe,north_america,ocenania,south_America]).reset_index().drop(columns='index')

In [32]:
new_df.head()

Unnamed: 0,level_0,location,total_deaths,date
0,19084,Asia,0.0,2020-01-05
1,19085,Asia,0.0,2020-01-06
2,19086,Asia,0.0,2020-01-07
3,19087,Asia,0.0,2020-01-08
4,19088,Asia,0.0,2020-01-09


In [33]:
new_df['date']

0       2020-01-05
1       2020-01-06
2       2020-01-07
3       2020-01-08
4       2020-01-09
           ...    
7980    2024-05-08
7981    2024-05-09
7982    2024-05-10
7983    2024-05-11
7984    2024-05-12
Name: date, Length: 7985, dtype: object

In [34]:
new_df['year']=pd.to_datetime(new_df['date']).dt.year
new_df['month']=pd.to_datetime(new_df['date']).dt.month
new_df['day']=pd.to_datetime(new_df['date']).dt.day

In [35]:
north_america['date'] = pd.to_datetime(new_df['date'])
dt_start = pd.to_datetime('2020-01-05')
dt_end = pd.to_datetime('2024-04-28')

north_america=north_america[north_america['date'].between(dt_start, dt_end)]

In [36]:
north_america.tail()

Unnamed: 0,index,location,total_deaths,date
1571,258292,North America,1664527.0,2024-04-24
1572,258293,North America,1664527.0,2024-04-25
1573,258294,North America,1664527.0,2024-04-26
1574,258295,North America,1664527.0,2024-04-27
1575,258296,North America,1664870.0,2024-04-28


In [37]:
north_america.loc[:, 'total_deaths_processed'] = north_america['total_deaths'].diff(periods=1)

north_america.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,256721,North America,0.0,2020-01-05,
1,256722,North America,0.0,2020-01-06,0.0
2,256723,North America,0.0,2020-01-07,0.0
3,256724,North America,0.0,2020-01-08,0.0
4,256725,North America,0.0,2020-01-09,0.0


In [38]:
history = north_america.iloc[1:]['total_deaths_processed']\
    .reset_index()\
    .drop(columns='index')

history.head()

Unnamed: 0,total_deaths_processed
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## Start predicting 

In [39]:
period = 365

In [40]:
## 1. Train model up until most current day
model = ARIMA(history, order=(5,1,0))
model_fit = model.fit()

## 2. Predict in next 'period' days
yhat = model_fit.forecast(period)

yhat

1575     57.251874
1576     66.841519
1577     77.935978
1578     90.869836
1579    105.952620
           ...    
1935     98.075750
1936     98.075750
1937     98.075750
1938     98.075750
1939     98.075750
Name: predicted_mean, Length: 365, dtype: float64

### Add predicted values back to dataframe

In [41]:
last_date = north_america['date'].tail(1).item()
last_date

Timestamp('2024-04-28 00:00:00')

In [42]:
north_america_predicted = [
    {
        'location': 'North America',
        'total_deaths_processed': x,
        'date': pd.Timedelta(i + 1, 'd') + pd.to_datetime(last_date)
    }
    for i, x in enumerate(yhat)
]
df_north_america_pred = pd.DataFrame.from_records(north_america_predicted)
df_north_america_pred.head()

Unnamed: 0,location,total_deaths_processed,date
0,North America,57.251874,2024-04-29
1,North America,66.841519,2024-04-30
2,North America,77.935978,2024-05-01
3,North America,90.869836,2024-05-02
4,North America,105.95262,2024-05-03


In [43]:
df_north_america_new = pd.concat([north_america, df_north_america_pred])
df_north_america_new["date"]=pd.to_datetime(df_north_america_new["date"]).dt.to_period(freq="D")
df_north_america_new.tail(10)

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
355,,North America,,2025-04-19,98.07575
356,,North America,,2025-04-20,98.07575
357,,North America,,2025-04-21,98.07575
358,,North America,,2025-04-22,98.07575
359,,North America,,2025-04-23,98.07575
360,,North America,,2025-04-24,98.07575
361,,North America,,2025-04-25,98.07575
362,,North America,,2025-04-26,98.07575
363,,North America,,2025-04-27,98.07575
364,,North America,,2025-04-28,98.07575


In [44]:
total_death_begin = df_north_america_new.iloc[0]['total_deaths'].item()
idx_begin = df_north_america_new.index[0]

df_north_america_new.loc[idx_begin, 'total_deaths_processed'] = total_death_begin

In [45]:
df_north_america_new.loc[:, 'total_deaths'] = df_north_america_new['total_deaths_processed'].cumsum()
df_north_america_new['total_deaths'] = df_north_america_new['total_deaths'].astype(int)
df_north_america_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,256721.0,North America,0,2020-01-05,0.0
1,256722.0,North America,0,2020-01-06,0.0
2,256723.0,North America,0,2020-01-07,0.0
3,256724.0,North America,0,2020-01-08,0.0
4,256725.0,North America,0,2020-01-09,0.0


In [46]:
df_north_america_new['date'] = pd.to_datetime(df_north_america_new['date'].astype(str))
df_north_america_new['year'] = df_north_america_new['date'].dt.year
df_north_america_new['month'] = df_north_america_new['date'].dt.month
df_north_america_new['day'] = df_north_america_new['date'].dt.day
df_north_america_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed,year,month,day
0,256721.0,North America,0,2020-01-05,0.0,2020,1,5
1,256722.0,North America,0,2020-01-06,0.0,2020,1,6
2,256723.0,North America,0,2020-01-07,0.0,2020,1,7
3,256724.0,North America,0,2020-01-08,0.0,2020,1,8
4,256725.0,North America,0,2020-01-09,0.0,2020,1,9


In [47]:
df_north_america_new['total_deaths']=df_north_america_new['total_deaths'].astype('int') 
df_north_america_new['total_deaths']

0            0
1            0
2            0
3            0
4            0
        ...   
360    1700136
361    1700234
362    1700332
363    1700430
364    1700528
Name: total_deaths, Length: 1941, dtype: int32

In [48]:
df_north_america_new

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed,year,month,day
0,256721.0,North America,0,2020-01-05,0.00000,2020,1,5
1,256722.0,North America,0,2020-01-06,0.00000,2020,1,6
2,256723.0,North America,0,2020-01-07,0.00000,2020,1,7
3,256724.0,North America,0,2020-01-08,0.00000,2020,1,8
4,256725.0,North America,0,2020-01-09,0.00000,2020,1,9
...,...,...,...,...,...,...,...,...
360,,North America,1700136,2025-04-24,98.07575,2025,4,24
361,,North America,1700234,2025-04-25,98.07575,2025,4,25
362,,North America,1700332,2025-04-26,98.07575,2025,4,26
363,,North America,1700430,2025-04-27,98.07575,2025,4,27


In [49]:
df_north_america_final = df_north_america_new[['location','total_deaths','date','year','month','day']].copy().reset_index()
df_north_america_final.head()

Unnamed: 0,index,location,total_deaths,date,year,month,day
0,0,North America,0,2020-01-05,2020,1,5
1,1,North America,0,2020-01-06,2020,1,6
2,2,North America,0,2020-01-07,2020,1,7
3,3,North America,0,2020-01-08,2020,1,8
4,4,North America,0,2020-01-09,2020,1,9


In [50]:
df_north_america_final.to_csv(r"C:\Users\DELL\OneDrive - VietNam National University - HCM INTERNATIONAL UNIVERSITY\Desktop\thesis\Covid-19-report\arima\data_training\north_america.csv", index=False)