In [53]:
import json

import math
import pandas as pd 
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.notebook import trange

In [54]:
ori_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv')

In [55]:
df = ori_df[['location','total_deaths','date']].copy()

In [56]:
df["total_deaths"] = df["total_deaths"].fillna(0)
df.head

<bound method NDFrame.head of            location  total_deaths        date
0       Afghanistan           0.0  2020-01-05
1       Afghanistan           0.0  2020-01-06
2       Afghanistan           0.0  2020-01-07
3       Afghanistan           0.0  2020-01-08
4       Afghanistan           0.0  2020-01-09
...             ...           ...         ...
401003     Zimbabwe        5740.0  2024-05-08
401004     Zimbabwe        5740.0  2024-05-09
401005     Zimbabwe        5740.0  2024-05-10
401006     Zimbabwe        5740.0  2024-05-11
401007     Zimbabwe        5740.0  2024-05-12

[401008 rows x 3 columns]>

In [57]:
asia=df.loc[df['location'] == 'Asia']
africa=df.loc[df['location'] == 'Africa'].reset_index()
europe=df.loc[df['location'] == 'Europe']
north_america=df.loc[df['location'] == 'North America']
ocenania=df.loc[df['location'] == 'Ocenania']
south_America=df.loc[df['location'] == 'South America']

In [58]:
new_df=pd.concat([asia,africa,europe,north_america,ocenania,south_America]).reset_index().drop(columns='index')

In [59]:
new_df.head()

Unnamed: 0,level_0,location,total_deaths,date
0,19084,Asia,0.0,2020-01-05
1,19085,Asia,0.0,2020-01-06
2,19086,Asia,0.0,2020-01-07
3,19087,Asia,0.0,2020-01-08
4,19088,Asia,0.0,2020-01-09


In [60]:
new_df['date']

0       2020-01-05
1       2020-01-06
2       2020-01-07
3       2020-01-08
4       2020-01-09
           ...    
7980    2024-05-08
7981    2024-05-09
7982    2024-05-10
7983    2024-05-11
7984    2024-05-12
Name: date, Length: 7985, dtype: object

In [61]:
new_df['year']=pd.to_datetime(new_df['date']).dt.year
new_df['month']=pd.to_datetime(new_df['date']).dt.month
new_df['day']=pd.to_datetime(new_df['date']).dt.day

In [62]:
africa['date'] = pd.to_datetime(new_df['date'])
dt_start = pd.to_datetime('2020-01-05')
dt_end = pd.to_datetime('2024-04-28')

africa=africa[africa['date'].between(dt_start, dt_end)]

In [63]:
africa.tail()

Unnamed: 0,index,location,total_deaths,date
1571,3161,Africa,259101.0,2024-04-24
1572,3162,Africa,259101.0,2024-04-25
1573,3163,Africa,259101.0,2024-04-26
1574,3164,Africa,259101.0,2024-04-27
1575,3165,Africa,259101.0,2024-04-28


In [64]:
africa.loc[:, 'total_deaths_processed'] = africa['total_deaths'].diff(periods=1)

africa.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,1590,Africa,0.0,2020-01-05,
1,1591,Africa,0.0,2020-01-06,0.0
2,1592,Africa,0.0,2020-01-07,0.0
3,1593,Africa,0.0,2020-01-08,0.0
4,1594,Africa,0.0,2020-01-09,0.0


In [65]:
history = africa.iloc[1:]['total_deaths_processed']\
    .reset_index()\
    .drop(columns='index')

history.head()

Unnamed: 0,total_deaths_processed
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## Start predicting 

In [66]:
period = 365

In [67]:
## 1. Train model up until most current day
model = ARIMA(history, order=(5,1,0))
model_fit = model.fit()

## 2. Predict in next 'period' days
yhat = model_fit.forecast(period)

yhat

1575   -5.046772e-32
1576   -5.583658e-32
1577   -5.612341e-32
1578   -5.656168e-32
1579   -5.125118e-32
            ...     
1935   -5.158618e-32
1936   -5.158618e-32
1937   -5.158618e-32
1938   -5.158618e-32
1939   -5.158618e-32
Name: predicted_mean, Length: 365, dtype: float64

### Add predicted values back to dataframe

In [68]:
last_date = africa['date'].tail(1).item()
last_date

Timestamp('2024-04-28 00:00:00')

In [69]:
africa_predicted = [
    {
        'location': 'Africa',
        'total_deaths_processed': x,
        'date': pd.Timedelta(i + 1, 'd') + pd.to_datetime(last_date)
    }
    for i, x in enumerate(yhat)
]
df_africa_pred = pd.DataFrame.from_records(africa_predicted)


In [70]:
df_africa_new = pd.concat([africa, df_africa_pred])

In [71]:

df_africa_new["date"]=pd.to_datetime(df_africa_new["date"]).dt.to_period(freq="D")


In [72]:
df_africa_new.tail(10)

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
355,,Africa,,2025-04-19,-5.158618e-32
356,,Africa,,2025-04-20,-5.158618e-32
357,,Africa,,2025-04-21,-5.158618e-32
358,,Africa,,2025-04-22,-5.158618e-32
359,,Africa,,2025-04-23,-5.158618e-32
360,,Africa,,2025-04-24,-5.158618e-32
361,,Africa,,2025-04-25,-5.158618e-32
362,,Africa,,2025-04-26,-5.158618e-32
363,,Africa,,2025-04-27,-5.158618e-32
364,,Africa,,2025-04-28,-5.158618e-32


In [73]:
total_death_begin = df_africa_new.iloc[0]['total_deaths'].item()
idx_begin = df_africa_new.index[0]

df_africa_new.loc[idx_begin, 'total_deaths_processed'] = total_death_begin

In [74]:
df_africa_new.loc[:, 'total_deaths'] = df_africa_new['total_deaths_processed'].cumsum()
df_africa_new['total_deaths'] = df_africa_new['total_deaths'].astype(int)
df_africa_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,1590.0,Africa,0,2020-01-05,0.0
1,1591.0,Africa,0,2020-01-06,0.0
2,1592.0,Africa,0,2020-01-07,0.0
3,1593.0,Africa,0,2020-01-08,0.0
4,1594.0,Africa,0,2020-01-09,0.0


In [75]:
df_africa_new['date'] = pd.to_datetime(df_africa_new['date'].astype(str))
df_africa_new['year'] = df_africa_new['date'].dt.year
df_africa_new['month'] = df_africa_new['date'].dt.month
df_africa_new['day'] = df_africa_new['date'].dt.day
df_africa_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed,year,month,day
0,1590.0,Africa,0,2020-01-05,0.0,2020,1,5
1,1591.0,Africa,0,2020-01-06,0.0,2020,1,6
2,1592.0,Africa,0,2020-01-07,0.0,2020,1,7
3,1593.0,Africa,0,2020-01-08,0.0,2020,1,8
4,1594.0,Africa,0,2020-01-09,0.0,2020,1,9


In [76]:
df_africa_new['total_deaths']=df_africa_new['total_deaths'].astype('int') 
df_africa_new['total_deaths']

0           0
1           0
2           0
3           0
4           0
        ...  
360    259101
361    259101
362    259101
363    259101
364    259101
Name: total_deaths, Length: 1941, dtype: int32

In [77]:
df_africa_final = df_africa_new[['location','total_deaths','date','year','month','day']].copy().reset_index()
df_africa_final.head()

Unnamed: 0,index,location,total_deaths,date,year,month,day
0,0,Africa,0,2020-01-05,2020,1,5
1,1,Africa,0,2020-01-06,2020,1,6
2,2,Africa,0,2020-01-07,2020,1,7
3,3,Africa,0,2020-01-08,2020,1,8
4,4,Africa,0,2020-01-09,2020,1,9


In [78]:
df_africa_final.to_csv(r"C:\Users\DELL\OneDrive - VietNam National University - HCM INTERNATIONAL UNIVERSITY\Desktop\thesis\Covid-19-report\arima\data_training\africa.csv", index=False)