In [1]:
import json

import math
import pandas as pd 
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.notebook import trange

In [2]:
ori_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv',encoding='unicode-escape')

In [3]:
df = ori_df[['location','total_deaths','date']].copy()

In [4]:
df["total_deaths"] = df["total_deaths"].fillna(0)
df.head

<bound method NDFrame.head of            location  total_deaths        date
0       Afghanistan           0.0  2020-01-05
1       Afghanistan           0.0  2020-01-06
2       Afghanistan           0.0  2020-01-07
3       Afghanistan           0.0  2020-01-08
4       Afghanistan           0.0  2020-01-09
...             ...           ...         ...
401003     Zimbabwe        5740.0  2024-05-08
401004     Zimbabwe        5740.0  2024-05-09
401005     Zimbabwe        5740.0  2024-05-10
401006     Zimbabwe        5740.0  2024-05-11
401007     Zimbabwe        5740.0  2024-05-12

[401008 rows x 3 columns]>

In [5]:
asia=df.loc[df['location'] == 'Asia']
africa=df.loc[df['location'] == 'Africa']
europe=df.loc[df['location'] == 'Europe']
north_america=df.loc[df['location'] == 'North America']
ocenania=df.loc[df['location'] == 'Ocenania']
south_america=df.loc[df['location'] == 'South America'].reset_index()

In [6]:
new_df=pd.concat([asia,africa,europe,north_america,ocenania,south_america]).reset_index().drop(columns='index')

In [7]:
new_df.head()

Unnamed: 0,level_0,location,total_deaths,date
0,19084,Asia,0.0,2020-01-05
1,19085,Asia,0.0,2020-01-06
2,19086,Asia,0.0,2020-01-07
3,19087,Asia,0.0,2020-01-08
4,19088,Asia,0.0,2020-01-09


In [8]:
new_df['date']

0       2020-01-05
1       2020-01-06
2       2020-01-07
3       2020-01-08
4       2020-01-09
           ...    
7980    2024-05-08
7981    2024-05-09
7982    2024-05-10
7983    2024-05-11
7984    2024-05-12
Name: date, Length: 7985, dtype: object

In [9]:
new_df['year']=pd.to_datetime(new_df['date']).dt.year
new_df['month']=pd.to_datetime(new_df['date']).dt.month
new_df['day']=pd.to_datetime(new_df['date']).dt.day

In [10]:
south_america['date'] = pd.to_datetime(new_df['date'])
dt_start = pd.to_datetime('2020-01-05')
dt_end = pd.to_datetime('2024-04-28')

south_america=south_america[south_america['date'].between(dt_start, dt_end)]

In [11]:
south_america.tail()

Unnamed: 0,index,location,total_deaths,date
1571,334815,South America,1354116.0,2024-04-24
1572,334816,South America,1354116.0,2024-04-25
1573,334817,South America,1354116.0,2024-04-26
1574,334818,South America,1354116.0,2024-04-27
1575,334819,South America,1354133.0,2024-04-28


In [12]:
south_america.loc[:, 'total_deaths_processed'] = south_america['total_deaths'].diff(periods=1)

south_america.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,333244,South America,0.0,2020-01-05,
1,333245,South America,0.0,2020-01-06,0.0
2,333246,South America,0.0,2020-01-07,0.0
3,333247,South America,0.0,2020-01-08,0.0
4,333248,South America,0.0,2020-01-09,0.0


In [13]:
history = south_america.iloc[1:]['total_deaths_processed']\
    .reset_index()\
    .drop(columns='index')

history.head()

Unnamed: 0,total_deaths_processed
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## Start predicting 

In [14]:
period = 365

In [15]:
## 1. Train model up until most current day
model = ARIMA(history, order=(5,1,0))
model_fit = model.fit()

## 2. Predict in next 'period' days
yhat = model_fit.forecast(period)

yhat

1575    2.837571
1576    3.312847
1577    3.862718
1578    4.503755
1579    5.251299
          ...   
1935    4.860901
1936    4.860901
1937    4.860901
1938    4.860901
1939    4.860901
Name: predicted_mean, Length: 365, dtype: float64

### Add predicted values back to dataframe

In [16]:
last_date = south_america['date'].tail(1).item()
last_date

Timestamp('2024-04-28 00:00:00')

In [17]:
south_america_predicted = [
    {
        'location': 'South America',
        'total_deaths_processed': x,
        'date': pd.Timedelta(i + 1, 'd') + pd.to_datetime(last_date)
    }
    for i, x in enumerate(yhat)
]
df_south_america_pred = pd.DataFrame.from_records(south_america_predicted)
df_south_america_pred.head()

Unnamed: 0,location,total_deaths_processed,date
0,South America,2.837571,2024-04-29
1,South America,3.312847,2024-04-30
2,South America,3.862718,2024-05-01
3,South America,4.503755,2024-05-02
4,South America,5.251299,2024-05-03


In [18]:
df_south_america_new = pd.concat([south_america, df_south_america_pred])
df_south_america_new["date"]=pd.to_datetime(df_south_america_new["date"]).dt.to_period(freq="D")
df_south_america_new.tail(10)

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
355,,South America,,2025-04-19,4.860901
356,,South America,,2025-04-20,4.860901
357,,South America,,2025-04-21,4.860901
358,,South America,,2025-04-22,4.860901
359,,South America,,2025-04-23,4.860901
360,,South America,,2025-04-24,4.860901
361,,South America,,2025-04-25,4.860901
362,,South America,,2025-04-26,4.860901
363,,South America,,2025-04-27,4.860901
364,,South America,,2025-04-28,4.860901


In [19]:
total_death_begin = df_south_america_new.iloc[0]['total_deaths'].item()
idx_begin = df_south_america_new.index[0]

df_south_america_new.loc[idx_begin, 'total_deaths_processed'] = total_death_begin

In [20]:
df_south_america_new.loc[:, 'total_deaths'] = df_south_america_new['total_deaths_processed'].cumsum()
df_south_america_new['total_deaths'] = df_south_america_new['total_deaths'].astype(int)
df_south_america_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,333244.0,South America,0,2020-01-05,0.0
1,333245.0,South America,0,2020-01-06,0.0
2,333246.0,South America,0,2020-01-07,0.0
3,333247.0,South America,0,2020-01-08,0.0
4,333248.0,South America,0,2020-01-09,0.0


In [21]:
df_south_america_new['date'] = pd.to_datetime(df_south_america_new['date'].astype(str))
df_south_america_new['year'] = df_south_america_new['date'].dt.year
df_south_america_new['month'] = df_south_america_new['date'].dt.month
df_south_america_new['day'] = df_south_america_new['date'].dt.day
df_south_america_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed,year,month,day
0,333244.0,South America,0,2020-01-05,0.0,2020,1,5
1,333245.0,South America,0,2020-01-06,0.0,2020,1,6
2,333246.0,South America,0,2020-01-07,0.0,2020,1,7
3,333247.0,South America,0,2020-01-08,0.0,2020,1,8
4,333248.0,South America,0,2020-01-09,0.0,2020,1,9


In [22]:
df_south_america_new['total_deaths']=df_south_america_new['total_deaths'].astype('int') 
df_south_america_new['total_deaths']

0            0
1            0
2            0
3            0
4            0
        ...   
360    1355880
361    1355885
362    1355890
363    1355895
364    1355900
Name: total_deaths, Length: 1941, dtype: int32

In [23]:
df_south_america_final = df_south_america_new[['location','total_deaths','date','year','month','day']].copy().reset_index()
df_south_america_final.head()

Unnamed: 0,index,location,total_deaths,date,year,month,day
0,0,South America,0,2020-01-05,2020,1,5
1,1,South America,0,2020-01-06,2020,1,6
2,2,South America,0,2020-01-07,2020,1,7
3,3,South America,0,2020-01-08,2020,1,8
4,4,South America,0,2020-01-09,2020,1,9


In [24]:
df_south_america_final.to_csv(r"C:\Users\DELL\OneDrive - VietNam National University - HCM INTERNATIONAL UNIVERSITY\Desktop\thesis\Covid-19-report\arima\data_training\south_america.csv", index=False)