In [29]:
import json

import math
import pandas as pd 
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.notebook import trange

In [30]:
ori_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv')

In [31]:
df = ori_df[['location','total_deaths','date']].copy()

In [32]:
df["total_deaths"] = df["total_deaths"].fillna(0)
df.head

<bound method NDFrame.head of            location  total_deaths        date
0       Afghanistan           0.0  2020-01-05
1       Afghanistan           0.0  2020-01-06
2       Afghanistan           0.0  2020-01-07
3       Afghanistan           0.0  2020-01-08
4       Afghanistan           0.0  2020-01-09
...             ...           ...         ...
401003     Zimbabwe        5740.0  2024-05-08
401004     Zimbabwe        5740.0  2024-05-09
401005     Zimbabwe        5740.0  2024-05-10
401006     Zimbabwe        5740.0  2024-05-11
401007     Zimbabwe        5740.0  2024-05-12

[401008 rows x 3 columns]>

In [33]:
asia=df.loc[df['location'] == 'Asia'].reset_index()
africa=df.loc[df['location'] == 'Africa'].reset_index()
europe=df.loc[df['location'] == 'Europe'].reset_index()
north_america=df.loc[df['location'] == 'North America']
ocenania=df.loc[df['location'] == 'Ocenania']
south_America=df.loc[df['location'] == 'South America']

In [34]:
new_df=pd.concat([europe,africa,europe,north_america,ocenania,south_America]).reset_index().drop(columns='index')

In [35]:
new_df.head()

Unnamed: 0,level_0,location,total_deaths,date
0,0,Europe,3.0,2020-01-05
1,1,Europe,3.0,2020-01-06
2,2,Europe,3.0,2020-01-07
3,3,Europe,3.0,2020-01-08
4,4,Europe,3.0,2020-01-09


In [36]:
new_df['date']

0       2020-01-05
1       2020-01-06
2       2020-01-07
3       2020-01-08
4       2020-01-09
           ...    
7979    2024-05-08
7980    2024-05-09
7981    2024-05-10
7982    2024-05-11
7983    2024-05-12
Name: date, Length: 7984, dtype: object

In [37]:
new_df['year']=pd.to_datetime(new_df['date']).dt.year
new_df['month']=pd.to_datetime(new_df['date']).dt.month
new_df['day']=pd.to_datetime(new_df['date']).dt.day

In [38]:
europe['date'] = pd.to_datetime(new_df['date'])
dt_start = pd.to_datetime('2020-01-05')
dt_end = pd.to_datetime('2024-04-28')

europe=europe[europe['date'].between(dt_start, dt_end)]

In [39]:
europe.tail()

Unnamed: 0,index,location,total_deaths,date
1571,112708,Europe,2100361.0,2024-04-24
1572,112709,Europe,2100361.0,2024-04-25
1573,112710,Europe,2100361.0,2024-04-26
1574,112711,Europe,2100361.0,2024-04-27
1575,112712,Europe,2100451.0,2024-04-28


In [40]:
europe.loc[:, 'total_deaths_processed'] = europe['total_deaths'].diff(periods=1)

europe.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,111137,Europe,3.0,2020-01-05,
1,111138,Europe,3.0,2020-01-06,0.0
2,111139,Europe,3.0,2020-01-07,0.0
3,111140,Europe,3.0,2020-01-08,0.0
4,111141,Europe,3.0,2020-01-09,0.0


In [41]:
history = europe.iloc[1:]['total_deaths_processed']\
    .reset_index()\
    .drop(columns='index')

history.head()

Unnamed: 0,total_deaths_processed
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


## Start predicting 

In [42]:
period = 365

In [43]:
## 1. Train model up until most current day
model = ARIMA(history, order=(5,1,0))
model_fit = model.fit()

## 2. Predict in next 'period' days
yhat = model_fit.forecast(period)

yhat

1575    15.021972
1576    17.538544
1577    20.449632
1578    23.843336
1579    27.800896
          ...    
1935    25.734070
1936    25.734070
1937    25.734070
1938    25.734070
1939    25.734070
Name: predicted_mean, Length: 365, dtype: float64

### Add predicted values back to dataframe

In [44]:
last_date = europe['date'].tail(1).item()
last_date

Timestamp('2024-04-28 00:00:00')

In [45]:
europe_predicted = [
    {
        'location': 'Europe',
        'total_deaths_processed': x,
        'date': pd.Timedelta(i + 1, 'd') + pd.to_datetime(last_date)
    }
    for i, x in enumerate(yhat)
]
df_europe_pred = pd.DataFrame.from_records(europe_predicted)


In [46]:
df_europe_new = pd.concat([europe, df_europe_pred])

In [47]:

df_europe_new["date"]=pd.to_datetime(df_europe_new["date"]).dt.to_period(freq="D")


In [48]:
df_europe_new.tail(10)

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
355,,Europe,,2025-04-19,25.73407
356,,Europe,,2025-04-20,25.73407
357,,Europe,,2025-04-21,25.73407
358,,Europe,,2025-04-22,25.73407
359,,Europe,,2025-04-23,25.73407
360,,Europe,,2025-04-24,25.73407
361,,Europe,,2025-04-25,25.73407
362,,Europe,,2025-04-26,25.73407
363,,Europe,,2025-04-27,25.73407
364,,Europe,,2025-04-28,25.73407


In [49]:
total_death_begin = df_europe_new.iloc[0]['total_deaths'].item()
idx_begin = df_europe_new.index[0]

df_europe_new.loc[idx_begin, 'total_deaths_processed'] = total_death_begin

In [50]:
df_europe_new.loc[:, 'total_deaths'] = df_europe_new['total_deaths_processed'].cumsum()
df_europe_new['total_deaths'] = df_europe_new['total_deaths'].astype(int)
df_europe_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed
0,111137.0,Europe,3,2020-01-05,3.0
1,111138.0,Europe,3,2020-01-06,0.0
2,111139.0,Europe,3,2020-01-07,0.0
3,111140.0,Europe,3,2020-01-08,0.0
4,111141.0,Europe,3,2020-01-09,0.0


In [51]:
df_europe_new['date'] = pd.to_datetime(df_europe_new['date'].astype(str))
df_europe_new['year'] = df_europe_new['date'].dt.year
df_europe_new['month'] = df_europe_new['date'].dt.month
df_europe_new['day'] = df_europe_new['date'].dt.day
df_europe_new.head()

Unnamed: 0,index,location,total_deaths,date,total_deaths_processed,year,month,day
0,111137.0,Europe,3,2020-01-05,3.0,2020,1,5
1,111138.0,Europe,3,2020-01-06,0.0,2020,1,6
2,111139.0,Europe,3,2020-01-07,0.0,2020,1,7
3,111140.0,Europe,3,2020-01-08,0.0,2020,1,8
4,111141.0,Europe,3,2020-01-09,0.0,2020,1,9


In [52]:
df_europe_new['total_deaths']=df_europe_new['total_deaths'].astype('int') 
df_europe_new['total_deaths']

0            3
1            3
2            3
3            3
4            3
        ...   
360    2109707
361    2109733
362    2109759
363    2109784
364    2109810
Name: total_deaths, Length: 1941, dtype: int32

In [53]:
df_europe_final = df_europe_new[['location','total_deaths','date','year','month','day']].copy().reset_index()
df_europe_final.head()

Unnamed: 0,index,location,total_deaths,date,year,month,day
0,0,Europe,3,2020-01-05,2020,1,5
1,1,Europe,3,2020-01-06,2020,1,6
2,2,Europe,3,2020-01-07,2020,1,7
3,3,Europe,3,2020-01-08,2020,1,8
4,4,Europe,3,2020-01-09,2020,1,9


In [54]:
df_europe_final['total_deaths']

0             3
1             3
2             3
3             3
4             3
         ...   
1936    2109707
1937    2109733
1938    2109759
1939    2109784
1940    2109810
Name: total_deaths, Length: 1941, dtype: int32

In [55]:
df_europe_final.to_csv(r"C:\Users\DELL\OneDrive - VietNam National University - HCM INTERNATIONAL UNIVERSITY\Desktop\thesis\Covid-19-report\arima\data_training\europe.csv", index=False)