In [4]:
import json

import math
import pandas as pd 
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error, mean_absolute_error
from tqdm.notebook import trange

In [5]:
ori_df = pd.read_csv('https://raw.githubusercontent.com/owid/covid-19-data/master/public/data/owid-covid-data.csv')

In [6]:
ori_df.to_csv('ori.csv')

In [7]:
df = ori_df[['location','total_deaths','date']].copy()

In [8]:
df["total_deaths"] = df["total_deaths"].fillna(0)
df.head

<bound method NDFrame.head of            location  total_deaths        date
0       Afghanistan           0.0  2020-01-05
1       Afghanistan           0.0  2020-01-06
2       Afghanistan           0.0  2020-01-07
3       Afghanistan           0.0  2020-01-08
4       Afghanistan           0.0  2020-01-09
...             ...           ...         ...
397672     Zimbabwe        5740.0  2024-04-24
397673     Zimbabwe        5740.0  2024-04-25
397674     Zimbabwe        5740.0  2024-04-26
397675     Zimbabwe        5740.0  2024-04-27
397676     Zimbabwe        5740.0  2024-04-28

[397677 rows x 3 columns]>

In [9]:
asia=df.loc[df['location'] == 'Asia']
africa=df.loc[df['location'] == 'Africa']
europe=df.loc[df['location'] == 'Europe']
north_america=df.loc[df['location'] == 'North America']
ocenania=df.loc[df['location'] == 'Ocenania']
south_America=df.loc[df['location'] == 'South America']

In [10]:
new_df=pd.concat([asia,africa,europe,north_america,ocenania,south_America]).reset_index().drop(columns='index')

In [11]:
new_df.head()

Unnamed: 0,location,total_deaths,date
0,Asia,0.0,2020-01-05
1,Asia,0.0,2020-01-06
2,Asia,0.0,2020-01-07
3,Asia,0.0,2020-01-08
4,Asia,0.0,2020-01-09


In [12]:
new_df['date']

0       2020-01-05
1       2020-01-06
2       2020-01-07
3       2020-01-08
4       2020-01-09
           ...    
7925    2024-04-24
7926    2024-04-25
7927    2024-04-26
7928    2024-04-27
7929    2024-04-28
Name: date, Length: 7930, dtype: object

In [13]:
new_df['year']=pd.to_datetime(new_df['date']).dt.year
new_df['month']=pd.to_datetime(new_df['date']).dt.month
new_df['day']=pd.to_datetime(new_df['date']).dt.day

In [None]:
asia['date'] = pd.to_datetime(df['date'])
dt_start = pd.to_datetime('2020-01-05')
dt_end = pd.to_datetime('2024-04-28')

asia=asia[asia['date'].between(dt_start, dt_end)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  asia['date'] = pd.to_datetime(df['date'])


In [15]:
asia.tail()

Unnamed: 0,location,total_deaths,date
20487,Asia,1636895.0,2024-04-24
20488,Asia,1636895.0,2024-04-25
20489,Asia,1636895.0,2024-04-26
20490,Asia,1636895.0,2024-04-27
20491,Asia,1636914.0,2024-04-28


## Start predicting next 5 days

In [17]:
period = 1

In [18]:
## 1. Train model up until most current day
model = ARIMA(history, order=(1,1,0))
model_fit = model.fit()

## 2. Predict in next 'period' days
yhat = model_fit.forecast(period)

### Add predicted values back to dataframe

In [19]:
last_date = asia['date'].tail(1).item()
last_date

Timestamp('2024-04-28 00:00:00')

In [20]:
asia_predicted = [
    {
        'location': 'Asia',
        'total_deaths': x,
        'date': pd.Timedelta(i + 1, 'd') + pd.to_datetime(last_date)
    }
    for i, x in enumerate(yhat)
]
df_asia_pred = pd.DataFrame.from_records(asia_predicted)
df_asia_pred.head()

Unnamed: 0,location,total_deaths,date
0,Asia,1636914.0,2024-04-29


In [21]:
df_asia_new = pd.concat([asia, df_asia_pred])
df_asia_new["date"]=pd.to_datetime(df_asia_new["date"]).dt.to_period(freq="D")
df_asia_new.tail(10)

Unnamed: 0,location,total_deaths,date
20483,Asia,1636867.0,2024-04-20
20484,Asia,1636895.0,2024-04-21
20485,Asia,1636895.0,2024-04-22
20486,Asia,1636895.0,2024-04-23
20487,Asia,1636895.0,2024-04-24
20488,Asia,1636895.0,2024-04-25
20489,Asia,1636895.0,2024-04-26
20490,Asia,1636895.0,2024-04-27
20491,Asia,1636914.0,2024-04-28
0,Asia,1636914.0,2024-04-29


In [22]:
df_asia_new['date'] = pd.to_datetime(df['date'])
df_asia_new['year'] = df_asia_new['date'].dt.year
df_asia_new['month'] = df_asia_new['date'].dt.month
df_asia_new['day'] = df_asia_new['date'].dt.day
df_asia_new

Unnamed: 0,location,total_deaths,date,year,month,day
20464,Asia,1636807.0,2024-04-01,2024,4,1
20465,Asia,1636807.0,2024-04-02,2024,4,2
20466,Asia,1636807.0,2024-04-03,2024,4,3
20467,Asia,1636807.0,2024-04-04,2024,4,4
20468,Asia,1636807.0,2024-04-05,2024,4,5
20469,Asia,1636807.0,2024-04-06,2024,4,6
20470,Asia,1636842.0,2024-04-07,2024,4,7
20471,Asia,1636842.0,2024-04-08,2024,4,8
20472,Asia,1636842.0,2024-04-09,2024,4,9
20473,Asia,1636842.0,2024-04-10,2024,4,10


In [23]:
df_asia_new['total_deaths']=df_asia_new['total_deaths'].astype('int') 
df_asia_new['total_deaths']

20464    1636807
20465    1636807
20466    1636807
20467    1636807
20468    1636807
20469    1636807
20470    1636842
20471    1636842
20472    1636842
20473    1636842
20474    1636842
20475    1636842
20476    1636842
20477    1636867
20478    1636867
20479    1636867
20480    1636867
20481    1636867
20482    1636867
20483    1636867
20484    1636895
20485    1636895
20486    1636895
20487    1636895
20488    1636895
20489    1636895
20490    1636895
20491    1636914
0        1636914
Name: total_deaths, dtype: int32

In [48]:
df_test = pd.DataFrame()
df_test['ds'] = asia['date']
df_test['y'] = asia['total_deaths']
# df_test.set_index('ds', inplace=True)
df_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 28 entries, 20464 to 20491
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype         
---  ------  --------------  -----         
 0   ds      28 non-null     datetime64[ns]
 1   y       28 non-null     float64       
dtypes: datetime64[ns](1), float64(1)
memory usage: 672.0 bytes


In [51]:
from prophet import Prophet
m = Prophet()
m.fit(df_test)
future = m.make_future_dataframe(periods=365)
future.tail()
forecast = m.predict(future)

# forecast[['date', 'yhat', 'yhat_lower', 'yhat_upper']].tail()

02:37:59 - cmdstanpy - INFO - Chain [1] start processing
02:38:02 - cmdstanpy - INFO - Chain [1] done processing


NameError: name 'fore' is not defined

In [58]:
forecast['yhat']=forecast['yhat'].astype('int') 
forecast.tail()

Unnamed: 0,ds,trend,yhat_lower,yhat_upper,trend_lower,trend_upper,additive_terms,additive_terms_lower,additive_terms_upper,weekly,weekly_lower,weekly_upper,multiplicative_terms,multiplicative_terms_lower,multiplicative_terms_upper,yhat
388,2025-04-24,1663197.0,739686.822896,2617503.0,739689.537171,2617506.0,-2.714275,-2.714275,-2.714275,-2.714275,-2.714275,-2.714275,0.0,0.0,0.0,1663194
389,2025-04-25,1663269.0,736328.999804,2623533.0,736403.964698,2623608.0,-74.964894,-74.964894,-74.964894,-74.964894,-74.964894,-74.964894,0.0,0.0,0.0,1663194
390,2025-04-26,1663341.0,733011.532342,2629563.0,733158.747146,2629711.0,-147.214804,-147.214804,-147.214804,-147.214804,-147.214804,-147.214804,0.0,0.0,0.0,1663194
391,2025-04-27,1663414.0,729702.069572,2635618.0,729902.533927,2635819.0,-200.464355,-200.464355,-200.464355,-200.464355,-200.464355,-200.464355,0.0,0.0,0.0,1663213
392,2025-04-28,1663486.0,726860.42811,2640394.0,726646.392332,2640180.0,214.035779,214.035779,214.035779,214.035779,214.035779,214.035779,0.0,0.0,0.0,1663700
