In [1]:
%pip install pycaret

Collecting pycaret
  Downloading pycaret-3.2.0-py3-none-any.whl (484 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m484.7/484.7 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting category-encoders>=2.4.0 (from pycaret)
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
Collecting deprecation>=2.1.0 (from pycaret)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl (11 kB)
Collecting kaleido>=0.2.1 (from pycaret)
  Downloading kaleido-0.2.1-py2.py3-none-manylinux1_x86_64.whl (79.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.9/79.9 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Collecting matplotlib<=3.6,>=3.3.0 (from pycaret)
  Downloading matplotlib-3.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.8/11

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot
import plotly.graph_objects as go
from pycaret.datasets import get_data
from pycaret.time_series import TSForecastingExperiment

In [5]:
df_deaths_us = pd.read_csv('/content/drive/MyDrive/Covid19/time_series_covid19_deaths_US.csv')
df_confirmed_global = pd.read_csv('/content/drive/MyDrive/Covid19/time_series_covid19_confirmed_global.csv')
df_confirmed_us = pd.read_csv('/content/drive/MyDrive/Covid19/time_series_covid19_confirmed_US.csv')
df_recovered_global = pd.read_csv('/content/drive/MyDrive/Covid19/time_series_covid19_recovered_global.csv')

In [6]:
print("Deaths in US shape", df_deaths_us.shape)
print(df_confirmed_global.shape)
print(df_confirmed_us.shape)

Deaths in US shape (3342, 1155)
(289, 1147)
(3342, 1154)


In [7]:
df_c_us = df_confirmed_us.iloc[1,0:10].to_frame()
df_c_us

Unnamed: 0,1
UID,84001003
iso2,US
iso3,USA
code3,840
FIPS,1003.0
Admin2,Baldwin
Province_State,Alabama
Country_Region,US
Lat,30.72775
Long_,-87.722071


In [8]:
df_1 = df_confirmed_us.iloc[1,11:].to_frame().reset_index()
df_1.rename(columns = {'index':'time',1:'cases'}, inplace=True)
df_1['time'] = pd.to_datetime(df_1['time'], format='%m/%d/%y')
df_1['cases'] = df_1['cases'].astype(float)
df_1.set_index('time')

Unnamed: 0_level_0,cases
time,Unnamed: 1_level_1
2020-01-22,0.0
2020-01-23,0.0
2020-01-24,0.0
2020-01-25,0.0
2020-01-26,0.0
...,...
2023-03-05,69767.0
2023-03-06,69767.0
2023-03-07,69767.0
2023-03-08,69860.0


In [9]:
fig = go.Figure([go.Scatter(x = df_1['time'], y = df_1['cases'])])
fig.show()

In [10]:
# We want to forecast the next 12 months of data and we will use 3 fold cross-validation to test the models.
fh = 12 # or alternately fh = np.arange(1,13)
fold = 3

fig_kwargs = {
    # "renderer": "notebook",
    "renderer": "png",
    "width": 1000,
    "height": 600,
}

eda = TSForecastingExperiment()
eda.setup(data=df_1, target = 'cases', index='time')

Unnamed: 0,Description,Value
0,session_id,6036
1,Target,cases
2,Approach,Univariate
3,Exogenous Variables,Not Present
4,Original data shape,"(1143, 1)"
5,Transformed data shape,"(1143, 1)"
6,Transformed train set shape,"(1142, 1)"
7,Transformed test set shape,"(1, 1)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x7ca6df91f550>

In [10]:
eda.plot_model()

In [11]:
# ACF and PACF for the original dataset
eda.plot_model(plot="acf")

In [12]:
eda.plot_model(plot="pacf", data_kwargs={'nlags':36}, fig_kwargs={'height': 500, "width": 800})

In [13]:
eda.plot_model(plot="periodogram")
eda.plot_model(plot="fft")

In [14]:
eda.plot_model(plot="diagnostics", fig_kwargs={"height": 800, "width": 1000})


In [11]:
eda.plot_model(plot="decomp", data_kwargs={'seasonal_period': 7}, fig_kwargs={"height": 500})

In [16]:
# Get the 3 best baseline models
best_baseline_models = eda.compare_models(n_select=1)
best_baseline_models

Unnamed: 0,Model,MASE,RMSSE,MAE,RMSE,MAPE,SMAPE,TT (Sec)
auto_arima,Auto ARIMA,0.0214,0.0126,9.1999,9.1999,0.0001,0.0001,292.3133
knn_cds_dt,K Neighbors w/ Cond. Deseasonalize & Detrending,0.032,0.0189,13.7869,13.7869,0.0002,0.0002,0.1833
arima,ARIMA,0.0415,0.0245,17.8943,17.8943,0.0003,0.0003,0.1833
exp_smooth,Exponential Smoothing,0.046,0.0272,19.8392,19.8392,0.0003,0.0003,1.6067
et_cds_dt,Extra Trees w/ Cond. Deseasonalize & Detrending,0.0488,0.0288,21.0002,21.0002,0.0003,0.0003,0.6233
lightgbm_cds_dt,Light Gradient Boosting w/ Cond. Deseasonalize & Detrending,0.0579,0.0342,24.9409,24.9409,0.0004,0.0004,0.4233
huber_cds_dt,Huber w/ Cond. Deseasonalize & Detrending,0.0654,0.0386,28.1521,28.1521,0.0004,0.0004,0.14
ridge_cds_dt,Ridge w/ Cond. Deseasonalize & Detrending,0.0786,0.0464,33.8462,33.8462,0.0005,0.0005,0.11
llar_cds_dt,Lasso Least Angular Regressor w/ Cond. Deseasonalize & Detrending,0.0786,0.0464,33.8505,33.8505,0.0005,0.0005,0.1067
lr_cds_dt,Linear w/ Cond. Deseasonalize & Detrending,0.0786,0.0464,33.8462,33.8462,0.0005,0.0005,0.2967


Processing:   0%|          | 0/117 [00:00<?, ?it/s]

In [17]:
df_t = TimeSeries.from_dataframe(df_1, time_col='time', value_cols='cases')
df_t

In [18]:
target = df_t['cases'][:1000]
# optionally, use past observed rainfall (pretending to be unknown beyond index 100)
past_cov = series['rain (mm)'][:100]
# optionally, use future temperatures (pretending this component is a forecast) >>>
future_cov = series['T (degC)'][:106]
# predict 6 pressure values using the 12 past values of pressure and rainfall, as well as the 6 temperature
# # values corresponding to the forecasted period >>>
model = XGBModel( lags=12,lags_past_covariates=12,lags_future_covariates=[0,1,2,3,4,5],output_chunk_length=6)
model.fit(target, past_covariates=past_cov, future_covariates=future_cov)
pred = model.predict(6)

NameError: ignored