In [1]:
import numpy as np

In [2]:
!pip install skforecast
!pip install dask[dataframe]



In [3]:
import pandas as pd
import lightgbm as lgb
import joblib
import matplotlib.pyplot as plt
import warnings

plt.style.use('fivethirtyeight')
plt.rcParams['lines.linewidth'] = 1.5
plt.rcParams['font.size'] = 10

from skforecast.ForecasterAutoreg import ForecasterAutoreg
from skforecast.ForecasterAutoregCustom import ForecasterAutoregCustom
from skforecast.ForecasterAutoregDirect import ForecasterAutoregDirect
from skforecast.model_selection import grid_search_forecaster
from skforecast.model_selection import backtesting_forecaster
from skforecast.utils import save_forecaster
from skforecast.utils import load_forecaster
from skforecast.datasets import fetch_dataset

warnings.filterwarnings('once')

In [4]:
model = joblib.load('ML models/LightGBM_ML.pkl')

In [5]:
old_data = pd.read_csv('Fitabase Data 4.12.16-5.12.16/test_train_data.csv')
data = pd.read_csv('FitBit API Database/test_train_data_api_merged.csv')

print("Archivo : test_train_data.csv")
print(old_data)
print("\n")
print("Archivo : test_train_data_api_merged.csv")
print(data)



Archivo : test_train_data.csv
                Id                 Time   HeartRate  Intensity  Calories
0       2022484408  2016-04-12 07:21:00  101.600000          1   3.32064
1       2022484408  2016-04-12 07:22:00   87.888889          1   3.94326
2       2022484408  2016-04-12 07:23:00   58.000000          0   1.34901
3       2022484408  2016-04-12 07:24:00   58.000000          0   1.03770
4       2022484408  2016-04-12 07:25:00   56.777778          0   1.03770
...            ...                  ...         ...        ...       ...
333141  8877689391  2016-05-12 13:55:00   60.666667          0   1.33353
333142  8877689391  2016-05-12 13:56:00   61.875000          0   1.33353
333143  8877689391  2016-05-12 13:57:00   58.142857          0   1.33353
333144  8877689391  2016-05-12 13:58:00   61.200000          0   1.33353
333145  8877689391  2016-05-12 13:59:00   58.000000          0   1.33353

[333146 rows x 5 columns]


Archivo : test_train_data_api_merged.csv
           Id           

In [6]:
for column in data.columns:
    # Verificar si la columna tiene valores NaN
    if data[column].isna().any():
        # Realizar un dropna en la columna si hay valores NaN
        data = data.dropna(subset=[column])


In [7]:
#Comprobamos los valores NaN de cada columna
data.isna().sum()

Id           0
Time         0
Calories     0
Distance     0
Steps        0
HeartRate    0
dtype: int64

In [8]:
archivo_final = "FitBit API Database/test_train_data_api_merged.csv"
data = data.dropna(subset=['HeartRate'])
data = data.dropna(subset=['Steps'])

print(data)
data.to_csv(archivo_final, index=False)

           Id                 Time  Calories  Distance  Steps  HeartRate
21253  BSGSML  2024-02-15 18:13:00   1.30970       0.0    0.0       57.0
21254  BSGSML  2024-02-15 18:14:00   1.44067       0.0    0.0       58.0
21255  BSGSML  2024-02-15 18:15:00   1.44067       0.0    0.0       63.0
21256  BSGSML  2024-02-15 18:16:00   1.30970       0.0    0.0       58.0
21257  BSGSML  2024-02-15 18:17:00   1.44067       0.0    0.0       57.0
...       ...                  ...       ...       ...    ...        ...
83515  BSGSML  2024-03-29 23:55:00   1.57248       0.0    0.0       67.0
83516  BSGSML  2024-03-29 23:56:00   1.57248       0.0    0.0       67.0
83517  BSGSML  2024-03-29 23:57:00   1.57248       0.0    0.0       67.0
83518  BSGSML  2024-03-29 23:58:00   1.57248       0.0    0.0       66.0
83519  BSGSML  2024-03-29 23:59:00   1.57248       0.0    0.0       65.0

[61232 rows x 6 columns]


In [9]:
print(f'Número de filas con missing values: {data.isnull().any(axis=1).mean()}')

Número de filas con missing values: 0.0


In [10]:
data['Time'] = pd.to_datetime(data['Time'], format='%Y-%m-%d %H:%M:%S')
data = data.set_index('Time')
data = data.asfreq('60S')
data = data.sort_index()
data

Unnamed: 0_level_0,Id,Calories,Distance,Steps,HeartRate
Time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-02-15 18:13:00,BSGSML,1.30970,0.0,0.0,57.0
2024-02-15 18:14:00,BSGSML,1.44067,0.0,0.0,58.0
2024-02-15 18:15:00,BSGSML,1.44067,0.0,0.0,63.0
2024-02-15 18:16:00,BSGSML,1.30970,0.0,0.0,58.0
2024-02-15 18:17:00,BSGSML,1.44067,0.0,0.0,57.0
...,...,...,...,...,...
2024-03-29 23:55:00,BSGSML,1.57248,0.0,0.0,67.0
2024-03-29 23:56:00,BSGSML,1.57248,0.0,0.0,67.0
2024-03-29 23:57:00,BSGSML,1.57248,0.0,0.0,67.0
2024-03-29 23:58:00,BSGSML,1.57248,0.0,0.0,66.0


In [11]:
steps = 5
datos_train = data[:-steps]
datos_test  = data[-steps:]

print(
    f"Fechas train : {datos_train.index.min()} --- {datos_train.index.max()}  (n={len(datos_train)})"
)
print(
    f"Fechas test  : {datos_test.index.min()} --- {datos_test.index.max()}  (n={len(datos_test)})"
)

print(f'Número de filas con missing values : (datos_train) : {datos_train.isnull().any(axis=1).mean()}')
print(f'Número de filas con missing values:  (datos_test) : {datos_test.isnull().any(axis=1).mean()}')

Fechas train : 2024-02-15 18:13:00 --- 2024-03-29 23:54:00  (n=62262)
Fechas test  : 2024-03-29 23:55:00 --- 2024-03-29 23:59:00  (n=5)
Número de filas con missing values : (datos_train) : 0.01662330153223475
Número de filas con missing values:  (datos_test) : 0.0


In [12]:
for column in datos_train.columns:
    # Verificar si la columna tiene valores NaN
    if datos_train[column].isna().any():
        # Realizar un dropna en la columna si hay valores NaN
        datos_train = datos_train.dropna(subset=[column])

print(f'Número de filas con missing values en datos_train: {datos_train.isnull().any(axis=1).mean()}')
print(f'Número de filas con missing values en datos_test: {datos_test.isnull().any(axis=1).mean()}')

Número de filas con missing values en datos_train: 0.0
Número de filas con missing values en datos_test: 0.0


In [15]:
forecaster = ForecasterAutoreg(
                regressor = model,
                lags      = 7,
                differentiation = None
             )

forecaster.fit(y=datos_train['HeartRate'], exog=datos_train[['Calories','Steps','Distance']])
forecaster

ValueError: `y` has missing values.

In [None]:
predicciones = forecaster.predict(steps=5,  exog=datos_train[['Calories', 'Steps', 'Distance']])

In [None]:
fig, ax = plt.subplots(figsize=(6, 2.5))
datos_train['HeartRate'].plot(ax=ax, label='train')
datos_test['HeartRate'].plot(ax=ax, label='test')
predicciones.plot(ax=ax, label='predicciones')
ax.legend();