In [71]:
import pandas as pd
from datetime import datetime, timedelta
from pymongo import MongoClient

def fetch_data_from_mongodb(collection, room_number='Room108', days_back=1):
    today = datetime.today().replace(hour=0, minute=0, second=0, microsecond=0)
    start_date = today - pd.DateOffset(days=days_back)
    cursor = collection.find(
        {'temperature': {'$exists': True, '$ne': None},
         'roomNumber': room_number, 'createdAt': {'$gte': start_date}},
        {'createdAt': 1, 'temperature': 1, '_id': 0}
    )
    data = list(cursor)
    df = pd.DataFrame(data)
    return df

In [72]:
# Connect to MongoDB Atlas
client = MongoClient(
    "<mongo_db_credentials>")
db = client['test']
collection = db['sensordatas']

# Fetch data from MongoDB
df = fetch_data_from_mongodb(
    collection, room_number='Room108', days_back=1)  # Adjust days_back as needed

df["createdAt"] = pd.to_datetime(df["createdAt"], format='%Y-%m-%d %H:%M:%S')
df["createdAt"] = df["createdAt"].apply(lambda x: x.replace(microsecond=0))
df

Unnamed: 0,temperature,createdAt
0,23.2,2024-01-11 00:24:40
1,23.2,2024-01-11 00:24:35
2,23.1,2024-01-11 00:24:30
3,23.2,2024-01-11 00:24:25
4,23.2,2024-01-11 00:24:20
...,...,...
3745,24.0,2024-01-10 18:59:54
3746,23.9,2024-01-10 18:59:49
3747,23.5,2024-01-10 18:59:44
3748,23.5,2024-01-10 18:59:21


In [73]:
df = df.drop_duplicates(subset=['createdAt'], keep='last').reset_index(drop=True)
df

Unnamed: 0,temperature,createdAt
0,23.2,2024-01-11 00:24:40
1,23.2,2024-01-11 00:24:35
2,23.1,2024-01-11 00:24:30
3,23.2,2024-01-11 00:24:25
4,23.2,2024-01-11 00:24:20
...,...,...
3728,24.0,2024-01-10 18:59:54
3729,23.9,2024-01-10 18:59:49
3730,23.5,2024-01-10 18:59:44
3731,23.5,2024-01-10 18:59:21


In [74]:
df = df.set_index('createdAt')
df = df.resample('2min').mean().reset_index()
df

Unnamed: 0,createdAt,temperature
0,2024-01-10 18:58:00,23.733333
1,2024-01-10 19:00:00,24.187500
2,2024-01-10 19:02:00,24.160000
3,2024-01-10 19:04:00,25.550000
4,2024-01-10 19:06:00,26.280952
...,...,...
159,2024-01-11 00:16:00,23.450000
160,2024-01-11 00:18:00,23.354167
161,2024-01-11 00:20:00,23.308333
162,2024-01-11 00:22:00,23.245833


In [75]:
df["temperature"] = df["temperature"].fillna(method='ffill')
df

Unnamed: 0,createdAt,temperature
0,2024-01-10 18:58:00,23.733333
1,2024-01-10 19:00:00,24.187500
2,2024-01-10 19:02:00,24.160000
3,2024-01-10 19:04:00,25.550000
4,2024-01-10 19:06:00,26.280952
...,...,...
159,2024-01-11 00:16:00,23.450000
160,2024-01-11 00:18:00,23.354167
161,2024-01-11 00:20:00,23.308333
162,2024-01-11 00:22:00,23.245833


In [76]:
df.index = pd.PeriodIndex(df["createdAt"], freq='2min')
df = df.drop(columns=['createdAt'])
df

Unnamed: 0_level_0,temperature
createdAt,Unnamed: 1_level_1
2024-01-10 18:58,23.733333
2024-01-10 19:00,24.187500
2024-01-10 19:02,24.160000
2024-01-10 19:04,25.550000
2024-01-10 19:06,26.280952
...,...
2024-01-11 00:16,23.450000
2024-01-11 00:18,23.354167
2024-01-11 00:20,23.308333
2024-01-11 00:22,23.245833


In [77]:
from pycaret.time_series import TSForecastingExperiment

exp_ts = TSForecastingExperiment()

exp_ts.setup(
    data=df,
    target="temperature",
    fh=5,
    fold_strategy='expanding',
    fold=4,
    session_id=42,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,temperature
2,Approach,Univariate
3,Exogenous Variables,Not Present
4,Original data shape,"(164, 1)"
5,Transformed data shape,"(164, 1)"
6,Transformed train set shape,"(159, 1)"
7,Transformed test set shape,"(5, 1)"
8,Rows with missing values,0.0%
9,Fold Generator,ExpandingWindowSplitter


<pycaret.time_series.forecasting.oop.TSForecastingExperiment at 0x1c6627fd3c0>

In [78]:
exp_ts.check_stats()

Unnamed: 0,Test,Test Name,Data,Property,Setting,Value
0,Summary,Statistics,Transformed,Length,,164.0
1,Summary,Statistics,Transformed,# Missing Values,,0.0
2,Summary,Statistics,Transformed,Mean,,24.324688
3,Summary,Statistics,Transformed,Median,,23.970833
4,Summary,Statistics,Transformed,Standard Deviation,,0.907174
5,Summary,Statistics,Transformed,Variance,,0.822965
6,Summary,Statistics,Transformed,Kurtosis,,2.254304
7,Summary,Statistics,Transformed,Skewness,,1.808602
8,Summary,Statistics,Transformed,# Distinct Values,,140.0
9,White Noise,Ljung-Box,Transformed,Test Statictic,"{'alpha': 0.05, 'K': 24}",714.031499


In [79]:
exp_ts.plot_model(plot="diagnostics")

In [80]:
best_models = exp_ts.compare_models(
    include=['prophet', 'arima', 'auto_arima', 'grand_means', 'naive', 'ets', 'naive', 'tbats'],
    n_select=5
)

#tuned_models = [exp_ts.tune_model(model) for model in best_models]
blended_model = exp_ts.blend_models(estimator_list=best_models)
exp_ts.plot_model(blended_model, plot="forecast")

ValueError: Estimator prophet Not Available. Please see docstring for list of available estimators.

In [81]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

df_train, df_test = train_test_split(df, test_size=0.2, shuffle=False)

In [82]:
from statsmodels.tsa.arima.model import ARIMA
from matplotlib import pyplot as plt

arima_model = ARIMA(df_train, order=([1, 15, 18], 0, 3))
arima_model_fit = arima_model.fit()
# summary of fit model
print(arima_model_fit.summary())
# line plot of residuals
residuals = pd.DataFrame(arima_model_fit.resid)
residuals.plot()
plt.show()
# density plot of residuals
residuals.plot(kind='kde')
plt.show()
# summary stats of residuals
print(residuals.describe())

                                  SARIMAX Results                                   
Dep. Variable:                  temperature   No. Observations:                  131
Model:             ARIMA([1, 15, 18], 0, 3)   Log Likelihood                 -20.172
Date:                      Thu, 11 Jan 2024   AIC                             56.345
Time:                              01:26:53   BIC                             79.347
Sample:                          01-10-2024   HQIC                            65.692
                               - 01-10-2024                                         
Covariance Type:                        opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         24.4512      0.286     85.528      0.000      23.891      25.012
ar.L1          0.8208      0.055     14.837      0.000       0.712       0.929
ar.L

In [83]:
preds = arima_model_fit.forecast(steps=df_test.shape[0])
preds

2024-01-10 23:20    23.842675
2024-01-10 23:22    23.967909
2024-01-10 23:24    24.098594
2024-01-10 23:26    24.210980
2024-01-10 23:28    24.305250
2024-01-10 23:30    24.386230
2024-01-10 23:32    24.456749
2024-01-10 23:34    24.515284
2024-01-10 23:36    24.560386
2024-01-10 23:38    24.589855
2024-01-10 23:40    24.606565
2024-01-10 23:42    24.629865
2024-01-10 23:44    24.659842
2024-01-10 23:46    24.682709
2024-01-10 23:48    24.695789
2024-01-10 23:50    24.714252
2024-01-10 23:52    24.731949
2024-01-10 23:54    24.747314
2024-01-10 23:56    24.752775
2024-01-10 23:58    24.746120
2024-01-11 00:00    24.728681
2024-01-11 00:02    24.704087
2024-01-11 00:04    24.675265
2024-01-11 00:06    24.644086
2024-01-11 00:08    24.611743
2024-01-11 00:10    24.579437
2024-01-11 00:12    24.548693
2024-01-11 00:14    24.520997
2024-01-11 00:16    24.496988
2024-01-11 00:18    24.475120
2024-01-11 00:20    24.454422
2024-01-11 00:22    24.435410
2024-01-11 00:24    24.418754
Freq: 2T, 

In [84]:
df_pred = pd.DataFrame(preds).reset_index(names="createdAt")
df_pred["createdAt"] = df_pred["createdAt"].apply(lambda x: x.to_timestamp())
df_pred

Unnamed: 0,createdAt,predicted_mean
0,2024-01-10 23:20:00,23.842675
1,2024-01-10 23:22:00,23.967909
2,2024-01-10 23:24:00,24.098594
3,2024-01-10 23:26:00,24.21098
4,2024-01-10 23:28:00,24.30525
5,2024-01-10 23:30:00,24.38623
6,2024-01-10 23:32:00,24.456749
7,2024-01-10 23:34:00,24.515284
8,2024-01-10 23:36:00,24.560386
9,2024-01-10 23:38:00,24.589855


In [85]:
df = df.reset_index()
df["createdAt"] = df["createdAt"].apply(lambda x: x.to_timestamp())
df

Unnamed: 0,createdAt,temperature
0,2024-01-10 18:58:00,23.733333
1,2024-01-10 19:00:00,24.187500
2,2024-01-10 19:02:00,24.160000
3,2024-01-10 19:04:00,25.550000
4,2024-01-10 19:06:00,26.280952
...,...,...
159,2024-01-11 00:16:00,23.450000
160,2024-01-11 00:18:00,23.354167
161,2024-01-11 00:20:00,23.308333
162,2024-01-11 00:22:00,23.245833


In [86]:
import plotly.graph_objects as go

fig = go.Figure()

fig.add_trace(go.Scatter(
    x=df['createdAt'],
    y=df['temperature'],
    name="Actual",
    mode='lines',
    marker=dict(color="blue"),
))

# add vertical line at the beginning of the start date of df_pred
fig.add_vline(x=df_pred['createdAt'][0], line_width=3, line_dash="dash", line_color="red")
fig.add_trace(go.Scatter(
    x=df_pred['createdAt'],
    y=df_pred['predicted_mean'],
    name="Predicted",
    mode='lines',
    marker=dict(color="red"),
))
fig.show()