## 02 - Benchmark Model
- Use the previous 24 hours of data as the forecast

### Import packages and load the data

In [1]:
import datetime
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from tslearn.clustering import TimeSeriesKMeans

In [2]:
# Directory holding the processed input data. It can be overridden with the
# DSLAB1_DATA_INPUT_PATH environment variable so the notebook also runs on
# other machines; the default is the original local folder.
# os.path.join(..., '') guarantees the trailing separator that the later
# `data_input_path + <file name>` concatenation relies on.
data_input_path = os.path.join(
    os.environ.get(
        'DSLAB1_DATA_INPUT_PATH',
        '/Users/szejozsef00/Desktop/MSC/MSC 2. félév/DS Lab I/DSLAB1/data/processed/',
    ),
    '',
)

In [3]:
# Directory that receives the prediction files. It can be overridden with the
# DSLAB1_PREDICTION_OUTPUT_PATH environment variable so the notebook also runs
# on other machines; the default is the original local folder.
# os.path.join(..., '') guarantees the trailing separator that the later
# `prediction_output_path + <file name>` concatenation relies on.
prediction_output_path = os.path.join(
    os.environ.get(
        'DSLAB1_PREDICTION_OUTPUT_PATH',
        '/Users/szejozsef00/Desktop/MSC/MSC 2. félév/DS Lab I/DSLAB1/data/predictions/',
    ),
    '',
)

In [4]:
# Read the processed CSV (semicolon separated, decimal comma), parse the
# DATETIME column into timestamps and replace every missing value with 0.
df = (
    pd.read_csv(data_input_path + 'processed_data.csv', sep=';', decimal=',')
    .assign(DATETIME=lambda raw: pd.to_datetime(raw['DATETIME']))
    .fillna(0)
)

In [5]:
# Derive a calendar-date column (datetime.date objects) from the DATETIME
# timestamps; it is appended as the last column.
df = df.assign(DATE=pd.to_datetime(df['DATETIME']).dt.date)

### Transform the data

In [6]:
# Melt the wide frame (every column other than DATE/DATETIME is one location)
# into long DATE-DATETIME-LOCATION-VALUE format.
melted_fact_df = pd.melt(
    df,
    id_vars=['DATE', 'DATETIME'],
    var_name='LOCATION',
    value_name='VALUE',
)

# LOCATION holds the former column labels; cast them to int so that they can
# be compared and sorted numerically.
melted_fact_df['LOCATION'] = melted_fact_df['LOCATION'].astype(int)

# Order the rows by location and, within a location, chronologically, and
# assign the result back to melted_fact_df so the sort actually takes effect.
# The benchmark cells compute rolling windows per group, which depend on the
# row order inside each location; sorting on both keys makes that order
# deterministic instead of relying on the CSV layout.
melted_fact_df = melted_fact_df.sort_values(['LOCATION', 'DATETIME'])

melted_fact_df.head(5)

Unnamed: 0,DATE,DATETIME,LOCATION,VALUE
0,2009-07-02,2009-07-02 00:00:00,0,-79.5
1,2009-07-02,2009-07-02 00:05:00,0,-22.81
2,2009-07-02,2009-07-02 00:10:00,0,23.02
3,2009-07-02,2009-07-02 00:15:00,0,21.36
4,2009-07-02,2009-07-02 00:20:00,0,25.18


In [7]:
# Keep only the rows whose LOCATION id is below 197.
# NOTE(review): the reason for the 197 cut-off is not stated in the notebook - confirm.
melted_fact_df = melted_fact_df.loc[melted_fact_df['LOCATION'].lt(197)]

### Benchmark model

In [8]:
# First and last day (both inclusive) of the history window that the
# benchmark cells select from the fact table.
start_date = datetime.date(year=2010, month=6, day=1)
end_date = datetime.date(year=2010, month=6, day=30)

In [9]:
# 1 day model: the value observed at the same time of day one day earlier
# is used as the forecast.
fc_interval_day = 1

# History window: rows dated between start_date and end_date (inclusive).
model_1d_df = melted_fact_df.loc[
    (melted_fact_df['DATE'] >= start_date)
    & (melted_fact_df['DATE'] <= end_date)
].copy(deep=True)

# Time-of-day key built as "<hour>_<minute>" (not zero padded), used to group
# together the observations taken at the same time on different days.
model_1d_df['HOUR_MIN'] = (
    model_1d_df['DATETIME'].dt.hour.astype(str)
    + "_"
    + model_1d_df['DATETIME'].dt.minute.astype(str)
)

# Rolling mean over the last `fc_interval_day` rows of each
# (LOCATION, HOUR_MIN) group. With a window of 1 this is the value itself.
# Assumes each group holds one row per day in chronological order - TODO confirm.
model_1d_df['bm_1d_prediction'] = (
    model_1d_df
    .groupby(['LOCATION', 'HOUR_MIN'])['VALUE']
    .transform(lambda values: values.rolling(window=fc_interval_day).mean())
)

# Move every timestamp forward by the forecast interval so that each value is
# aligned with the moment it predicts, then keep only timestamps after end_date.
fc_1d_df = model_1d_df[['DATETIME', 'LOCATION', 'bm_1d_prediction']].copy(deep=True)
fc_1d_df['DATETIME'] += datetime.timedelta(days=fc_interval_day)
fc_1d_df = fc_1d_df.loc[fc_1d_df['DATETIME'].dt.date > end_date].copy(deep=True)

fc_1d_df

Unnamed: 0,DATETIME,LOCATION,bm_1d_prediction
104544,2010-07-01 00:00:00,0,363.16
104545,2010-07-01 00:05:00,0,449.45
104546,2010-07-01 00:10:00,0,504.09
104547,2010-07-01 00:15:00,0,506.39
104548,2010-07-01 00:20:00,0,453.43
...,...,...,...
25506427,2010-07-01 23:35:00,196,-155.03
25506428,2010-07-01 23:40:00,196,-157.24
25506429,2010-07-01 23:45:00,196,-153.46
25506430,2010-07-01 23:50:00,196,-157.43


In [10]:
# 3 day model: the mean of three observations taken at the same time of day
# is used as the forecast.
fc_interval_day = 3

# History window: rows dated between start_date and end_date (inclusive).
model_3d_df = melted_fact_df.loc[
    (melted_fact_df['DATE'] >= start_date)
    & (melted_fact_df['DATE'] <= end_date)
].copy(deep=True)

# Time-of-day key built as "<hour>_<minute>" (not zero padded), used to group
# together the observations taken at the same time on different days.
model_3d_df['HOUR_MIN'] = (
    model_3d_df['DATETIME'].dt.hour.astype(str)
    + "_"
    + model_3d_df['DATETIME'].dt.minute.astype(str)
)

# Rolling mean over the last `fc_interval_day` rows of each
# (LOCATION, HOUR_MIN) group; the first window-1 rows of a group are NaN.
# Assumes each group holds one row per day in chronological order - TODO confirm.
model_3d_df['bm_3d_prediction'] = (
    model_3d_df
    .groupby(['LOCATION', 'HOUR_MIN'])['VALUE']
    .transform(lambda values: values.rolling(window=fc_interval_day).mean())
)

# Move every timestamp forward by the forecast interval, then keep only
# timestamps after end_date and renumber the rows from 0.
# NOTE(review): the shift equals the window length, so the forecast for the
# first day after end_date is built from rows dated end_date - 2 days and
# earlier, not from the three most recent days - confirm this lag is intended.
fc_3d_df = model_3d_df[['DATETIME', 'LOCATION', 'bm_3d_prediction']].copy(deep=True)
fc_3d_df['DATETIME'] += datetime.timedelta(days=fc_interval_day)
fc_3d_df = (
    fc_3d_df.loc[fc_3d_df['DATETIME'].dt.date > end_date]
    .copy(deep=True)
    .reset_index(drop=True)
)

fc_3d_df

Unnamed: 0,DATETIME,LOCATION,bm_3d_prediction
0,2010-07-01 00:00:00,0,283.013333
1,2010-07-01 00:05:00,0,307.866667
2,2010-07-01 00:10:00,0,311.186667
3,2010-07-01 00:15:00,0,312.716667
4,2010-07-01 00:20:00,0,336.980000
...,...,...,...
170203,2010-07-03 23:35:00,196,-143.203333
170204,2010-07-03 23:40:00,196,-146.083333
170205,2010-07-03 23:45:00,196,-142.676667
170206,2010-07-03 23:50:00,196,-144.986667


In [12]:
# Build the evaluation frame: actual values joined with both benchmark forecasts.
# Actuals cover the dates after (end_date - 3 days) up to and including
# (end_date + 1 day).
eval_window_start = end_date - datetime.timedelta(days=3)  # exclusive bound
eval_window_end = end_date + datetime.timedelta(days=1)    # inclusive bound

fact_df = melted_fact_df.loc[
    (melted_fact_df['DATE'] > eval_window_start)
    & (melted_fact_df['DATE'] <= eval_window_end)
].copy(deep=True)

# Left joins keep every actual row; timestamps without a forecast get NaN
# in the corresponding prediction column.
eval_df = (
    fact_df
    .merge(fc_1d_df, on=['DATETIME','LOCATION'], how='left')
    .merge(fc_3d_df, on=['DATETIME','LOCATION'], how='left')
)

In [13]:
# Plot the actual values against the benchmark forecasts
def plot_predictions(location):
    """Show actual values and both benchmark predictions for one location.

    Reads the notebook-level ``eval_df`` frame, selects the rows whose
    LOCATION equals ``location`` and draws VALUE, bm_1d_prediction and
    bm_3d_prediction as line traces over DATETIME in a single plotly figure.

    Parameters
    ----------
    location
        Value compared for equality against ``eval_df['LOCATION']``.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()`` and not returned.
    """
    location_rows = eval_df.loc[eval_df['LOCATION'] == location]

    # (column in eval_df, legend label) for every line, in drawing order.
    traces = (
        ('VALUE', 'Actual Value'),
        ('bm_1d_prediction', '1 Day Prediction'),
        ('bm_3d_prediction', '3 Day Prediction'),
    )

    fig = go.Figure()
    for column, label in traces:
        fig.add_trace(
            go.Scatter(
                x=location_rows['DATETIME'],
                y=location_rows[column],
                mode='lines',
                name=label,
            )
        )

    fig.update_layout(
        title=f'Predictions for Location {location}',
        xaxis_title='Datetime',
        yaxis_title='Values',
    )

    fig.show()

plot_predictions(3)

In [15]:
# Write the evaluation frame (actuals plus both benchmark forecasts) to CSV,
# leaving out the row index.
prediction_file = prediction_output_path + 'bm_pred_2010_07_01_1_3_d.csv'
eval_df.to_csv(prediction_file, index=False)