In [None]:
!pip install statsmodels

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error
import seaborn as sns
import copy
from sklearn.preprocessing import MinMaxScaler
import time

import random

from statsmodels.tsa.ar_model import AutoReg
from statsmodels.graphics.tsaplots import plot_pacf, plot_acf
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm


In [None]:
# Load the raw station readings from the CSV export.
data = pd.read_csv('station_153211-2022-09_2023-01.csv')

# Adding / removing columns.
# Parse the `created_at` strings into timestamps and use them as the row index.
# `infer_datetime_format` is intentionally not passed: it is deprecated since
# pandas 2.0, where format inference is already the default.
data['DateTime'] = pd.to_datetime(data['created_at'])
data = data.set_index('DateTime')

# `created_at` duplicates the information now held by the index.
data = data.drop(columns='created_at')

# Summarise the frame exactly as the following cells will see it.
data.info()

In [None]:
# Resample the irregular readings onto a regular 5-minute grid: every grid
# timestamp receives the column-wise mean of the `k` readings closest in time.

# Keep only the first reading for any timestamp that occurs more than once.
data = data[~data.index.duplicated(keep='first')]

# The grid runs from the 6th timestamp to the 5th-from-last timestamp of the data.
start_date = data.index[5]
end_date = data.index[-5]
# '5min' is the non-deprecated spelling of the former '5T' alias.
datetime_index = pd.date_range(start=start_date, end=end_date, freq='5min')

k = 1

# Sort once so the nearest readings can be located by binary search instead of
# copying and fully sorting the whole frame for every grid timestamp.
data_sorted = data.sort_index()
n_rows = len(data_sorted)
insert_positions = data_sorted.index.searchsorted(datetime_index)

resampled_rows = []
for target_time, pos in zip(datetime_index, insert_positions):
    # In a sorted index the k nearest readings must lie among the k rows just
    # before the insertion point and the k rows from it onwards.
    lo = max(pos - k, 0)
    hi = min(pos + k, n_rows)
    window = data_sorted.iloc[lo:hi]

    time_difference = abs(window.index - target_time).to_numpy()
    # Stable sort: when two readings are equally far away the earlier one wins.
    nearest = np.argsort(time_difference, kind='stable')[:k]

    resampled_rows.append(window.iloc[nearest].mean(axis=0))

# Build the result in one go (numeric dtypes) rather than assigning row by row
# into a pre-allocated object-dtype frame.
data_r = pd.DataFrame(resampled_rows, columns=data.columns)
data_r.index = datetime_index

In [None]:
# Print a summary of the resampled frame (index range, columns, non-null counts, dtypes).
data_r.info()

In [None]:
# Calendar features derived from the timestamp index.
data_r['day_of_week'] = data_r.index.dayofweek
data_r['month'] = data_r.index.month
# pandas numbers weekdays from Monday=0, so values 5 and 6 are Saturday and Sunday.
data_r['is_weekend'] = (data_r.index.dayofweek >= 5).astype(int)
# Position of the timestamp within its day, counted in 5-minute steps.
data_r['5minute_bin'] = (data_r.index.hour * 60 + data_r.index.minute) // 5

# Drop the two columns in a single non-in-place call. `errors='ignore'` keeps the
# cell re-runnable: on a second execution the columns are already gone, and the
# original in-place drops would raise KeyError.
data_r = data_r.drop(columns=['station_id', 'outlet_count'], errors='ignore')

In [None]:
# Show how many rows fall into each value of the '5minute_bin' column.
print(data_r['5minute_bin'].value_counts())

In [None]:
# Min-max scale every column of the resampled frame.
# NOTE(review): the scaler is fitted on the full frame, so the per-column
# min/max also come from rows that later cells use as forecast targets.
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(data_r)

# Re-attach the original index and column labels to the scaled array.
df_normalized = pd.DataFrame(scaled_values, index=data_r.index, columns=data_r.columns)

# Value frequencies of 'occupied_count' after scaling (printed) and before
# scaling (displayed as the cell's result).
print(df_normalized['occupied_count'].value_counts())
data_r['occupied_count'].value_counts()

In [None]:
import matplotlib.pyplot as plt

# Collect the distinct calendar dates covered by the first 500 rows, in
# chronological order. (A set alone has no defined order, so the plots would
# otherwise appear in arbitrary sequence.)
# NOTE(review): 500 five-minute rows span roughly 42 hours, i.e. two or three
# dates rather than five — raise the slice if five days are wanted.
first_five_dates = sorted(set(df_normalized.index.date[:500]))
print(first_five_dates)

# Time-of-day window shown in every plot. The title is built from the same
# values so the label always matches the data that is actually drawn.
plot_start, plot_end = '00:00', '23:00'

# Plot data for each day in a separate figure
for date in first_five_dates:
    # Filter the data for the current date and the configured time-of-day window
    data_day = df_normalized[df_normalized.index.date == date]
    data_daytime = data_day.between_time(plot_start, plot_end)

    # Create a new plot for each day
    plt.figure(figsize=(12, 6))

    # Plot the data for the current day
    plt.plot(data_daytime.index, data_daytime['occupied_count'], color='blue', marker='o', linestyle='-')
    plt.title(f'Data for {date}: {plot_start} to {plot_end}')
    plt.xlabel('Time')
    plt.ylabel('Occupied Count')
    plt.xticks(rotation=45)
    plt.grid(True)

    # Show plot for the current day
    plt.show()

### MA (Moving Average)

In [None]:
# Take the column at position 1 of the normalized frame as a float series;
# entries that cannot be parsed as numbers are coerced to NaN.
# NOTE(review): which feature sits at position 1 depends on the CSV column
# order — confirm it is the intended forecasting target.
second_feature_data = pd.to_numeric(df_normalized.iloc[:, 1], errors='coerce').astype(float)

In [None]:
# Plot autocorrelation
# Create the axes explicitly and hand them to plot_acf: without `ax=`, plot_acf
# opens a figure of its own and the one created here is left empty.
fig, ax = plt.subplots(figsize=(10, 6))
# 4032 lags = 14 * 288, i.e. 14 days of 5-minute steps.
plot_acf(second_feature_data, lags=4032, alpha=0.05, ax=ax)
ax.set_title('Autocorrelation Plot')
ax.set_xlabel('Lag')
ax.set_ylabel('Autocorrelation')
plt.show()

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import random

# Assuming second_feature_data is your dataset
# second_feature_data = np.random.rand(10000)  # Example, replace with your actual dataset

window_size = 576  
prediction_length = 36  

all_forecasts = []
all_actuals = []

max_start_index = len(second_feature_data) - window_size - prediction_length

# Selecting 25 random start indices for the prediction
random_start_indices = random.sample(range(max_start_index), 25)

for start_index in random_start_indices:
    print(start_index)
    train = second_feature_data[start_index:start_index + window_size]
    test = second_feature_data[start_index + window_size:start_index + window_size + prediction_length]
    
    model = ARIMA(train, order=(0, 0, 36))
    model_fitted = model.fit()
    
    forecast = model_fitted.forecast(steps=prediction_length)
    all_forecasts.append(forecast)
    all_actuals.append(test)

# Calculate MSE for each prediction set
mse_losses = [mean_squared_error(actuals, forecasts) for actuals, forecasts in zip(all_actuals, all_forecasts)]

# Compute the average MSE
average_mse = np.mean(mse_losses)
print("Average Mean Squared Error (MSE) across all sequences:", average_mse)

# Plotting each sequence in its own plot
for i, start_index in enumerate(random_start_indices):
    plt.figure(figsize=(10, 6))  # Create a new figure for each plot
    plt.plot(range(prediction_length), all_actuals[i], label='Actual Future Values', marker='o', linestyle='-', color='blue')
    plt.plot(range(prediction_length), all_forecasts[i], label='Forecasted Future Values', marker='x', linestyle='--', color='red')
    plt.title(f'Sequence starting at {start_index + 1}: Predictions vs Actuals')
    plt.ylabel('Value')
    plt.ylim(0, 1)  # Ensure the y-axis ranges from 0 to 1
    plt.legend()
    plt.xlabel('Time Step')
    plt.show()  # Show the plot

### AR (Auto Regressive)

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import random

# Assuming second_feature_data is your dataset
# second_feature_data = np.random.rand(10000)  # Example, replace with your actual dataset

window_size = 576  
prediction_length = 36  

all_forecasts = []
all_actuals = []

max_start_index = len(second_feature_data) - window_size - prediction_length

# Selecting 25 random start indices for the prediction
random_start_indices = random.sample(range(max_start_index), 25)

for start_index in random_start_indices:
    print(start_index)
    train = second_feature_data[start_index:start_index + window_size]
    test = second_feature_data[start_index + window_size:start_index + window_size + prediction_length]
    
    model = ARIMA(train, order=(36, 0, 0))
    model_fitted = model.fit()
    
    forecast = model_fitted.forecast(steps=prediction_length)
    all_forecasts.append(forecast)
    all_actuals.append(test)

# Calculate MSE for each prediction set
mse_losses = [mean_squared_error(actuals, forecasts) for actuals, forecasts in zip(all_actuals, all_forecasts)]

# Compute the average MSE
average_mse = np.mean(mse_losses)
print("Average Mean Squared Error (MSE) across all sequences:", average_mse)

# Plotting each sequence in its own plot
for i, start_index in enumerate(random_start_indices):
    plt.figure(figsize=(10, 6))  # Create a new figure for each plot
    plt.plot(range(prediction_length), all_actuals[i], label='Actual Future Values', marker='o', linestyle='-', color='blue')
    plt.plot(range(prediction_length), all_forecasts[i], label='Forecasted Future Values', marker='x', linestyle='--', color='red')
    plt.title(f'Sequence starting at {start_index + 1}: Predictions vs Actuals')
    plt.ylabel('Value')
    plt.ylim(0, 1)  # Ensure the y-axis ranges from 0 to 1
    plt.legend()
    plt.xlabel('Time Step')
    plt.show()  # Show the plot

### ARIMA

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import random

# Assuming second_feature_data is your dataset
# second_feature_data = np.random.rand(10000)  # Example, replace with your actual dataset

window_size = 576  
prediction_length = 36  

all_forecasts = []
all_actuals = []

max_start_index = len(second_feature_data) - window_size - prediction_length

# Selecting 25 random start indices for the prediction
random_start_indices = random.sample(range(max_start_index), 25)

for start_index in random_start_indices:
    print(start_index)
    train = second_feature_data[start_index:start_index + window_size]
    test = second_feature_data[start_index + window_size:start_index + window_size + prediction_length]
    
    model = ARIMA(train, order=(36, 1, 36))
    model_fitted = model.fit()
    
    forecast = model_fitted.forecast(steps=prediction_length)
    all_forecasts.append(forecast)
    all_actuals.append(test)

# Calculate MSE for each prediction set
mse_losses = [mean_squared_error(actuals, forecasts) for actuals, forecasts in zip(all_actuals, all_forecasts)]

# Compute the average MSE
average_mse = np.mean(mse_losses)
print("Average Mean Squared Error (MSE) across all sequences:", average_mse)

# Plotting each sequence in its own plot
for i, start_index in enumerate(random_start_indices):
    plt.figure(figsize=(10, 6))  # Create a new figure for each plot
    plt.plot(range(prediction_length), all_actuals[i], label='Actual Future Values', marker='o', linestyle='-', color='blue')
    plt.plot(range(prediction_length), all_forecasts[i], label='Forecasted Future Values', marker='x', linestyle='--', color='red')
    plt.title(f'Sequence starting at {start_index + 1}: Predictions vs Actuals')
    plt.ylabel('Value')
    plt.ylim(0, 1)  # Ensure the y-axis ranges from 0 to 1
    plt.legend()
    plt.xlabel('Time Step')
    plt.show()  # Show the plot

### SARIMA

In [None]:
import numpy as np
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import random

# Assuming second_feature_data is your dataset
# second_feature_data = np.random.rand(10000)  # Example, replace with your actual dataset

window_size = 576  
prediction_length = 36  

all_forecasts = []
all_actuals = []

max_start_index = len(second_feature_data) - window_size - prediction_length

# Selecting 25 random start indices for the prediction
random_start_indices = random.sample(range(max_start_index), 3)

for start_index in random_start_indices:
    print(start_index)
    train = second_feature_data[start_index:start_index + window_size]
    test = second_feature_data[start_index + window_size:start_index + window_size + prediction_length]
    
    model = SARIMAX(train, order=(36, 1, 36), seasonal_order=(36, 1, 36, 72))

    model_fitted = model.fit()
    
    forecast = model_fitted.forecast(steps=prediction_length)
    all_forecasts.append(forecast)
    all_actuals.append(test)

# Calculate MSE for each prediction set
mse_losses = [mean_squared_error(actuals, forecasts) for actuals, forecasts in zip(all_actuals, all_forecasts)]

# Compute the average MSE
average_mse = np.mean(mse_losses)
print("Average Mean Squared Error (MSE) across all sequences:", average_mse)

# Plotting each sequence in its own plot
for i, start_index in enumerate(random_start_indices):
    plt.figure(figsize=(10, 6))  # Create a new figure for each plot
    plt.plot(range(prediction_length), all_actuals[i], label='Actual Future Values', marker='o', linestyle='-', color='blue')
    plt.plot(range(prediction_length), all_forecasts[i], label='Forecasted Future Values', marker='x', linestyle='--', color='red')
    plt.title(f'Sequence starting at {start_index + 1}: Predictions vs Actuals')
    plt.ylabel('Value')
    plt.ylim(0, 1)  # Ensure the y-axis ranges from 0 to 1
    plt.legend()
    plt.xlabel('Time Step')
    plt.show()  # Show the plot
