In [7]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from src.holt_winters import create_holt_winters_model

In [3]:
# Load the per-minute request counts, parse the 'time' column as datetimes
# and use it as the index
df = (
    pd.read_csv("datasets/nasa_requests_per_minute.csv")
    .assign(time=lambda frame: pd.to_datetime(frame['time']))
    .set_index('time')
)

# Text summary of what was loaded: size, covered time span, first rows
print(f"Dataset shape: {df.shape}")
print(f"Total entries in dataset: {len(df)}")
print(f"Date range: {df.index.min()} - {df.index.max()}")
print(f"First 5 rows: \n {df.head()}")

Dataset shape: (87015, 1)
Total entries in dataset: 87015
Date range: 1995-07-01 00:00:00 - 1995-08-31 23:59:00
First 5 rows: 
                      number_of_requests
time                                   
1995-07-01 00:00:00                  42
1995-07-01 00:01:00                  61
1995-07-01 00:02:00                  57
1995-07-01 00:03:00                  71
1995-07-01 00:04:00                  70


In [4]:
# Positional 80/20 split: the first 80% of rows form the training set and the
# remaining rows form the test set (row order is left as loaded)
train_size = int(len(df) * 0.8)
train_dataset, test_dataset = df.iloc[:train_size], df.iloc[train_size:]

# Report the size and time span of each partition
print(f"Train data: {len(train_dataset)} entries")
print(f"Date range: {train_dataset.index.min()} - {train_dataset.index.max()} \n")
print(f"Test data: {len(test_dataset)} entries")
print(f"Date range: {test_dataset.index.min()} - {test_dataset.index.max()}")

Train data: 69612 entries
Date range: 1995-07-01 00:00:00 - 1995-08-19 21:56:00 

Test data: 17403 entries
Date range: 1995-08-19 21:57:00 - 1995-08-31 23:59:00


In [5]:
# Build the model with a seasonal period of 1440 samples; the data is sampled
# once per minute, so 1440 samples correspond to one day (24 * 60)
holt_winters_model = create_holt_winters_model(seasonal_periods=1440)

# The request counts are handed to fit() as a plain Python list
training_series = train_dataset['number_of_requests'].tolist()

print("Training model...")
holt_winters_model.fit(training_series)
print(f"Model training complete! \n Model fitted: {holt_winters_model.is_fitted()}")

Training model...




Model training complete! 
 Model fitted: True


In [9]:
# Development aid: re-execute src.holt_winters so edits made to the module on
# disk are picked up without restarting the kernel.
# NOTE(review): per the importlib.reload documentation, names previously bound
# with `from src.holt_winters import ...` are not rebound, and instances created
# before the reload keep their old class. The model-creation cell (and the
# import cell) must be re-run for module changes to reach holt_winters_model.
import importlib
import src.holt_winters
importlib.reload(src.holt_winters)

<module 'src.holt_winters' from 'C:\\Users\\ajaylk\\uni\\fyp\\code\\tsf-models\\src\\holt_winters.py'>

In [10]:
# Test fitted model with a one-step-ahead loop: request a single-step forecast,
# store it together with its interval fields, then pass the observed value to
# the model before moving on to the next sample.
# NOTE(review): every run of this cell calls holt_winters_model.update(), so
# re-running it without re-fitting presumably feeds the test data in a second
# time - confirm against the model implementation.
predictions = []
actual_values = []
prediction_intervals = []

# Extract the target column once. Indexing the DataFrame row by row inside the
# loop (iloc[i][column]) builds a temporary Series for every test sample.
# to_numpy() keeps the elements as numpy scalars, as the row lookup produced.
test_requests = test_dataset['number_of_requests'].to_numpy()
total_samples = len(test_requests)

print("Running model against test data...")
for step, actual in enumerate(test_requests, start=1):
    prediction = holt_winters_model.predict(steps=1)

    predictions.append(prediction['prediction'])
    actual_values.append(actual)
    # Keep only the interval-related fields of the forecast
    prediction_intervals.append(
        {key: prediction[key] for key in ('lower_bound', 'upper_bound', 'std')}
    )

    # Append actual number of requests
    holt_winters_model.update([actual])
    if step % 1000 == 0:
        print(f"Processed {step}/{total_samples} test samples", flush=True)

Running model against test data...
Processed 1000/17403 test samples
Processed 2000/17403 test samples
Processed 3000/17403 test samples
Processed 4000/17403 test samples
Processed 5000/17403 test samples
Processed 6000/17403 test samples
Processed 7000/17403 test samples
Processed 8000/17403 test samples
Processed 9000/17403 test samples
Processed 10000/17403 test samples
Processed 11000/17403 test samples
Processed 12000/17403 test samples
Processed 13000/17403 test samples
Processed 14000/17403 test samples
Processed 15000/17403 test samples
Processed 16000/17403 test samples
Processed 17000/17403 test samples


In [None]:
# Save the trained model
# TODO: the destination was never filled in; the assignment had no right-hand
# side, which is a SyntaxError and stops the notebook from running top to
# bottom. Replace None with the target file path and add the call that
# persists the model.
path_to_save = None

In [12]:
# Each stored forecast is indexed by 'mean' to obtain the point prediction
cleaned_predictions = [forecast['mean'] for forecast in predictions]

In [13]:
# Evaluation metrics
# Turn the point forecasts and the observations into numpy arrays
predictions = np.array(cleaned_predictions)
actual_values = np.array(actual_values)

mae = mean_absolute_error(actual_values, predictions)
mse = mean_squared_error(actual_values, predictions)
rmse = np.sqrt(mse)

# Calculate % of actual requests within prediction intervals:
# count every observation that lies inside its own [lower_bound, upper_bound]
actuals_within_interval = 0
for idx in range(len(actual_values)):
    bounds = prediction_intervals[idx]
    if bounds['lower_bound'] <= actual_values[idx] <= bounds['upper_bound']:
        actuals_within_interval += 1
coverage_percentage = (actuals_within_interval / len(actual_values)) * 100

# Fixed-width report of the three headline numbers
separator = "="*60
print(separator)
print("MODEL PERFORMANCE METRICS")
print(separator)
print(f"Mean Absolute Error (MAE):           {mae:.2f} requests")
print(f"Root Mean Squared Error (RMSE):      {rmse:.2f} requests")
print(f"Prediction Interval Coverage (95%):  {coverage_percentage:.2f}%")
print(separator)

MODEL PERFORMANCE METRICS
Mean Absolute Error (MAE):           10.96 requests
Root Mean Squared Error (RMSE):      14.44 requests
Prediction Interval Coverage (95%):  93.27%


In [None]:
# Debug actual requests vs predicted request values
# Signed error (actual - predicted) per sample, computed once and reused
errors = [a - p for a, p in zip(actual_values, predictions)]

comparison_df = pd.DataFrame({
    'Timestamp': test_dataset.index,
    'Actual': actual_values,
    'Predicted': [round(p, 2) for p in predictions],
    'Error': [round(err, 2) for err in errors],
    'Abs_Error': [round(abs(err), 2) for err in errors],
    'Lower_95%': [round(pi['lower_bound'], 2) for pi in prediction_intervals],
    'Upper_95%': [round(pi['upper_bound'], 2) for pi in prediction_intervals]
})

# Print the head and the tail of the table under matching banners
for heading, sample in (
    ("PREDICTION vs ACTUAL COMPARISON (First 50 samples)", comparison_df.head(50)),
    ("PREDICTION vs ACTUAL COMPARISON (Last 50 samples)", comparison_df.tail(50)),
):
    print("\n" + "="*100)
    print(heading)
    print("="*100)
    print(sample.to_string(index=False))