In [7]:
import pandas as pd

# Loading

In [8]:
# Load the CSV; its first column becomes the row index
df = pd.read_csv(
    'first_5_points copy.csv',
    index_col=0,
    parse_dates=True,
)
# Coerce the index to datetimes explicitly, in case read_csv left it unparsed
df.index = pd.to_datetime(df.index)
df.head(2)

Unnamed: 0_level_0,Date,Hour,Little Collins St-Swanston St (East),Faraday St-Lygon St (West),Melbourne Central,Chinatown-Lt Bourke St (South),Lonsdale St (South),Lygon St (West),Weekday,Month,Season,IsPublicHoliday
Datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2022-04-01 00:00:00,01/04/2022,0,166.0,24.0,380.0,73.0,215.0,48.0,5,4,Spring,0
2022-04-01 01:00:00,01/04/2022,1,108.0,9.0,198.0,79.0,156.0,11.0,5,4,Spring,0


In [9]:
# Chronological hold-out: rows before 2024-10-16 form the training set,
# rows on or after that date form the test set.
train, test = (
    df.loc[df.index < "2024-10-16"],
    df.loc[df.index >= "2024-10-16"],
)

# SARIMA

In [10]:
# Display the column labels of the training frame
train.columns

Index(['Date', 'Hour', 'Little Collins St-Swanston St (East)',
       'Faraday St-Lygon St (West)', 'Melbourne Central',
       'Chinatown-Lt Bourke St (South)', 'Lonsdale St (South)',
       'Lygon St (West)', 'Weekday', 'Month', 'Season', 'IsPublicHoliday'],
      dtype='object')

## Stationarity (d)

In [None]:
from statsmodels.tsa.stattools import adfuller

# Sensor columns whose series are tested for stationarity
target_column = ['Little Collins St-Swanston St (East)',
       'Faraday St-Lygon St (West)', 'Melbourne Central',
       'Chinatown-Lt Bourke St (South)', 'Lonsdale St (South)',
       'Lygon St (West)']

# Augmented Dickey-Fuller test, one series at a time.
# The null hypothesis is a unit root, so a p-value > 0.05 means
# non-stationarity cannot be rejected (differencing would be advisable).
for column in target_column:
    print(column)
    # Missing values break the test regression, so they are dropped first
    result = adfuller(train[column].dropna())
    print('p-value:', result[1])


No differencing needed

In [12]:
# Non-seasonal differencing order for the SARIMA models (0 = no differencing)
d = 0

## Seasonal

In [None]:
from statsmodels.tsa.seasonal import seasonal_decompose
import matplotlib.pyplot as plt

# Work on the first 1000 rows only, so individual cycles remain visible in the plot
df_2 = train.iloc[:1000]

# Additive decomposition with a 24-observation period (24 for hourly data)
decomposition = seasonal_decompose(
    df_2['Little Collins St-Swanston St (East)'],
    model='additive',
    period=24,
)

fig = decomposition.plot()
fig.set_size_inches(10, 6)  # width x height, in inches

# Thin out the lines on every panel
for ax in fig.axes:
    plt.setp(ax.lines, linewidth=1)

fig.tight_layout()  # tidy the spacing between panels
plt.show()


In [None]:
# Seasonal period: 24 hourly observations per daily cycle
S = 24

## Parameter search

In [None]:
import itertools
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
from tqdm import tqdm  # For progress bar

# Define parameter ranges.
# Every (p, d, q) x (P, D, Q, m) pair below is fitted once per column, and the
# pair with the lowest AIC wins.
p = q = range(0, 4)  # Range for p and q
d = 0                # Fixed d
P = D = Q = range(0, 3)  # Range for seasonal P, D, Q
m = 24               # Daily seasonality

# Conform the index to an hourly frequency so SARIMAX receives a regular
# time series (timestamps absent from the data become NaN rows).
train = train.asfreq('H')

# Generate all parameter combinations
pdq = [(p_val, d, q_val) for p_val, q_val in itertools.product(p, q)]  # Fix d = 0
print("Non-seasonal combinations (p, d, q):", pdq)

seasonal_pdq = [
    (P_val, D_val, Q_val, m)
    for P_val, D_val, Q_val in itertools.product(P, D, Q)
]
print("Seasonal combinations (P, D, Q, m):", seasonal_pdq)

total_combinations = len(pdq) * len(seasonal_pdq)

# Best result per column:
# {'params': (order, seasonal_order), 'aic': float, 'failed': [(order, seasonal_order, error), ...]}
best_results = {}

columns = ['Little Collins St-Swanston St (East)',
       'Faraday St-Lygon St (West)', 'Melbourne Central',
       'Chinatown-Lt Bourke St (South)', 'Lonsdale St (South)',
       'Lygon St (West)']

# Iterate over columns
for column in columns:
    print(f"Processing column: {column}")

    # Reset best AIC and parameters for each column
    best_aic = float('inf')
    best_params = None
    # Fits that raised are recorded rather than silently discarded, so an
    # all-failed search (best_params is None) can be diagnosed afterwards.
    failed_fits = []

    # The context manager closes the progress bar even if the search is interrupted
    with tqdm(total=total_combinations, desc=f"Testing SARIMAX for {column}") as progress_bar:
        # Grid search over all parameter combinations (same order as nested loops)
        for param, seasonal_param in itertools.product(pdq, seasonal_pdq):
            try:
                # Fit the model
                model = SARIMAX(
                    train[column],
                    order=param,
                    seasonal_order=seasonal_param,
                    enforce_stationarity=False,
                    enforce_invertibility=False
                )
                results = model.fit(disp=False)

                # Check if the model is better (a NaN AIC never compares as smaller)
                if results.aic < best_aic:
                    best_aic = results.aic
                    best_params = (param, seasonal_param)

            except Exception as e:
                # Keep searching, but remember which combination failed and why
                failed_fits.append((param, seasonal_param, repr(e)))

            finally:
                # Update the progress bar whether or not the fit succeeded
                progress_bar.update(1)

    # Store the best parameters and AIC for this column
    best_results[column] = {'params': best_params, 'aic': best_aic, 'failed': failed_fits}

    if failed_fits:
        print(
            f"{len(failed_fits)} of {total_combinations} fits failed for {column}; "
            f"first error: {failed_fits[0][2]}"
        )

    print(f"Best SARIMAX parameters for {column}: {best_params} with AIC: {best_aic}")

# Display the results for all columns
for col, result in best_results.items():
    print(f"Column: {col}, Best Params: {result['params']}, Best AIC: {result['aic']}")


Column: Little Collins St-Swanston St (East), Best Params: ((1, 0, 2), (1, 1, 1, 24)), Best AIC: 230806.7825760926 <br>
Column: Faraday St-Lygon St (West), Best Params: ((1, 0, 2), (1, 1, 1, 24)), Best AIC: 190003.90829115754 <br>
Column: Melbourne Central, Best Params: ((1, 0, 2), (1, 1, 1, 24)), Best AIC: 233857.60412778181 <br>
Column: Chinatown-Lt Bourke St (South), Best Params: ((1, 0, 2), (1, 1, 1, 24)), Best AIC: 219997.16664512668 <br>
Column: Lonsdale St (South), Best Params: ((2, 0, 1), (1, 1, 1, 24)), Best AIC: 208037.09922881392 <br>
Column: Lygon St (West), Best Params: ((2, 0, 2), (1, 1, 1, 24)), Best AIC: 195383.3819879219 <br>

## Fit SARIMAX

In [None]:
from statsmodels.tsa.statespace.sarimax import SARIMAX
import pickle
# (order, seasonal_order) selected per sensor column.
# NOTE(review): 'Lygon St (West)' is a column in the data but has no entry
# here, so no model is trained for it -- confirm this omission is intended.
best_params = {
    'Little Collins St-Swanston St (East)': ((1, 0, 2), (1, 1, 1, 24)),
    'Faraday St-Lygon St (West)': ((1, 0, 2), (1, 1, 1, 24)),
    'Melbourne Central': ((1, 0, 2), (1, 1, 1, 24)),
    'Chinatown-Lt Bourke St (South)': ((1, 0, 2), (1, 1, 1, 24)),
    'Lonsdale St (South)': ((2, 0, 1), (1, 1, 1, 24)),
}

# Fitted results objects, keyed by column name
trained_models = {}

for column, (order, seasonal_order) in best_params.items():
    print(f"Training SARIMAX model for column: {column}")

    # Define the SARIMAX model
    model = SARIMAX(
        train[column],
        order=order,
        seasonal_order=seasonal_order,
        enforce_stationarity=False,
        enforce_invertibility=False
    )

    # Fit the model
    results = model.fit(disp=False)
    trained_models[column] = results

    # Save the trained model to a file. The dump must actually run: opening the
    # file in 'wb' mode without writing leaves a zero-byte .pkl that cannot be
    # loaded later, while the message below would still claim success.
    model_path = f'{column.replace(" ", "_")}_sarimax_model.pkl'
    with open(model_path, 'wb') as file:
        pickle.dump(results, file)

    print(f"Model for {column} saved successfully.")

# Train/test: save predictions directly to CSV (no pickle)

In [8]:
import pandas as pd
from statsmodels.tsa.statespace.sarimax import SARIMAX
import numpy as np

# Define evaluation metrics (optional, for verification)
def calculate_metrics(y_true, y_pred):
    """Return ``(rmse, mape)`` for a forecast.

    Parameters
    ----------
    y_true : array-like
        Observed values.
    y_pred : array-like
        Predicted values, in the same order and of the same length as ``y_true``.

    Returns
    -------
    tuple of float
        ``rmse`` is the root mean squared error. ``mape`` is the mean absolute
        percentage error in percent, computed only over positions where
        ``y_true`` is non-zero; it is NaN when every true value is zero.

    Notes
    -----
    Inputs are compared positionally (converted to plain arrays), so pandas
    index labels are ignored. RMSE is computed with numpy, so no scikit-learn
    import is required.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    errors = actual - forecast

    rmse = float(np.sqrt(np.mean(errors ** 2)))

    # A zero true value would make the percentage error infinite/undefined,
    # so those positions are left out of the MAPE.
    nonzero = actual != 0
    if nonzero.any():
        mape = float(np.mean(np.abs(errors[nonzero] / actual[nonzero])) * 100)
    else:
        mape = float('nan')
    return rmse, mape

# Define the best parameters for SARIMAX: column -> (order, seasonal_order)
best_params = {
    'Little Collins St-Swanston St (East)': ((1, 0, 2), (1, 1, 1, 24)),
    'Faraday St-Lygon St (West)': ((1, 0, 2), (1, 1, 1, 24)),
    'Melbourne Central': ((1, 0, 2), (1, 1, 1, 24)),
    'Chinatown-Lt Bourke St (South)': ((1, 0, 2), (1, 1, 1, 24)),
    'Lonsdale St (South)': ((2, 0, 1), (1, 1, 1, 24)),
}

# Size of the hold-out window: the last 16 days of hourly observations
TEST_HOURS = 16 * 24

# Load the data
data = pd.read_csv('first_5_points copy.csv', parse_dates=['Datetime'], index_col='Datetime')
print("Data loaded successfully.")

# Initialize a DataFrame to store predictions
predictions = pd.DataFrame(index=data.iloc[-TEST_HOURS:].index)  # Test period index

# Process each location
for column, (order, seasonal_order) in best_params.items():
    print(f"Processing column: {column}")

    if column not in data.columns:
        print(f"Column {column} not found in data. Skipping...")
        continue

    # Prepare the data
    y = data[column].dropna()

    # A series no longer than the hold-out window leaves nothing to train on
    if len(y) <= TEST_HOURS:
        print(
            f"Column {column} has only {len(y)} observations; "
            f"more than {TEST_HOURS} are required. Skipping..."
        )
        continue

    # Split into training and testing (last 16 days for testing).
    # NOTE: `train` / `test` are Series here; if this cell runs in the same
    # kernel as earlier cells they replace any DataFrames of the same name.
    split_index = len(y) - TEST_HOURS
    train, test = y.iloc[:split_index], y.iloc[split_index:]

    try:
        # Define and train the SARIMAX model
        model = SARIMAX(
            train,
            order=order,
            seasonal_order=seasonal_order,
            enforce_stationarity=False,
            enforce_invertibility=False
        )
        results = model.fit(disp=False)

        # Generate predictions for the test set
        y_pred = results.forecast(steps=len(test))

        # Label the forecast with this column's own test timestamps and let
        # pandas align it with the output frame. Assigning raw values
        # positionally would attach wrong timestamps whenever dropna()
        # removed rows from this column; an index that cannot be aligned
        # raises here and is reported by the handler below.
        predictions[column] = pd.Series(np.asarray(y_pred), index=test.index)
        print(f"Predictions for {column} generated successfully.")

    except Exception as e:
        print(f"Error processing {column}: {e}")

# Save predictions to a CSV file
predictions.to_csv('sarimax_predictions.csv')
print("Predictions saved to 'sarimax_predictions.csv'.")

Data loaded successfully.
Processing column: Little Collins St-Swanston St (East)


  self._init_dates(dates, freq)
  self._init_dates(dates, freq)
  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Predictions for Little Collins St-Swanston St (East) generated successfully.
Processing column: Faraday St-Lygon St (West)


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Predictions for Faraday St-Lygon St (West) generated successfully.
Processing column: Melbourne Central


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Predictions for Melbourne Central generated successfully.
Processing column: Chinatown-Lt Bourke St (South)


  return get_prediction_index(
  return get_prediction_index(
  self._init_dates(dates, freq)
  self._init_dates(dates, freq)


Predictions for Chinatown-Lt Bourke St (South) generated successfully.
Processing column: Lonsdale St (South)
Predictions for Lonsdale St (South) generated successfully.
Predictions saved to 'sarimax_predictions.csv'.


  return get_prediction_index(
  return get_prediction_index(
