In [None]:
from neuralforecast import NeuralForecast
from neuralforecast.models import LSTM, NHITS
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the monthly average-rating series and reshape it into the long layout
# used by the rest of the notebook: 'ds' (timestamp), 'y' (value) and
# 'unique_id' (series key).
ratings_csv = '../data/cleaned_data/average_monthly_rating_messenger.csv'
raw_ratings = pd.read_csv(ratings_csv)

df = (
    raw_ratings
    # 'YYYY-MM' strings become timestamps on the first day of each month.
    .assign(**{'Month-Year': pd.to_datetime(raw_ratings['Month-Year'], format='%Y-%m')})
    .rename(columns={'Month-Year': 'ds', 'averageRating': 'y'})
    # Only one series is present, so every row gets the same constant key.
    .assign(unique_id=0)
)

# Y_df is a second name for the same frame, not a copy.
Y_df = df
Y_df.head()

In [None]:
import numpy as np
from neuralforecast import NeuralForecast
from neuralforecast.models import NHITS, LSTM
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Chronological hold-out: the leading 80% of rows are used for training and
# the remaining rows form the test window.
train_fraction = 0.8
split_point = int(len(Y_df) * train_fraction)
Y_train, Y_test = (
    part.copy() for part in (Y_df.iloc[:split_point], Y_df.iloc[split_point:])
)

# The forecast horizon is exactly as long as the held-out test window.
horizon = Y_test.shape[0]
print(f"\nHorizon length: {horizon}")

# Recurrent model: 64-unit encoder/decoder, 500 training steps, standard scaling.
lstm_model = LSTM(
    h=horizon,
    max_steps=500,
    scaler_type='standard',
    encoder_hidden_size=64,
    decoder_hidden_size=64,
)

# NHITS looks back over two horizons' worth of history and trains for 100 steps.
nhits_model = NHITS(
    h=horizon,
    input_size=2 * horizon,
    max_steps=100,
    n_freq_downsample=[2, 1, 1],
)

# Order matters only for readability; both models share the same horizon.
models = [lstm_model, nhits_model]

In [None]:
# Train on the training window, forecast the test window, score each model,
# derive inverse-RMSE ensemble weights, then refit on the full series to
# forecast beyond the last observation.

# 'ds' is parsed from '%Y-%m' strings, so every observation is stamped on the
# first day of its month. 'MS' (month start) keeps generated forecast dates on
# that same grid; 'M' means month END and would stamp forecasts on a
# different day of the month than the observations they are compared with.
FREQ = 'MS'

# Train models on training data
nf = NeuralForecast(models=models, freq=FREQ)
nf.fit(df=Y_train)

# Generate predictions for test period (one row per step of the horizon)
Y_hat_df = nf.predict(df=Y_train).reset_index(drop=True)

# Calculate metrics and weights for ensemble
eval_results = {}
model_weights = {}

for model in ['LSTM', 'NHITS']:
    # Predictions are matched to the test rows by position (via .values),
    # not by timestamp, so the frame keeps Y_test's index.
    eval_df = pd.DataFrame({
        'ds': Y_test['ds'],
        'actual': Y_test['y'],
        'predicted': Y_hat_df[model].values[:len(Y_test)]
    })

    rmse = np.sqrt(mean_squared_error(eval_df['actual'], eval_df['predicted']))
    mae = mean_absolute_error(eval_df['actual'], eval_df['predicted'])
    # MAPE divides by the actual value, so it is undefined where actual == 0.
    mape = np.mean(np.abs((eval_df['actual'] - eval_df['predicted']) / eval_df['actual'])) * 100

    eval_results[model] = {
        'RMSE': rmse,
        'MAE': mae,
        'MAPE': mape
    }

    # Calculate weight as inverse of RMSE: lower error -> larger weight.
    # NOTE(review): these weights are fitted on the test window itself, so any
    # ensemble score computed on that same window is optimistic — consider a
    # separate validation split.
    model_weights[model] = 1/rmse

# Normalize weights to sum to 1
weight_sum = sum(model_weights.values())
for model in model_weights:
    model_weights[model] /= weight_sum

print("\nModel Weights:")
for model, weight in model_weights.items():
    print(f"{model}: {weight:.3f}")

# Refit the same model configurations on the full series and forecast the
# next `horizon` periods after the last observation.
nf_future = NeuralForecast(models=models, freq=FREQ)
nf_future.fit(df=Y_df)
Y_hat_future = nf_future.predict().reset_index(drop=True)

In [None]:
# Build the weighted ensemble, plot test-period and future forecasts, and
# report error metrics for the ensemble and the individual models.

base_models = ['LSTM', 'NHITS']
plot_columns = base_models + ['ENSEMBLE']

# Add ensemble predictions to both forecast frames: the weighted sum of the
# individual model forecasts, using the normalized inverse-RMSE weights.
for forecast_frame in (Y_hat_future, Y_hat_df):
    forecast_frame['ENSEMBLE'] = sum(
        forecast_frame[model] * model_weights[model] for model in base_models
    )

# Create visualization
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(20, 14))

# Plot 1: Training and Test Performance
Y_df.set_index('ds')['y'].plot(ax=ax1, linewidth=2, label='Actual', color='blue')

# Plot individual model and ensemble predictions
for model in plot_columns:
    Y_hat_df.set_index('ds')[model].plot(ax=ax1, linewidth=2, label=f'{model} Predictions')

# Mark where training data ends and shade the held-out window.
train_split_date = Y_train.iloc[-1]['ds']
ax1.axvline(x=train_split_date, color='r', linestyle='--', label='Train-Test Split')
ax1.axvspan(Y_test['ds'].min(), Y_test['ds'].max(), alpha=0.1, color='gray', label='Test Period')

ax1.set_title('Model Performance Comparison (Including Ensemble)', fontsize=22)
ax1.set_ylabel('Monthly Rating', fontsize=20)
ax1.set_xlabel('Timestamp [t]', fontsize=20)
ax1.legend(prop={'size': 15})
ax1.grid()

# Plot 2: Future Predictions
Y_df.set_index('ds')['y'].plot(ax=ax2, linewidth=2, label='Actual', color='blue')

# Plot future predictions for all models including ensemble
for model in plot_columns:
    Y_hat_future.set_index('ds')[model].plot(ax=ax2, linewidth=2, label=f'{model} Predictions')

ax2.set_title('Future Predictions (Including Ensemble)', fontsize=22)
ax2.set_ylabel('Monthly Rating', fontsize=20)
ax2.set_xlabel('Timestamp [t]', fontsize=20)
ax2.legend(prop={'size': 15})
ax2.grid()

plt.tight_layout()

# Calculate ensemble metrics for test period.
# Predictions are compared with the test rows by position, not by timestamp.
ensemble_predictions = Y_hat_df['ENSEMBLE'].values[:len(Y_test)]
ensemble_rmse = np.sqrt(mean_squared_error(Y_test['y'], ensemble_predictions))
ensemble_mae = mean_absolute_error(Y_test['y'], ensemble_predictions)
ensemble_mape = np.mean(np.abs((Y_test['y'] - ensemble_predictions) / Y_test['y'])) * 100

print("\nModel Performance Metrics:")
print("\nEnsemble Model:")
print(f"RMSE: {ensemble_rmse:.3f}")
print(f"MAE: {ensemble_mae:.3f}")
print(f"MAPE: {ensemble_mape:.2f}%")

for model in base_models:
    print(f"\n{model}:")
    print(f"RMSE: {eval_results[model]['RMSE']:.3f}")
    print(f"MAE: {eval_results[model]['MAE']:.3f}")
    print(f"MAPE: {eval_results[model]['MAPE']:.2f}%")

# Print future predictions
print("\nFuture Predictions (next 5 periods):")
# .round() returns a new frame, so nothing is assigned into a slice of
# Y_hat_future (column assignment on such a slice can raise
# SettingWithCopyWarning). Only the numeric forecast columns are rounded.
future_predictions = (
    Y_hat_future[['ds'] + plot_columns]
    .head()
    .round({column: 3 for column in plot_columns})
)
print(future_predictions)