In [None]:

# Cell 1: Import all necessary libraries, then seed every random number generator
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.layers import LSTM, Dense
from tensorflow.keras.models import Sequential

# Prefer the Adam optimizer bundled with TensorFlow; fall back to the
# standalone keras package when that import path is unavailable.
try:
    from tensorflow.keras.optimizers import Adam
except ImportError:
    from keras.optimizers import Adam

# Set global random seeds for reproducibility.
# Python's `random`, NumPy and TensorFlow each keep their own generator,
# so all three are seeded with the same value.
SEED = 45
np.random.seed(SEED)
tf.random.set_seed(SEED)
random.seed(SEED)

print("All libraries imported successfully!")

In [None]:

# Prefix prepended verbatim to every input file name below (no separator is
# inserted, so a directory prefix must end with its own path separator).
path_prefix = "******"

# 1. Load the original observations
try:
    all_data_pd = pd.read_csv(path_prefix + "original_data.csv")

    print("Original data loaded successfully.")
except FileNotFoundError:
    print("Error: Original data file not found. Please check the path!")
    # Re-raise: without `all_data_pd` the following cells cannot run, and
    # continuing would only surface later as an unrelated NameError.
    raise

# 2. Load the three key CSV files generated in R
try:
    training_residuals = pd.read_csv(path_prefix + "training_set_residuals.csv")
    arima_validation_forecasts = pd.read_csv(path_prefix + "validation_set_predictions.csv")
    arima_final_forecasts = pd.read_csv(path_prefix + "final_forecast_results.csv")
    print("R-generated CSV files loaded successfully.")
except FileNotFoundError as e:
    print(f"Error: CSV file not found: {e}. Please check the path!")
    # Re-raise for the same reason as above: all three frames are required downstream.
    raise

In [None]:
def create_dataset(data, time_steps=1):
    """Turn a sequence into sliding-window samples and their next-step targets.

    Sample ``k`` is ``data[k : k + time_steps]`` and its target is
    ``data[k + time_steps]``, for every ``k`` that leaves room for a target.

    Args:
        data: Indexable, sliceable sequence (list or NumPy array).
        time_steps: Number of consecutive elements in each input window.

    Returns:
        A tuple ``(X, y)`` of NumPy arrays holding ``len(data) - time_steps``
        windows and targets. Both arrays are empty when ``data`` has no more
        than ``time_steps`` elements.
    """
    n_windows = len(data) - time_steps
    windows = [data[start:start + time_steps] for start in range(n_windows)]
    targets = [data[start + time_steps] for start in range(n_windows)]
    return np.array(windows), np.array(targets)

In [None]:
# --- Cell 4: per-disease model tournament (ARIMA vs. ARIMA-LSTM hybrid), scored with RMSE and MAE ---
#
# For every disease: fit an LSTM on the ARIMA training residuals, forecast the
# residuals recursively, add them to the ARIMA forecasts to form the hybrid,
# compare both models on the validation period, and keep the final forecast of
# whichever model has the lower validation RMSE (ties go to ARIMA).

# Accumulators filled by the loop
final_results_list = []     # one final-forecast DataFrame per successfully processed disease
model_selection_list = []   # one dict of validation metrics + selected model per disease

# Hyperparameters
TIME_STEPS = 3
LSTM_UNITS = 50
EPOCHS = 200

# Forecast layout: the LSTM predicts the validation steps first, then the final steps.
N_VALIDATION_STEPS = 2
N_FINAL_STEPS = 4
# Inclusive bounds on the 'time' column that select the validation observations.
VALIDATION_PERIOD = (2018, 2019)

# NOTE(review): diseases are enumerated from column 'metric2' of the original data,
# while the R-generated frames are filtered on column 'metric' — presumably both
# hold the same labels; confirm against the data files.
unique_diseases = all_data_pd['metric2'].unique()

for disease in unique_diseases:
    if pd.isna(disease):
        continue
    print(f"--- Processing disease: {disease} ---")

    try:
        # a. Select this disease's rows and force sorting by time
        current_residuals_train_df = training_residuals[training_residuals['metric'] == disease].sort_values(by='time')
        current_arima_validation_df = arima_validation_forecasts[arima_validation_forecasts['metric'] == disease].sort_values(by='time')
        current_arima_final_df = arima_final_forecasts[arima_final_forecasts['metric'] == disease].sort_values(by='time')

        current_residuals_train = current_residuals_train_df['residual'].values
        current_arima_validation = current_arima_validation_df['predicted_incidence_rate'].values

        if len(current_residuals_train) < TIME_STEPS + 5:
            print("  >> Insufficient residual data, skipping LSTM modeling.")
            continue

        # b. Check the validation data BEFORE training, so a disease that would be
        #    skipped anyway does not pay for a full LSTM fit first.
        actual_validation_df = all_data_pd[
            (all_data_pd['metric2'] == disease) &
            (all_data_pd['time'].between(*VALIDATION_PERIOD))
        ].sort_values(by='time')
        actual_validation_values = actual_validation_df['incidence_rate'].values

        if len(actual_validation_values) != N_VALIDATION_STEPS:
            print(f"  >> Validation set actual data length is not {N_VALIDATION_STEPS}, skipping {disease}")
            continue

        # A single ARIMA forecast would be silently broadcast against the residuals
        # below, so fail explicitly (reported by the except handler) on any mismatch.
        if len(current_arima_validation) != N_VALIDATION_STEPS:
            raise ValueError(
                f"expected {N_VALIDATION_STEPS} ARIMA validation forecasts, "
                f"got {len(current_arima_validation)}"
            )

        # c. Create a fresh Adam optimizer instance with an explicit learning rate
        custom_optimizer = Adam(learning_rate=0.001)

        # d. LSTM training on the residuals, scaled to [0, 1]
        scaler = MinMaxScaler(feature_range=(0, 1))
        residuals_scaled = scaler.fit_transform(current_residuals_train.reshape(-1, 1))
        X_train, y_train = create_dataset(residuals_scaled, TIME_STEPS)
        # Reshape to (samples, time steps, 1 feature), matching the LSTM input_shape below
        X_train = np.reshape(X_train, (X_train.shape[0], X_train.shape[1], 1))
        lstm_model = Sequential([ LSTM(units=LSTM_UNITS, input_shape=(TIME_STEPS, 1)), Dense(units=1) ])
        lstm_model.compile(optimizer=custom_optimizer, loss='mean_squared_error')
        lstm_model.fit(X_train, y_train, epochs=EPOCHS, batch_size=1, verbose=0, shuffle=False)

        # e. Recursive LSTM prediction: start from the last TIME_STEPS scaled residuals
        #    and feed each prediction back in as the newest element of the window.
        n_predict = N_VALIDATION_STEPS + N_FINAL_STEPS
        last_known_residuals = residuals_scaled[-TIME_STEPS:]
        predicted_residuals_scaled = []
        current_input = last_known_residuals.reshape(1, TIME_STEPS, 1)
        for _ in range(n_predict):
            next_pred = lstm_model.predict(current_input, verbose=0)
            predicted_residuals_scaled.append(next_pred[0, 0])
            # Drop the oldest value, append the new prediction
            new_input_sequence = np.append(current_input.flatten()[1:], next_pred)
            current_input = new_input_sequence.reshape(1, TIME_STEPS, 1)
        # Undo the scaling to get residuals in the original units
        predicted_residuals = scaler.inverse_transform(np.array(predicted_residuals_scaled).reshape(-1, 1)).flatten()
        predicted_validation_residuals = predicted_residuals[:N_VALIDATION_STEPS]
        predicted_final_residuals = predicted_residuals[N_VALIDATION_STEPS:n_predict]

        # f. Hybrid model predictions on the validation set: ARIMA forecast + predicted residual
        hybrid_validation_pred = current_arima_validation + predicted_validation_residuals

        # g. RMSE for both models on the validation set
        arima_mse = mean_squared_error(actual_validation_values, current_arima_validation)
        hybrid_mse = mean_squared_error(actual_validation_values, hybrid_validation_pred)
        arima_rmse = np.sqrt(arima_mse)
        hybrid_rmse = np.sqrt(hybrid_mse)

        # MAE (Mean Absolute Error) for both models on the validation set
        arima_mae = mean_absolute_error(actual_validation_values, current_arima_validation)
        hybrid_mae = mean_absolute_error(actual_validation_values, hybrid_validation_pred)

        # h. Model tournament: ARIMA wins on a tie or when the hybrid RMSE is NaN
        if np.isnan(hybrid_rmse) or arima_rmse <= hybrid_rmse:
            winner = "ARIMA"
            final_prediction = current_arima_final_df.copy()
        else:
            winner = "ARIMA-LSTM"
            hybrid_final_prediction_values = current_arima_final_df['predicted_incidence_rate'].values + predicted_final_residuals
            final_prediction = current_arima_final_df.copy()
            final_prediction['predicted_incidence_rate'] = hybrid_final_prediction_values

        # i. Store results; predictions are floored at zero before being kept
        final_prediction['predicted_incidence_rate'] = final_prediction['predicted_incidence_rate'].clip(lower=0)
        final_results_list.append(final_prediction)

        model_selection_list.append({
            'metric': disease,
            'ARIMA_RMSE': arima_rmse,
            'ARIMA_MAE': arima_mae,
            'Hybrid_RMSE': hybrid_rmse,
            'Hybrid_MAE': hybrid_mae,
            'Selected_Model': winner
        })

    except Exception as e:
        # Best effort: report the failure and move on to the next disease
        print(f"  >> Error occurred while processing {disease}: {e}")

print("\n--- All diseases processed! ---")

In [None]:

# If every disease was skipped or failed, there is nothing to combine.
# Fail here with an explicit message instead of pd.concat's opaque
# "No objects to concatenate" (same exception type, ValueError).
if not final_results_list:
    raise ValueError(
        "No disease was processed successfully; there are no predictions to combine."
    )

# Stack the per-disease final forecasts into one frame with a fresh 0..n-1 index
final_predictions_df = pd.concat(final_results_list, ignore_index=True)
# One row per disease: validation RMSE/MAE of both models plus the selected model
model_selection_df = pd.DataFrame(model_selection_list)

# Median validation error of the hybrid model across all processed diseases
median_hybrid_rmse = model_selection_df['Hybrid_RMSE'].median()
median_hybrid_mae = model_selection_df['Hybrid_MAE'].median()

In [None]:

# Destination path of the Excel results file ("***" is a redacted directory placeholder).
# NOTE(review): nothing visible in this notebook section reads this variable —
# presumably a later cell writes the results to it; confirm.
output_excel_path = "***/final_prediction_results.xlsx"