In [None]:
import yfinance as yf
import pandas as pd
import numpy as np
from datetime import date
import seaborn as sns
import random

import matplotlib.pyplot as plt

from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.mixture import GaussianMixture


from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import acorr_ljungbox

import scipy.stats as stats
from scipy.stats import probplot, laplace, norm, t


import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess

import pymc as pm
import pytensor.tensor as pt
import arviz as az

import tensorflow as tf
from tensorflow import keras


#from tensorflow.keras.utils import plot_model


######################################
#from pmdarima import auto_arima
#from diptest import diptest

In [None]:
class SARIMAModel:
    """Difference a series, fit a SARIMAX model on it and map forecasts back.

    Typical flow: construct with the raw series, call ``log_difference`` or
    ``difference`` (which replace ``self.series`` with the transformed
    series), then call ``fit_predict`` with the raw series as ``df_org``.

    Attributes:
        series: The series the model is fitted on (replaced in place by the
            differencing methods).
        model: The fitted statsmodels results object from the most recent
            ``fit_predict`` call, or ``None`` before any fit.
    """

    def __init__(self, series):
        self.model = None
        self.series = series

    def fit_predict(self, df_org, nfuture, order, seasonal_order, ndiff, logg=True):
        """Fit SARIMAX on all but the last ``nfuture`` points and forecast them.

        The model is fitted on ``self.series[:-nfuture]``. In-sample
        diagnostics are printed/plotted, then predictions over the whole
        sample are plotted and the held-out part is converted back to the
        scale of ``df_org``.

        The back-transformation is positional and only handles one level of
        first differencing: it assumes ``self.series`` is the (log) first
        difference of ``df_org`` with the leading NaN dropped, i.e.
        ``len(df_org) == len(self.series) + 1``.

        Args:
            df_org: Original (undifferenced) series; assumed to be a
                ``pandas.Series`` aligned as described above.
            nfuture: Number of trailing observations held out and forecast.
                Must satisfy ``0 < nfuture < len(self.series)``.
            order: ``(p, d, q)`` passed to ``SARIMAX``.
            seasonal_order: ``(P, D, Q, s)`` passed to ``SARIMAX``.
            ndiff: Unused; kept for interface compatibility.
            logg: If True the series is treated as log-differenced and the
                forecast is exponentiated; otherwise plain differences are
                cumulated on top of the last training level.

        Returns:
            The forecast on the original scale (``pandas.Series``).

        Raises:
            ValueError: If ``nfuture`` leaves no training data or no
                hold-out data.
        """
        df = self.series
        # Fail fast: otherwise nfuture == 0 only crashes after the (slow) fit,
        # when the split marker is looked up past the end of the index.
        if not 0 < nfuture < len(df):
            raise ValueError(
                f"nfuture must satisfy 0 < nfuture < len(series) ({len(df)}), "
                f"got {nfuture}"
            )
        split_index = int(len(df) - nfuture)
        train = df.iloc[:split_index]

        result_sarima = SARIMAX(train, order=order, seasonal_order=seasonal_order).fit()
        self.model = result_sarima  # keep the fit available to callers
        print(result_sarima.summary())

        # In-sample one-step-ahead predictions for the training window.
        train_fitted = result_sarima.get_prediction(
            start=0, end=split_index - 1
        ).predicted_mean
        self._report_train_diagnostics(train.values, train_fitted)

        # Predictions over the full sample: in-sample up to the split,
        # out-of-sample forecasts afterwards.
        full_pred = result_sarima.get_prediction(start=0, end=len(df) - 1)
        fitted_values = full_pred.predicted_mean
        fitted_conf_int = full_pred.conf_int()
        self._plot_fit(df, fitted_values, fitted_conf_int, split_index)

        # Last observed level before the first forecast step (see the
        # alignment assumption in the docstring).
        base_level = df_org.iloc[split_index]
        reverse_series, lower, upper = self._to_original_scale(
            fitted_values.iloc[split_index:],
            fitted_conf_int.iloc[split_index:],
            base_level,
            logg,
        )
        self._plot_forecast(df_org, reverse_series, lower, upper, split_index)

        return reverse_series

    @staticmethod
    def _report_train_diagnostics(actual, fitted):
        """Print/plot goodness-of-fit diagnostics for the training window."""
        # 1. Correlation between actual and fitted values
        correlation = np.corrcoef(actual, fitted)[0, 1]
        print(f"Correlation (Actual vs. Fitted): {correlation:.4f}")

        # 2. Residuals analysis
        residuals = actual - fitted

        # 2a. Q-Q plot for normality check
        plt.figure(figsize=(6, 6))
        probplot(residuals, dist="norm", plot=plt)
        plt.title("Q-Q Plot of Residuals")
        plt.show()

        # 2b. Histogram of residuals with a KDE overlay
        plt.figure(figsize=(6, 4))
        plt.hist(residuals, bins=20, density=True, alpha=0.6, color='b')
        kde = KDEUnivariate(residuals)
        kde.fit()
        x_range = np.linspace(min(residuals), max(residuals), 100)
        plt.plot(x_range, kde.evaluate(x_range), color='red', label="KDE Fit")
        plt.title("Histogram and KDE of Residuals")
        plt.legend()
        plt.show()

        # 3. Ljung-Box test (autocorrelation in residuals) for up to 20 lags;
        # capped so short training samples do not request more lags than
        # there are observations.
        lags = np.arange(1, min(20, len(residuals) - 1) + 1)
        ljung_box_results = sm.stats.acorr_ljungbox(residuals, lags=lags, return_df=True)
        p_values = ljung_box_results['lb_pvalue']

        plt.figure(figsize=(10, 5))
        plt.stem(lags, p_values, basefmt=" ")
        plt.axhline(y=0.05, color='r', linestyle='--', label='Significance Level (0.05)')
        plt.xlabel('Lag')
        plt.ylabel('p-value')
        plt.title('Ljung-Box Test P-values for Different Lags')
        plt.legend()
        plt.show()

        # 4. Error metrics
        mae = np.mean(np.abs(residuals))
        mse = np.mean(residuals**2)
        rmse = np.sqrt(mse)

        print(f"MAE: {mae:.4f}")
        print(f"MSE: {mse:.4f}")
        print(f"RMSE: {rmse:.4f}")

    @staticmethod
    def _plot_fit(df, fitted_values, fitted_conf_int, split_index):
        """Plot the modelled series against predictions and their interval."""
        plt.figure(figsize=(10, 5))
        plt.plot(df, label="Actual", linestyle="dashed", alpha=0.7, color="blue")
        plt.plot(df.index, fitted_values, label="Fitted", color='red')
        plt.fill_between(df.index,
                         fitted_conf_int.iloc[:, 0],  # Lower bound
                         fitted_conf_int.iloc[:, 1],  # Upper bound
                         color="red", alpha=0.2, label="95% CI (Fitted)")
        plt.axvline(x=df.index[split_index], color='k', linestyle='--', label='Train/Test split')
        plt.title("SARIMA Model - Actual & Fitted & Forecast with Confidence Intervals")
        plt.legend()
        plt.show()

    @staticmethod
    def _to_original_scale(diff_forecast, conf_int, base_level, logg):
        """Undo one level of (log) first differencing on a forecast.

        Returns ``(level_forecast, lower, upper)``.

        NOTE(review): the bounds are the per-step interval of each forecast
        *difference* shifted by the base level (as in the original code), not
        a proper interval for the cumulated level forecast. Treat the band as
        indicative only.
        """
        if logg:
            log_base = np.log(base_level)
            level = np.exp(pd.Series(diff_forecast.cumsum() + log_base)).dropna()
            lower = np.exp(conf_int.iloc[:, 0] + log_base)
            upper = np.exp(conf_int.iloc[:, 1] + log_base)
        else:
            # Cumulate forecast differences on top of the last known level so
            # the result has one value per forecast step, like the log branch.
            level = diff_forecast.cumsum() + base_level
            lower = conf_int.iloc[:, 0] + base_level
            upper = conf_int.iloc[:, 1] + base_level
        return level, lower, upper

    @staticmethod
    def _plot_forecast(df_org, reverse_series, lower, upper, split_index):
        """Plot the original series with the back-transformed forecast."""
        # Forecast step k of the differenced series corresponds to original
        # observation split_index + 1 + k (leading NaN of the diff dropped).
        forecast_index = df_org.index[split_index + 1:]

        plt.figure(figsize=(10, 5))
        plt.plot(df_org, label="Actual", linestyle="dashed", alpha=0.7)
        plt.plot(forecast_index, reverse_series, label="Fitted", color='green')
        plt.axvline(x=df_org.index[split_index], color='k', linestyle='--', label='Train/Test split')
        plt.fill_between(forecast_index,
                         lower,  # Lower bound
                         upper,  # Upper bound
                         color="red", alpha=0.2, label="95% CI (Fitted)")
        plt.title("SARIMA Model - Actual vs. Fitted Values")
        plt.legend()
        plt.show()

    def difference(self, ndiff, plot=True):
        """Replace ``self.series`` with its ``ndiff``-period difference.

        Args:
            ndiff: Passed as ``periods`` to ``Series.diff`` (a lag, not a
                differencing order). ``ndiff <= 0`` leaves the series as is.
            plot: If True (default) run ``check_stationarity`` on the result.

        Returns:
            The differenced series with the leading NaN rows dropped, matching
            ``log_difference``.
        """
        # diff(0) would silently return all zeros, so treat it as "no
        # differencing"; drop NaNs so fit_predict gets a clean series.
        diff = self.series.diff(ndiff).dropna() if ndiff > 0 else self.series
        self.series = diff
        if plot:
            self.check_stationarity()
        return diff

    def log_difference(self, ndiff, plot=True):
        """Replace ``self.series`` with the ``ndiff``-period difference of its log.

        Args:
            ndiff: Passed as ``periods`` to ``Series.diff``; ``ndiff <= 0``
                returns the log series without differencing.
            plot: If True (default) run ``check_stationarity`` on the result.

        Returns:
            The log-differenced series with NaN rows dropped.
        """
        log_series = np.log(self.series).dropna()
        diff = log_series.diff(ndiff).dropna() if ndiff > 0 else log_series
        self.series = diff
        if plot:
            self.check_stationarity()
        return diff

    def check_stationarity(self, window=30, lags=30):
        """Plot rolling statistics, run ADF and KPSS tests, and plot ACF/PACF.

        Args:
            window: Rolling window length for the mean / standard deviation.
            lags: Number of lags shown in the ACF and PACF plots.
        """
        series = self.series
        clean = series.dropna()

        # Rolling mean & standard deviation (std on a secondary y-axis)
        rolling_mean = series.rolling(window=window).mean()
        rolling_std = series.rolling(window=window).std()

        fig, ax1 = plt.subplots(figsize=(10, 5))
        ax1.plot(series, label="Original", color="gray", alpha=0.5)
        ax1.plot(rolling_mean, label="Rolling Mean", color="blue")
        ax1.axhline(y=0, color='black', linestyle='dashed', linewidth=0.8)
        ax1.legend(loc="upper left")

        ax2 = ax1.twinx()
        ax2.plot(rolling_std, label="Rolling Std Dev", color="red", linestyle="dashed", alpha=0.7)
        ax2.legend(loc="upper right")

        plt.title("Rolling Mean & Std Deviation")
        plt.show()

        # Augmented Dickey-Fuller test (null hypothesis: unit root)
        adf_result = adfuller(clean)
        print("ADF Test")
        print(f"ADF Statistic: {adf_result[0]:.4f}, p-value: {adf_result[1]:.4f}")
        print("Stationary" if adf_result[1] < 0.05 else "Not Stationary")

        # KPSS test (null hypothesis: stationary around a constant)
        kpss_stat, kpss_p, _, _ = kpss(clean, regression='c')
        print("KPSS Test")
        print(f"KPSS Statistic: {kpss_stat:.4f}, p-value: {kpss_p:.4f}")
        print("Stationary" if kpss_p > 0.05 else "Not Stationary")

        # ACF and PACF: pass the axes explicitly, otherwise statsmodels opens
        # its own figure and the sized one is shown empty.
        _, acf_ax = plt.subplots(figsize=(12, 5))
        plot_acf(clean, lags=lags, ax=acf_ax)
        acf_ax.set_title("Autocorrelation Function (ACF)")
        plt.show()

        _, pacf_ax = plt.subplots(figsize=(12, 5))
        plot_pacf(clean, lags=lags, ax=pacf_ax)
        pacf_ax.set_title("Partial Autocorrelation Function (PACF)")
        plt.show()