In [1]:
from datetime import date
import random
import time
import yfinance as yf
import pandas as pd

import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

from numpy.fft import fft, ifft, fftshift
import numpy as np
from numpy import log, sqrt, exp


from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, mean_absolute_percentage_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.mixture import GaussianMixture
from sklearn.linear_model import LinearRegression

from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.stats.diagnostic import acorr_ljungbox

import scipy.stats as stats
from scipy.stats import probplot, laplace, norm, t, poisson
from scipy.linalg import solve_banded
from scipy.optimize import minimize, differential_evolution
from scipy.integrate import quad
from scipy.special import roots_laguerre
from scipy.interpolate import interp1d
from scipy.sparse import diags, kron, identity, csr_matrix
from scipy.sparse.linalg import spsolve
from scipy.stats import multivariate_normal, kstest

import statsmodels.api as sm
from statsmodels.nonparametric.kde import KDEUnivariate
from statsmodels.tsa.stattools import adfuller, kpss
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.arima_process import ArmaProcess

import pymc as pm
import arviz as az

#import aesara.tensor as at

from tensorflow import keras
#from tensorflow.keras.utils import plot_model

#import pyswarms as ps

######################################
#from pmdarima import auto_arima
#from diptest import diptest



In [2]:
class FactorModels:
    def __init__(self, rfr, log_return_pf, log_return_index):
        """
        Parameters:
        - rfr: scalar, annualized or periodic risk-free rate
        - log_return_index: Series, market index log returns
        - log_return_pf: DataFrame, asset log returns (columns = assets)
        """
        self.rfr = rfr
        self.trading_days = 252  #
        self.lam = 2  #  # placeholder for other models like Black-Litterman
        self.fit_type = None

        log_return_pf, log_return_index = self.clean_df(log_return_pf, log_return_index)
        self.log_return_pf =  log_return_pf
        self.log_return_index = log_return_index

    def clean_df(self, df1, df2):
        # Assume df1 and df2 are your time-indexed DataFrames
        # Ensure the index is datetime
        df1.index = pd.to_datetime(df1.index)
        df2.index = pd.to_datetime(df2.index)

        # Step 1: Align on common time index (inner join on index)
        common_index = df1.index.intersection(df2.index)
        df1_common = df1.loc[common_index]
        df2_common = df2.loc[common_index]

        # Step 2: Concatenate to identify rows with any NaNs across both
        combined = pd.concat([df1_common, df2_common], axis=1)

        # Step 3: Drop any rows with NaNs across both
        combined_clean = combined.dropna()

        # Step 4: Split back into two DataFrames with identical indices
        n1 = df1.shape[1]
        df1_clean = combined_clean.iloc[:, :n1]
        df2_clean = combined_clean.iloc[:, n1:]

        # Optional: verify they have the same index
        assert all(df1_clean.index == df2_clean.index)

        return df1_clean, df2_clean
#########################################################################################

    def CAPM_OLS(self, fit_type):
        """
        Estimate CAPM expected returns using OLS regression and print summary for each asset.

        Each asset's log returns are regressed on the index log returns
        (with an intercept). The slope is the asset's beta, the intercept its
        alpha, and the CAPM return is rfr + beta * (market mean - rfr).

        Parameters:
        - fit_type: 'norm' or 't'; stored on the instance and used by
          `fit_marginal` to estimate the market mean.

        Returns:
        - DataFrame indexed by asset with columns 'alpha', 'beta', 'CAPM_return'
        - Prints statsmodels summary for each regression

        Raises:
        - ValueError (from `fit_marginal`) if fit_type is not 'norm' or 't'.

        NOTE(review): the market mean comes from `fit_marginal`, which
        annualises it with `self.trading_days`, while alpha and beta are
        estimated on the un-annualised returns. `self.rfr` presumably has to be
        an annual rate for the CAPM formula to be consistent - confirm.
        """
        self.fit_type = fit_type

        # Only the market mean is needed here; the portfolio marginals are
        # not fitted because nothing in this method uses them.
        # [0] selects the mean vector, the second [0] its first column.
        market_expected_return = self.fit_marginal(self.log_return_index)[0][0]

        X = sm.add_constant(self.log_return_index.values)  # Adds intercept term

        results = {}
        for asset in self.log_return_pf.columns:
            Y = self.log_return_pf[asset].values
            res = sm.OLS(Y, X).fit()

            # params[0] is the intercept added above, params[1] the index slope
            alpha = res.params[0]
            beta = res.params[1]
            capm_return = self.rfr + beta * (market_expected_return - self.rfr)

            print(f"--- {asset} ---")
            print(res.summary())

            results[asset] = {
                'alpha': alpha,
                'beta': beta,
                'CAPM_return': capm_return
            }

        return pd.DataFrame(results).T


    def CAPM_GLS(self, fit_type):
        """
        Estimate CAPM expected returns using GLS regression and print summary for each asset.

        For every asset a first-pass OLS fit supplies the residual variance,
        which is then used as a constant error variance for the GLS fit of the
        asset's log returns on the index log returns (with an intercept).

        Parameters:
        - fit_type: 'norm' or 't'; stored on the instance and used by
          `fit_marginal` to estimate the market mean.

        Returns:
        - DataFrame indexed by asset with columns 'alpha', 'beta', 'CAPM_return'
        - Prints statsmodels GLS summary for each regression

        Raises:
        - ValueError (from `fit_marginal`) if fit_type is not 'norm' or 't'.

        NOTE(review): with a constant error variance the GLS coefficients
        coincide with OLS; a per-observation variance or a full covariance
        matrix would be needed for a genuinely different estimate.
        """
        self.fit_type = fit_type

        # Only the market mean is needed; [0] is the mean vector, [0][0] its
        # first column.
        market_expected_return = self.fit_marginal(self.log_return_index)[0][0]

        X = sm.add_constant(self.log_return_index.values)  # Add intercept

        results = {}
        for asset in self.log_return_pf.columns:
            Y = self.log_return_pf[asset].values

            # Variance of the OLS residuals as a proxy for the error variance
            resid_var = np.var(sm.OLS(Y, X).fit().resid)

            # A scalar sigma stands for the diagonal matrix resid_var * I, so
            # no dense n x n array has to be built and factorised.
            gls_model = sm.GLS(Y, X, sigma=resid_var).fit()

            alpha = gls_model.params[0]
            beta = gls_model.params[1]
            capm_return = self.rfr + beta * (market_expected_return - self.rfr)

            print(f"--- GLS Summary for {asset} ---")
            print(gls_model.summary())

            results[asset] = {
                'alpha': alpha,
                'beta': beta,
                'CAPM_return': capm_return
            }

        return pd.DataFrame(results).T
##########################################################################################
    def log_return_val(self, pf):
        """
        Compute log returns for each stock (ignoring NaNs),
        fit normal and t-distributions, and plot boxplots and KS stats.
        """
        log_returns = pd.DataFrame()

        for col in self.pf.columns:
            series = pd.to_numeric(pf[col], errors='coerce')
            first_valid = series[series > 0].first_valid_index()
            if first_valid is not None:
                trimmed = series.loc[first_valid:]
                log_ret = np.log(trimmed / trimmed.shift(1)).dropna()
                log_returns[col] = log_ret

        return log_returns

    def fit_marginal(self, log_returns):
        """
        Marginal (univariate) fit of log_returns based on self.fit_type ('norm' or 't').
        Returns:
            mu_vector: daily means,
            cov_matrix: daily diagonal covariance matrix,
            ks_stats: KS statistics per asset
        """
        mu_vector = []
        std_vector = []
        ks_stats = {}

        for col in log_returns.columns:
            data = log_returns[col].dropna()

            if self.fit_type == "norm":
                mu, std = stats.norm.fit(data)
                ks = stats.kstest(data, 'norm', args=(mu, std)).statistic
                mu_vector.append(mu)
                std_vector.append(std)
                ks_stats[col] = ks

            elif self.fit_type == "t":
                df_t, loc_t, scale_t = stats.t.fit(data)
                ks = stats.kstest(data, 't', args=(df_t, loc_t, scale_t)).statistic
                mu_vector.append(loc_t)
                std = scale_t * np.sqrt(df_t / (df_t - 2)) if df_t > 2 else np.nan
                std_vector.append(std)
                ks_stats[col] = ks

            else:
                raise ValueError("fit_type must be 'norm' or 't'")

        # Convert to numpy arrays
        mu_vector = np.array(mu_vector)
        std_vector = np.array(std_vector)

        mu_annual = mu_vector * self.trading_days
        std_annual = std_vector * np.sqrt(self.trading_days)

        if self.fit_type == 'norm':
            args = mu_annual, std_annual
        elif self.fit_type == 't':
            args = mu_annual, std_annual, df_t

        #self.plot_log_return(list(log_returns.columns), list(ks_stats.values()), mu_annual, np.sqrt(np.diag(cov_annual)), corr_matrix)

        return args

    def plot_log_return(self, asset_names, ks_values, mu_annual, sigma_annual, log_corr):
        """
        Draw and show a three-panel diagnostic figure for fitted log returns.

        Panels, top to bottom:
        1. mu_annual with +/- sigma_annual error bars (both multiplied by 100),
        2. bar chart of ks_values with reference lines at 0.05 and 0.10,
        3. annotated heatmap of log_corr labelled with asset_names.

        The current `self.fit_type` is shown in the first two panel titles.
        """
        fig, (ax_mean, ax_ks, ax_corr) = plt.subplots(3, 1, figsize=(14, 16))

        # --- Panel 1: annualized mean with one-standard-deviation bars ---
        ax_mean.errorbar(
            asset_names,
            mu_annual*100,
            yerr=sigma_annual*100,
            fmt='o',
            capsize=5,
            color='dodgerblue',
            label="Annual Mean ± 1 Std",
        )
        ax_mean.set_title(f"Annualized Mean ± 1 Std Dev ({self.fit_type})")
        ax_mean.set_ylabel("Annual Return")
        ax_mean.grid(True)
        ax_mean.legend()

        # --- Panel 2: KS statistic per asset, with fit-quality guides ---
        ax_ks.bar(asset_names, ks_values, color='mediumpurple')
        reference_lines = (
            (0.05, 'green', 'Excellent Fit (<0.05)'),
            (0.10, 'orange', 'Acceptable Fit (<0.10)'),
        )
        for level, line_color, line_label in reference_lines:
            ax_ks.axhline(level, color=line_color, linestyle='--', label=line_label)
        ax_ks.set_title(f"Marginal KS Statistics ({self.fit_type})")
        ax_ks.set_ylabel("KS Statistic")
        ax_ks.legend()
        ax_ks.grid(True)

        # --- Panel 3: correlation heatmap ---
        heatmap_options = dict(
            cmap="coolwarm",
            annot=True,
            fmt=".2f",
            center=0,
            xticklabels=asset_names,
            yticklabels=asset_names,
        )
        sns.heatmap(log_corr, ax=ax_corr, **heatmap_options)
        ax_corr.set_title("Pearson Correlation Matrix of Log Returns")

        plt.tight_layout()
        plt.show()


