**Dependencies & Imports**

In [None]:
!pip install pandas_market_calendars ripser persim

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import MDS
import statsmodels.api as sm
from scipy.cluster.hierarchy import linkage, dendrogram
import networkx as nx
from matplotlib.colors import Normalize
from scipy.spatial.distance import squareform
from tqdm import tqdm
import yfinance as yf
import pandas_datareader.data as web
import pandas_market_calendars as mcal
from datetime import datetime
import requests
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.cm as cm
from ripser import ripser
import networkx as nx
from numpy.polynomial.polynomial import Polynomial
from persim import plot_diagrams
from persim import PersLandscapeExact
from scipy.stats import spearmanr, kendalltau
from sklearn.decomposition import PCA
from persim import PersistenceImager
from persim.landscapes import PersLandscapeExact
from ripser import ripser
from persim import plot_diagrams
from sklearn.preprocessing import StandardScaler

**Data Processing**

In [None]:
# Load the long-format price file and pivot it into a date x ticker price matrix.
prices_raw = pd.read_csv('/content/prices.csv')
prices_raw['date'] = pd.to_datetime(prices_raw['date'])

prices_df = (
    prices_raw
    .pivot_table(index='date', columns='TICKER', values='PRC', aggfunc='first')
    .sort_index(axis=1)
    .abs()  # presumably PRC uses a sign flag (e.g. CRSP bid/ask midpoints) -- TODO confirm
)

# Membership mask: True where a ticker is listed in components_df for that date.
# NOTE(review): components_df is not created in any visible cell of this notebook;
# it must exist (one row per date, with an iterable 'tickers' column) before this runs.
mask = pd.DataFrame(False, index=prices_df.index, columns=prices_df.columns)
known_tickers = set(prices_df.columns)

for date, row in components_df.iterrows():
    membership_date = pd.Timestamp(date)
    # Assigning through .loc with a row label that is absent from the index appends
    # a new row to the mask instead of failing, so dates without prices are skipped.
    if membership_date not in mask.index:
        continue
    valid = list(set(row['tickers']) & known_tickers)
    mask.loc[membership_date, valid] = True

# Blank out prices on dates where the ticker is not flagged as a member.
prices_df = prices_df.where(mask)

# Append the S&P 500 close as an extra column, aligned on the price dates.
spx = yf.download('^GSPC', start='1996-01-01', end='2002-12-31')['Close']
prices_df.index = pd.to_datetime(prices_df.index)
spx.index = pd.to_datetime(spx.index)
prices_df = prices_df.join(spx, how='left')

prices_df.head()

In [None]:
# Daily log returns of the index; this is the regression target.
spx_log_returns = np.log(spx / spx.shift(1)).dropna()

# prices_df had the index close joined onto it in the data-processing cell. Drop that
# column here, otherwise the target would appear among its own regressors.
index_columns = [spx.name] if isinstance(spx, pd.Series) else list(spx.columns)
component_prices = prices_df.drop(columns=index_columns, errors='ignore')
component_log_returns = np.log(component_prices / component_prices.shift(1))

# Keep only the dates present in both series: selecting with .loc on labels that are
# missing from the price index would raise KeyError.
common_dates = spx_log_returns.index.intersection(component_log_returns.index)
spx_log_returns = spx_log_returns.loc[common_dates]
# Returns that are undefined (masked or missing prices) are entered as zero.
component_log_returns = component_log_returns.loc[common_dates].fillna(0)

X = component_log_returns
y = spx_log_returns

# No constant column is added to X, so the model is fitted without an intercept.
model_no_alpha = sm.OLS(y, X).fit()
print(model_no_alpha.summary())

# In-sample fitted values, flattened to 1-D arrays for side-by-side comparison.
predicted = model_no_alpha.predict(X).to_numpy().flatten()
y_flat = y.to_numpy().flatten()

comparison = pd.DataFrame({
    'Actual': y_flat,
    'Synthetic': predicted
})

# Log returns are additive, so the running sum is the cumulative log return.
ax = comparison.cumsum().plot(title='Cumulative S&P 500 Log Return: Actual vs. Synthetic')
ax.set_xlabel("Day")
ax.set_ylabel("Cumulative Log Return")
plt.show()

In [None]:
# Fitted coefficients of the no-intercept OLS model, one per regressor column of X.
weights = model_no_alpha.params
print(weights)

In [None]:
# Rescale the fitted coefficients so that they add up to one.
normalized_weights = weights.div(weights.sum())
print(normalized_weights)

**Correlation & Distance Matrices**

In [None]:
def rolling_correlation_array(returns, synthetic, window):
    """Correlation matrices over every full rolling window of `window` rows.

    The values of `synthetic` are appended to those of `returns` as the last
    column, so each matrix has one more row/column than `returns` has columns.

    Returns
    -------
    corr_array : ndarray of shape (len(returns) - window + 1, k + 1, k + 1),
        where k is the number of columns of `returns`.
    dates : `returns.index[window - 1:]`, i.e. the index label of the last row
        of each window.
    """
    side = returns.shape[1] + 1
    corr_array = np.zeros((len(returns) - window + 1, side, side))

    # Synthetic series goes last so it can be addressed with index -1.
    stacked = np.column_stack([returns.values, synthetic.values])

    for start in range(corr_array.shape[0]):
        # rowvar=False: columns are the variables, rows the observations.
        corr_array[start] = np.corrcoef(stacked[start:start + window, :], rowvar=False)

    return corr_array, returns.index[window-1:]

window = 126

# NOTE(review): `component_returns` and `synthetic_returns` were not assigned in any
# visible cell, so this cell raised NameError on a fresh kernel. They are bound here to
# the regression inputs and fitted values from the OLS cell -- confirm this is the
# intended lineage.
component_returns = component_log_returns
synthetic_returns = pd.Series(predicted, index=spx_log_returns.index, name='synthetic')

corr_array, dates = rolling_correlation_array(component_returns, synthetic_returns, window)

# Full correlation matrix of the first window.
print(corr_array[0, :, :])

# First component column versus the synthetic series (stored last), first window.
print(corr_array[0, 0, -1])

In [None]:
def correlation_to_distance(corr_matrix):
    """Map a correlation matrix to distances d = sqrt(2 * (1 - rho)).

    NaN correlations are replaced by 0 before the conversion, which gives them
    a distance of sqrt(2). Correlations are clipped to [-1, 1] so that values
    pushed slightly past 1 by floating-point rounding give a distance of 0
    instead of NaN. The diagonal of the result is set to exactly 0.

    The input is not modified; a new array is returned.
    """
    corr_matrix = np.nan_to_num(corr_matrix, nan=0)

    # Guard the square root against negative arguments caused by rounding.
    corr_matrix = np.clip(corr_matrix, -1.0, 1.0)

    distance_matrix = np.sqrt(2 * (1 - corr_matrix))

    np.fill_diagonal(distance_matrix, 0)

    return distance_matrix

In [None]:
# One distance matrix per rolling-window correlation matrix, stacked along axis 0.
distance_array = np.array(list(map(correlation_to_distance, corr_array)))

**Traditional Data Exploration**

In [None]:
# Average correlation among the component columns for each window; the last row/column
# (the synthetic series) is excluded and the unit diagonal is included, as before.
# np.corrcoef yields NaN for a column that is constant within a window, and a plain
# mean would turn the whole window's average into NaN, so NaN entries are skipped.
mean_corr = np.nanmean(corr_array[:, :-1, :-1], axis=(1, 2))
print(mean_corr)

In [None]:
# For every window: correlation of each component column with the last column,
# which holds the synthetic series appended by rolling_correlation_array.
stock_vs_index_corr = corr_array[:, :-1, -1]
print(stock_vs_index_corr)

In [None]:
def summarize_3d_array(arr, name):
    """Describe per-window statistics of a (windows, rows, cols) array.

    For each slice along axis 0 the mean, standard deviation, maximum and
    minimum over the remaining two axes are computed; the returned frame is
    the transposed `describe()` of those four per-window series. The row
    labels keep the `*_corr` suffix regardless of what `arr` contains.
    `name` is accepted but not used.
    """
    reducers = (
        ("mean_corr", np.mean),
        ("std_corr", np.std),
        ("max_corr", np.max),
        ("min_corr", np.min),
    )
    per_window = pd.DataFrame(
        {label: reduce_fn(arr, axis=(1,2)) for label, reduce_fn in reducers},
        index=range(arr.shape[0]),
    )
    return per_window.describe().T

# Descriptive statistics for the 2-D inputs and for the rolling correlation/distance arrays.
# NOTE(review): spearman_tests, kendall_tests, pca_results and missing_values are computed
# here but not displayed or used by any later visible cell.
datasets = {"components": components_df, "prices": prices_df}

summary_stats = {label: frame.describe().T for label, frame in datasets.items()}
missing_values = {label: frame.isna().sum() for label, frame in datasets.items()}

# Pool the strictly-upper-triangle entries (each pair once) across all windows.
upper_triu = np.triu_indices(corr_array.shape[1], k=1)
pair_rows, pair_cols = upper_triu
all_corrs = corr_array[:, pair_rows, pair_cols].ravel()
corr_summary_flat = pd.Series(all_corrs).describe()

dist_summary = summarize_3d_array(distance_array, "distance")

spearman_tests, kendall_tests = {}, {}

# Rank correlations only make sense for frames with more than one column.
for name, df in datasets.items():
    if df.shape[1] <= 1:
        continue
    spearman_tests[name] = df.corr(method='spearman')
    kendall_tests[name] = df.corr(method='kendall')

pca_results = {}
for name, df in datasets.items():
    if df.shape[1] <= 1:
        continue
    # PCA is fitted on complete rows only; frames with no complete row are skipped.
    df_clean = df.dropna()
    if df_clean.empty:
        continue
    pca = PCA().fit(df_clean)
    pca_results[name] = dict(
        explained_variance=pca.explained_variance_ratio_,
        components=pca.components_,
    )

print("Summary stats for 2D datasets:")
for name, stats in summary_stats.items():
    print(f"\n{name}:\n", stats)

print("\nFlattened correlation summary:")
print(corr_summary_flat)

print("\nDistance 3D summary:")
print(dist_summary)

In [None]:
# Number of index members listed for each date. The data-processing cell reads the
# members of a date from the 'tickers' column as a collection, so the size of that
# collection is the member count; counting non-null cells per row would only count columns.
components_per_day = components_df['tickers'].map(len)
components_per_day.index = pd.to_datetime(components_df.index)
components_monthly = components_per_day.resample('M').mean()

plt.figure(figsize=(12,4))
components_monthly.plot()
plt.title("Average Number of S&P 500 Components per Month")
plt.xlabel("Month")
plt.ylabel("Number of Components")
plt.grid(True)
plt.show()

# Count, per window, the pairs above 0.5 in the strict upper triangle (each pair once).
n_windows = corr_array.shape[0]
high_corr_counts = [np.sum(np.triu(window_corr, k=1) > 0.5) for window_corr in corr_array]
# End date of every rolling window. `returns` was never defined in the notebook;
# corr_array was built from component_returns, whose index supplies these dates.
window_dates = component_returns.index[window-1:]

plt.figure(figsize=(12,4))
plt.plot(window_dates, high_corr_counts)
plt.title("Number of Stock Pairs with Correlation > 0.5 Over Time")
plt.xlabel("Date")
plt.ylabel("Number of Pairs")
plt.grid(True)
plt.show()

plt.figure(figsize=(8,4))
sns.histplot(corr_array[0][np.triu_indices(corr_array.shape[1], k=1)], bins=50, kde=True)
plt.title("Distribution of Correlations (First Rolling Window)")
plt.xlabel("Correlation")
plt.ylabel("Count")
plt.show()

plt.figure(figsize=(8,4))
sns.histplot(distance_array[0][np.triu_indices(distance_array.shape[1], k=1)], bins=50, kde=True)
plt.title("Distribution of Distances (First Rolling Window)")
plt.xlabel("Distance")
plt.ylabel("Count")
plt.show()

**Rolling Windows**

In [None]:
# Weekly (Friday) snapshots: for each week, look up the latest rolling window ending on
# or before that date, print two summary numbers and draw histograms of its pairwise
# correlations and distances. Assumes window_dates is sorted (required by searchsorted).
window_size = 252
start_date = '1996-01-01'
end_date = '2002-12-31'

dates = pd.date_range(start=start_date, end=end_date, freq='W-FRI')
n_windows = len(dates)

for date in dates:
    # Index of the last entry of window_dates that is <= date (-1 if there is none).
    window_idx = np.searchsorted(window_dates, date, side='right') - 1
    if not (0 <= window_idx < len(corr_array)):
        continue

    corr_mat, dist_mat = corr_array[window_idx], distance_array[window_idx]

    upper_triu = np.triu_indices(corr_mat.shape[0], k=1)
    pair_corrs = corr_mat[upper_triu]
    pair_dists = dist_mat[upper_triu]

    avg_corr = np.nanmean(pair_corrs)
    high_corr_count = np.sum(pair_corrs > 0.5)
    week_label = date.date()

    print(f"Week ending {week_label}: Avg Corr={avg_corr:.3f}, High Corr Pairs={high_corr_count}")

    # Same figure layout for both quantities.
    for values, quantity in ((pair_corrs, "Correlation"), (pair_dists, "Distance")):
        plt.figure(figsize=(6,3))
        sns.histplot(values, bins=30, kde=True)
        plt.title(f"{quantity} Distribution: Week ending {week_label}")
        plt.xlabel(quantity)
        plt.ylabel("Count")
        plt.show()

In [None]:
# Overlay kernel-density estimates of the pairwise values, one curve for every fourth
# entry of window_dates, first for correlations and then for distances.
kde_panels = (
    (corr_array, "Correlation Distributions Across Weeks", "Correlation"),
    (distance_array, "Distance Distributions Across Weeks", "Distance"),
)

for matrices, panel_title, quantity in kde_panels:
    plt.figure(figsize=(12,6))
    for i in range(0, len(window_dates), 4):
        matrix = matrices[i]
        # Strict upper triangle: every pair once, diagonal excluded.
        pair_values = matrix[np.triu_indices(matrix.shape[0], k=1)]
        sns.kdeplot(pair_values, label=str(window_dates[i].date()), alpha=0.5)

    plt.title(panel_title)
    plt.xlabel(quantity)
    plt.ylabel("Density")
    plt.legend(title="Week Ending", bbox_to_anchor=(1.05, 1), loc='upper left', ncol=1, fontsize=8)
    plt.show()

In [None]:
# Unlabelled KDE overlays: one curve per Friday, taken from the latest rolling window
# ending on or before that date.
window_size = 252
start_date = '1996-01-01'
end_date = '2002-12-31'

dates = pd.date_range(start=start_date, end=end_date, freq='W-FRI')

weekly_panels = (
    (corr_array, "Correlation Distributions Across Weekly Rolling Windows", "Correlation"),
    (distance_array, "Distance Distributions Across Weekly Rolling Windows", "Distance"),
)

for matrices, panel_title, quantity in weekly_panels:
    plt.figure(figsize=(12,5))
    for date in dates:
        # Index of the last entry of window_dates that is <= date (-1 if there is none).
        window_idx = np.searchsorted(window_dates, date, side='right') - 1
        if window_idx < 0 or window_idx >= len(matrices):
            continue

        matrix = matrices[window_idx]
        upper_triu = np.triu_indices(matrix.shape[0], k=1)
        sns.kdeplot(matrix[upper_triu], alpha=0.3, linewidth=1)

    plt.title(panel_title)
    plt.xlabel(quantity)
    plt.ylabel("Density")
    plt.grid(True)
    plt.show()

**Vietoris-Rips Complexes, Betti Numbers, \& Persistence Diagrams**

In [None]:
def compute_betti_numbers(dist_mat, maxdim=2):
    """Run ripser on a precomputed distance matrix and count finite bars.

    Returns a list with, for each homology dimension 0..maxdim, the number of
    diagram points whose death value is not infinite, together with the
    diagrams themselves.
    """
    diagrams = ripser(dist_mat, distance_matrix=True, maxdim=maxdim)['dgms']
    betti_counts = []
    for dgm in diagrams:
        # Column 1 holds the death values; bars that never die are excluded.
        finite_bars = dgm[~np.isinf(dgm[:,1])]
        betti_counts.append(len(finite_bars))
    return betti_counts, diagrams

# Weekly persistent-homology counts: for each Friday, use the latest rolling window
# ending on or before that date and record the finite-bar counts for dimensions 0-2.
window_size = 252
start_date = '1996-01-01'
end_date = '2002-12-31'

dates = pd.date_range(start=start_date, end=end_date, freq='W-FRI')

betti_results = []

for date in dates:
    window_idx = np.searchsorted(window_dates, date, side='right') - 1
    if not (0 <= window_idx < len(distance_array)):
        continue

    dist_mat = distance_array[window_idx]
    betti_counts, diagrams = compute_betti_numbers(dist_mat, maxdim=2)
    b0, b1, b2 = betti_counts[0], betti_counts[1], betti_counts[2]
    betti_results.append({'date': date, 'betti_0': b0, 'betti_1': b1, 'betti_2': b2})

betti_df = pd.DataFrame(betti_results).set_index('date')

plt.figure(figsize=(12,5))
for column, legend_label in (('betti_0', 'Betti-0'), ('betti_1', 'Betti-1'), ('betti_2', 'Betti-2')):
    plt.plot(betti_df.index, betti_df[column], label=legend_label)
plt.title("Betti Numbers Over Time (Vietoris-Rips Complexes)")
plt.xlabel("Date")
plt.ylabel("Count")
plt.legend()
plt.grid(True)
plt.show()

**Persistence Landscapes**

In [None]:
def compute_persistence_landscape(dist_mat, maxdim=1):
    """Build one exact persistence landscape per homology dimension.

    ripser is run on `dist_mat` as a precomputed distance matrix. For each
    dimension 0..maxdim the corresponding diagram is wrapped in a one-element
    list and passed to PersLandscapeExact. Returns a dict keyed
    'landscape_dim<k>' together with the raw diagrams.
    """
    diagrams = ripser(dist_mat, distance_matrix=True, maxdim=maxdim)['dgms']
    landscapes = {
        f'landscape_dim{dim}': PersLandscapeExact(dgms=[diagrams[dim]])
        for dim in range(maxdim + 1)
    }
    return landscapes, diagrams

# Weekly landscapes: for each Friday, compute diagrams and landscapes (dimensions 0-1)
# from the latest rolling window ending on or before that date.
window_size = 252
dates = pd.date_range(start='1996-01-01', end='2002-12-31', freq='W-FRI')

landscape_results = []

for date in dates:
    window_idx = np.searchsorted(window_dates, date, side='right') - 1
    if 0 <= window_idx < len(distance_array):
        dist_mat = distance_array[window_idx]
        landscapes, diagrams = compute_persistence_landscape(dist_mat, maxdim=1)
        landscape_results.append({'date': date, 'landscapes': landscapes, 'diagrams': diagrams})

from persim.landscapes import PersLandscapeExact, plot_landscape_simple
import matplotlib.pyplot as plt

# Plot the dimension-1 landscape of the first week that produced a result.
first_week = landscape_results[0]
dgms = first_week['diagrams']

has_dim1_features = len(dgms) > 1 and dgms[1].size > 0
if not has_dim1_features:
    print("No dimension-1 features for this week.")
else:
    landscape_dim1 = PersLandscapeExact(dgms=dgms, hom_deg=1)

    plt.figure(figsize=(10,5))
    plot_landscape_simple(landscape_dim1)
    plt.title(f'Persistence Landscape (Dimension 1) - Week ending {first_week["date"].date()}')
    plt.show()

In [None]:
# Draw the dimension-1 landscape of every week that has dimension-1 features on one figure.
plt.figure(figsize=(12,6))

for week_data in landscape_results:
    dgms = week_data['diagrams']
    if len(dgms) <= 1 or dgms[1].size == 0:
        continue
    landscape = PersLandscapeExact(dgms=dgms, hom_deg=1)
    plot_landscape_simple(landscape, alpha=0.3)

plt.title("Overlay of Dimension-1 Persistence Landscapes Across Weeks")
plt.xlabel("Filtration Value")
plt.ylabel("Landscape Value")
plt.grid(True)
plt.show()

**Persistence Images**

In [None]:
# One persistence image per week from its dimension-1 diagram. The imager is re-fitted
# to each week's diagram before transforming it.
pi = PersistenceImager(pixel_size=0.1)

persistence_images = []

for week_data in landscape_results:
    dgms = week_data['diagrams']
    if len(dgms) <= 1 or dgms[1].size == 0:
        continue
    diagram = dgms[1]
    pi.fit(diagram)
    persistence_images.append({'date': week_data['date'], 'image': pi.transform(diagram)})

# Show the first image that was produced.
first_image = persistence_images[0]

plt.figure(figsize=(6,5))
plt.imshow(first_image['image'], origin='lower', cmap='viridis', aspect='auto')
plt.colorbar(label='Intensity')
plt.title(f'Persistence Image (Dimension 1) - Week ending {first_image["date"].date()}')
plt.xlabel('X-axis (birth/persistence)')
plt.ylabel('Y-axis (persistence)')
plt.show()

In [None]:
# Collect every non-empty dimension-1 diagram, fit a single imager on all of them
# stacked together, then transform each week with that one fitted imager.
dim1_diagrams = []
for week in landscape_results:
    week_dgms = week['diagrams']
    if len(week_dgms) > 1 and week_dgms[1].size > 0:
        dim1_diagrams.append(week_dgms[1])

pi = PersistenceImager(pixel_size=0.1)
pi.fit(np.vstack(dim1_diagrams))

all_images = [pi.transform(diagram) for diagram in dim1_diagrams]
all_images_array = np.stack(all_images)

# Semi-transparent overlay of all weekly images.
plt.figure(figsize=(8,6))
for img in all_images_array:
    plt.imshow(img, origin='lower', cmap='viridis', alpha=0.1, aspect='auto')
plt.colorbar(label='Intensity')
plt.title('Overlay of Persistence Images (Dimension 1) Across All Weeks')
plt.xlabel('X-axis (birth/persistence)')
plt.ylabel('Y-axis (persistence)')
plt.show()

# Pixel-wise mean over weeks.
avg_image = all_images_array.mean(axis=0)
plt.figure(figsize=(8,6))
plt.imshow(avg_image, origin='lower', cmap='viridis', aspect='auto')
plt.colorbar(label='Average Intensity')
plt.title('Average Persistence Image (Dimension 1) Across All Weeks')
plt.xlabel('X-axis (birth/persistence)')
plt.ylabel('Y-axis (persistence)')
plt.show()

**Final Clustering & Results**

In [None]:
# Assemble one feature row per week, standardise, cluster with k-means and show the
# clusters in the plane of the first two principal components.
feature_list = []

for i, week in enumerate(landscape_results):
    features = {}

    # The weekly records in landscape_results only hold 'date', 'landscapes' and
    # 'diagrams', so looking the Betti counts up there always produced NaN. They are
    # stored in betti_df, indexed by the same weekly dates; fall back to that frame.
    week_date = week['date']
    for betti_col in ('betti_0', 'betti_1', 'betti_2'):
        if betti_col in week:
            features[betti_col] = week[betti_col]
        elif week_date in betti_df.index:
            features[betti_col] = betti_df.at[week_date, betti_col]
        else:
            features[betti_col] = np.nan

    # Two scalar summaries of the dimension-1 landscape.
    pl = week['landscapes'].get('landscape_dim1', None)
    if pl:
        features['pl_supnorm'] = pl.sup_norm()
        features['pl_l2norm'] = pl.p_norm(p=2)
    else:
        features['pl_supnorm'] = np.nan
        features['pl_l2norm'] = np.nan

    # NOTE(review): no visible cell stores 'persistence_image_dim1' on these records,
    # so as written no pixel features are added -- confirm which images were intended.
    if 'persistence_image_dim1' in week:
        img = week['persistence_image_dim1']
        features.update({f'pi_{j}': val for j, val in enumerate(img.flatten())})

    feature_list.append(features)

feature_df = pd.DataFrame(feature_list, index=[week['date'] for week in landscape_results])
feature_df = feature_df.fillna(0)

scaler = StandardScaler()
X_scaled = scaler.fit_transform(feature_df)

# PCA is used for the 2-D display only; k-means runs on the full scaled feature set.
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

kmeans = KMeans(n_clusters=3, random_state=42)
clusters = kmeans.fit_predict(X_scaled)

plt.figure(figsize=(10,6))
sns.scatterplot(x=X_pca[:,0], y=X_pca[:,1], hue=clusters, palette='Set2')
plt.title('Clustering of Weeks Based on Topological Features')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.legend(title='Cluster')
plt.show()

feature_df['cluster'] = clusters

# Per-cluster mean / max / min / std of every feature.
for c in sorted(feature_df['cluster'].unique()):
    print(f"\nCluster {c} Summary:")
    cluster_data = feature_df[feature_df['cluster'] == c].drop(columns='cluster')
    summary_stats = cluster_data.agg(['mean', 'max', 'min', 'std']).transpose()
    print(summary_stats)

In [None]:
# Cluster the weeks on the three Betti counts alone and mark them on the synthetic index.
features_df = betti_df[['betti_0', 'betti_1', 'betti_2']].copy()

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_df)

kmeans = KMeans(n_clusters=3, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

features_df['cluster'] = cluster_labels
features_df['date'] = features_df.index

synthetic_index = pd.Series(predicted, index=spx_log_returns.index)
# `predicted` holds fitted *log* returns (the model was fitted to log returns), so the
# index level is exp(cumulative sum); compounding them as (1 + r) treats them as simple returns.
cumulative_index = np.exp(synthetic_index.cumsum())

# Last value of each week ending Friday, then aligned to the clustered weekly dates.
cumulative_index_weekly = cumulative_index.resample('W-FRI').last()

cumulative_index_weekly = cumulative_index_weekly.reindex(features_df['date'])

plt.figure(figsize=(15,6))
plt.plot(cumulative_index_weekly.index, cumulative_index_weekly.values, color='blue', label='Synthetic Index', linewidth=2)

# One fixed colour per cluster label (three clusters, matching n_clusters above).
cluster_colors = {0: 'green', 1: 'orange', 2: 'red'}
for cluster in np.unique(cluster_labels):
    cluster_dates = features_df[features_df['cluster'] == cluster]['date']
    plt.scatter(cluster_dates,
                cumulative_index_weekly.loc[cluster_dates],
                color=cluster_colors[cluster],
                alpha=0.5,
                label=f'Cluster {cluster}')

plt.title("Market Clusters Over Time with Synthetic Index")
plt.xlabel("Date")
plt.ylabel("Cumulative Index Value")
plt.legend()
plt.grid(True)
plt.show()