In [3]:
import pandas as pd
import numpy as np
import itertools
from statsmodels.tsa.stattools import coint
import warnings

# Silence every warning so the cell output stays readable.
# NOTE(review): this filter is global, so numerical warnings raised by
# later cells are hidden as well — confirm that is intended.
warnings.filterwarnings("ignore")

# Sector membership table; the 'Sector' and 'Symbol' columns are used below.
df_sectors = pd.read_csv("SP500_clean.csv")

# Price table: the first CSV column becomes the index and is parsed as dates.
df_prices = pd.read_csv(
    "final_price_clean.csv",
    index_col=0,
    parse_dates=True,
)

# Map each sector name to the list of symbols that belong to it.
sectors_dict = {
    sector: list(symbols)
    for sector, symbols in df_sectors.groupby('Sector')['Symbol']
}

In [4]:
# Cointegration scanner loop
#
# For every sector, run the Engle-Granger cointegration test (statsmodels
# `coint`) on the log prices of each unique ticker pair and keep the pairs
# whose p-value falls below P_VALUE_THRESHOLD. Hits are collected in `scores`
# as dicts with the keys 'Sector', 'Stock1', 'Stock2' and 'P-Value'.
#
# NOTE(review): every pair is tested against the same fixed threshold with no
# multiple-comparison correction, so some of the reported pairs are expected
# to be false positives. Consider a Bonferroni/FDR adjustment downstream.
# NOTE(review): the Engle-Granger test is order dependent (the first series is
# the dependent variable); only the (ticker1, ticker2) ordering is tested.

# Minimum number of overlapping rows required to run the test.
# Presumably ~one trading year of daily prices — confirm against the data.
MIN_OBSERVATIONS = 252
# Significance level below which a pair is recorded.
P_VALUE_THRESHOLD = 0.05

scores = []

# Symbols that actually have a price column; the sector list may name others.
available_tickers = set(df_prices.columns)

for sector, tickers in sectors_dict.items():

    # Drop symbols without price data (indexing them would raise KeyError and
    # abort the whole scan) and de-duplicate while preserving order so that a
    # repeated symbol cannot be paired with itself.
    tradable = [t for t in dict.fromkeys(tickers) if t in available_tickers]

    # Generate all unique pairs
    for ticker1, ticker2 in itertools.combinations(tradable, 2):
        # Keep only the dates on which both tickers have a price
        pair_data = df_prices[[ticker1, ticker2]].dropna()

        # Require a minimum overlapping history; too short a sample makes the
        # test meaningless (and an empty one makes it fail outright).
        if len(pair_data) < MIN_OBSERVATIONS:
            continue

        # The logarithm is undefined for non-positive prices; such values are
        # data errors and would feed -inf/NaN into the test.
        if (pair_data.to_numpy() <= 0).any():
            continue

        # Log normalize
        s1 = np.log(pair_data[ticker1])
        s2 = np.log(pair_data[ticker2])

        # Run Cointegration Test
        # trend='c' includes a constant in the first-stage regression, so the
        # spread may have a non-zero mean; the null hypothesis is
        # "no cointegration".
        result = coint(s1, s2, trend='c')
        p_value = result[1]

        # Save statistically significant pairs
        if p_value < P_VALUE_THRESHOLD:
            scores.append({
                'Sector': sector,
                'Stock1': ticker1,
                'Stock2': ticker2,
                'P-Value': p_value
            })

In [6]:
# Final report: print how many pairs were found, then rank and persist them.
n_found = len(scores)
print(f"\nScan Complete. Found {n_found} cointegrated pairs.")

if not scores:
    # Nothing passed the significance filter; no table is built or written.
    print("No pairs found. Check your data normalization.")
else:
    # Build the table and rank it with the smallest p-value first.
    results_df = pd.DataFrame(scores).sort_values(by="P-Value", ascending=True)

    # Persist the ranked table (without the index) and preview the top rows.
    results_df.to_csv("cointegration_results.csv", index=False)
    print(results_df.head(5))


Scan Complete. Found 905 cointegrated pairs.
                     Sector Stock1 Stock2   P-Value
257              Financials    BLK    SYF  0.000006
846             Real Estate    KIM    PSA  0.000009
104  Consumer Discretionary   NCLH   SBUX  0.000028
107  Consumer Discretionary   NCLH    DIS  0.000035
105  Consumer Discretionary   NCLH    TPR  0.000109
