In [5]:
import pandas as pd
import numpy as np
import yfinance as yf
from statsmodels.tsa.stattools import coint

def get_sp500_tickers():
    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    table = pd.read_html(url)[0]
    tickers = table['Symbol'].tolist()
    return tickers

def get_data(tickers, start_date, end_date):
    data = yf.download(tickers, start=start_date, end=end_date)['Adj Close']
    return data

def clean_data(data):
    # Forward fill missing values
    data = data.fillna(method='ffill')
    # Backward fill any remaining missing values
    data = data.fillna(method='bfill')
    # Remove any columns with infinity values
    data = data.replace([np.inf, -np.inf], np.nan).dropna(axis=1)
    return data

def find_cointegrated_pairs(data, significance=0.05):
    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = data.keys()
    pairs = []
    
    for i in range(n):
        for j in range(i+1, n):
            S1 = data[keys[i]]
            S2 = data[keys[j]]
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < significance:
                pairs.append((keys[i], keys[j]))
    
    return pairs

def main():
    # Get S&P 500 tickers
    tickers = get_sp500_tickers()
    
    # Set parameters
    start_date = '2020-01-01'
    end_date = '2023-12-31'
    
    print(f"Fetching data for {len(tickers)} S&P 500 companies...")
    
    # Get data
    data = get_data(tickers, start_date, end_date)
    
    print("Cleaning data...")
    # Clean data
    data = clean_data(data)
    
    # Check if we have enough data after cleaning
    if data.empty or data.shape[1] < 2:
        print("Not enough valid data after cleaning.")
        return
    
    print(f"Data shape after cleaning: {data.shape}")
    
    print("Finding cointegrated pairs...")
    # Find cointegrated pairs
    pairs = find_cointegrated_pairs(data)
    
    if not pairs:
        print("No cointegrated pairs found.")
        return
    
    print(f"Number of cointegrated pairs found: {len(pairs)}")
    print("Sample of cointegrated pairs:")
    for pair in pairs[:10]:  # Print first 10 pairs
        print(pair)
    
    # Here you would continue with your backtesting logic
    # For example:
    # results = backtest_pairs(data, pairs)
    # analyze_results(results)

# if __name__ == "__main__":
#     main()

In [6]:
main()

Fetching data for 503 S&P 500 companies...


[**********************53%%                      ]  268 of 503 completed

$BF.B: possibly delisted; No price data found  (1d 2020-01-01 -> 2023-12-31)


[*********************100%%**********************]  503 of 503 completed

4 Failed downloads:
['SOLV', 'GEV']: YFChartError("%ticker%: Data doesn't exist for startDate = 1577854800, endDate = 1703998800")
['BF.B']: YFPricesMissingError('$%ticker%: possibly delisted; No price data found  (1d 2020-01-01 -> 2023-12-31)')
['BRK.B']: YFTzMissingError('$%ticker%: possibly delisted; No timezone found')


Cleaning data...
Data shape after cleaning: (1006, 499)
Finding cointegrated pairs...


  data = data.fillna(method='ffill')
  data = data.fillna(method='bfill')


KeyboardInterrupt: 