In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import yfinance as yf
from datetime import datetime
import json

In [2]:
def get_price_data(tickers, start_date=None, end_date=None, period="1y", interval="1d"):
    """
    Fetch historical closing prices for one or more tickers from Yahoo Finance.

    Parameters:
    -----------
    tickers : list or str
        Ticker symbols to fetch data for. A single symbol may also be passed
        as a plain string.
    start_date : str, optional
        Start date in 'YYYY-MM-DD' format. If None, uses period instead.
    end_date : str, optional
        End date in 'YYYY-MM-DD' format. Only used together with start_date;
        if None, uses the current date.
    period : str, optional
        Time period to download data for if start_date is None.
        Valid values: '1d', '5d', '1mo', '3mo', '6mo', '1y', '2y', '5y', '10y', 'ytd', 'max'
    interval : str, optional
        Data interval. Valid values: '1m', '2m', '5m', '15m', '30m', '60m', '90m', '1h', '1d', '5d', '1wk', '1mo', '3mo'

    Returns:
    --------
    pd.DataFrame
        One 'Close' column per ticker, named after the ticker and kept in the
        order given (the download is requested with auto_adjust=True).

    Raises:
    -------
    ValueError
        If no ticker is supplied.
    """
    # A bare string would otherwise be measured (and iterated) character by character.
    if isinstance(tickers, str):
        tickers = [tickers]
    else:
        tickers = list(tickers)
    if not tickers:
        raise ValueError("tickers must contain at least one symbol")

    if start_date is None:
        # Use period parameter
        data = yf.download(tickers, period=period, interval=interval, group_by='ticker', auto_adjust=True)
    else:
        if end_date is None:
            end_date = datetime.now().strftime('%Y-%m-%d')
        data = yf.download(tickers, start=start_date, end=end_date, interval=interval, group_by='ticker', auto_adjust=True)

    # Extract closing prices. Decide on the actual column layout rather than
    # the number of tickers: recent yfinance releases appear to return
    # (ticker, field) MultiIndex columns even for a single symbol, while older
    # ones return flat field columns in that case.
    if isinstance(data.columns, pd.MultiIndex):
        prices = pd.DataFrame({ticker: data[ticker]['Close'] for ticker in tickers})
    else:
        prices = data[['Close']].copy()
        prices.columns = tickers

    return prices

In [3]:
def _spread_halflife(series: pd.Series) -> float:
    """Estimate the mean-reversion half-life of one window of the spread."""
    lagged = series.shift(1)
    delta = series - lagged
    # Regress the one-step change on the previous level (with intercept);
    # dropna() removes the first observation, which has no predecessor.
    exog = sm.add_constant(lagged.dropna())
    fit = sm.OLS(delta.dropna(), exog).fit()
    # Index 1 is the coefficient on the lagged level (index 0 is the constant).
    return -np.log(2) / fit.params.iloc[1]


def _lagged_rolling_beta(a: pd.Series, b: pd.Series, window: int) -> pd.Series:
    """Slope of `a` on `b` over the `window` observations strictly before each row."""
    a_vals = a.to_numpy(dtype=float)
    b_vals = b.to_numpy(dtype=float)
    betas = np.full(len(a_vals), np.nan)
    for end in range(window, len(a_vals)):
        # Take covariance and variance from the same matrix so both use the
        # same normalisation; mixing np.cov (ddof=1) with np.var (ddof=0)
        # inflates the slope by window / (window - 1).
        cov_matrix = np.cov(a_vals[end - window:end], b_vals[end - window:end])
        var_b = cov_matrix[1, 1]
        if var_b != 0:
            betas[end] = cov_matrix[0, 1] / var_b
    return pd.Series(betas, index=a.index)


def compute_features(price_A: pd.Series, price_B: pd.Series, window: int = 20) -> pd.DataFrame:
    """
    Build spread-based features for a pair of price series.

    A single OLS regression of price_A on price_B (with intercept) over the
    whole input gives the hedge ratio (slope) and alpha (intercept); the
    spread is price_A - hedge_ratio * price_B - alpha. Because the regression
    uses the full sample, every row's spread depends on the complete series.

    Parameters:
    -----------
    price_A, price_B : pd.Series
        Price series sharing the same index.
    window : int, optional
        Rolling window length used for all rolling statistics.

    Returns:
    --------
    pd.DataFrame
        Indexed like the inputs, with columns price_A, price_B, spread,
        spread_z, spread_vol, halflife, bollinger_pct, rolling_beta,
        vol_ratio, lead_lag_diff, hedge_ratio, alpha, date and
        next_day_spread. Rows containing NaN are dropped, as is the final
        row, which has no next-day target.
    """
    # Static beta estimation using statsmodels OLS
    X = sm.add_constant(price_B)
    model = sm.OLS(price_A, X).fit()

    alpha = model.params.iloc[0]  # Intercept
    beta = model.params.iloc[1]  # Slope coefficient

    # Calculate the spread with alpha correction
    spread = price_A - beta * price_B - alpha

    # Rolling mean / std of the spread drive the z-score and Bollinger bands
    spread_mean = spread.rolling(window).mean()
    spread_std = spread.rolling(window).std()
    spread_z = (spread - spread_mean) / spread_std

    # Volatility of spread is the same rolling std; reuse it instead of recomputing
    spread_vol = spread_std

    # Half-life of mean reversion, re-estimated on every window
    halflife_series = spread.rolling(window).apply(_spread_halflife, raw=False)

    # Bollinger %B
    upper_band = spread_mean + 2 * spread_std
    lower_band = spread_mean - 2 * spread_std
    bollinger_pct = (spread - lower_band) / (upper_band - lower_band)

    # Rolling beta over the previous `window` rows (current row excluded)
    rolling_beta_est = _lagged_rolling_beta(price_A, price_B, window)

    # Volatility ratio
    vol_A = price_A.diff().rolling(window).std()
    vol_B = price_B.diff().rolling(window).std()
    vol_ratio = vol_A / vol_B

    # Lead-lag difference using percentage change
    lead_lag_diff = price_A.pct_change().shift(1) - price_B.pct_change()

    # Combine into DataFrame
    features = pd.DataFrame({
        'price_A': price_A,
        'price_B': price_B,
        'spread': spread,
        'spread_z': spread_z,
        'spread_vol': spread_vol,
        'halflife': halflife_series,
        'bollinger_pct': bollinger_pct,
        'rolling_beta': rolling_beta_est,
        'vol_ratio': vol_ratio,
        'lead_lag_diff': lead_lag_diff,
        'hedge_ratio': beta,
        'alpha': alpha
    })

    features['date'] = features.index

    # Drop rows with NaNs
    features = features.dropna()

    # Add target variable for next day's spread only
    features['next_day_spread'] = features['spread'].shift(-1)

    # Drop the last row as it will have NaNs in the target column
    features = features.dropna()

    return features

In [4]:
def compute_features_for_pair(ticker_A, ticker_B, start_date=None, end_date=None, period="1y", window=20, formation_year=None):
    """
    Fetch prices for two tickers and turn them into a feature table.

    Parameters:
    -----------
    ticker_A, ticker_B : str
        Symbols of the two legs of the pair.
    start_date, end_date : str, optional
        Date range in 'YYYY-MM-DD' format, forwarded to get_price_data.
    period : str, optional
        Look-back period forwarded to get_price_data (used there when
        start_date is None).
    window : int, optional
        Rolling window length forwarded to compute_features.
    formation_year : int, optional
        When given, stored in a 'formation_year' column on every row.

    Returns:
    --------
    pd.DataFrame
        The output of compute_features, optionally tagged with formation_year.
    """
    pair_prices = get_price_data([ticker_A, ticker_B], start_date, end_date, period)
    leg_a = pair_prices[ticker_A]
    leg_b = pair_prices[ticker_B]

    result = compute_features(leg_a, leg_b, window)

    # Nothing to tag when no formation year was supplied
    if formation_year is None:
        return result

    result['formation_year'] = formation_year
    return result

In [7]:
# Load yearly_cointegrations from the JSON file
with open("yearly_cointegrations.json", "r") as json_file:
    yearly_cointegrations = json.load(json_file)

# Collect one feature frame per pair; they are concatenated once after the
# loop instead of re-copying the growing result on every iteration.
pair_frames = []

for start_year, co_pairs_list in yearly_cointegrations.items():
    start_date = f"{start_year}-01-01"
    end_date = f"{int(start_year) + 1}-01-01"  # JSON keys are strings; convert for arithmetic
    print(f"\n=== Running feature engineering for {start_date} to {end_date} ===")

    for pair_data in co_pairs_list:
        tickerA, tickerB = pair_data[0]  # First element of each entry holds the (tickerA, tickerB) pair
        print(f"  -> Processing pair: {tickerA}-{tickerB}")
        try:
            # Compute features for the pair
            pair_features = compute_features_for_pair(
                ticker_A=tickerA,
                ticker_B=tickerB,
                start_date=start_date,
                end_date=end_date,
                formation_year=int(start_year)  # Pass the start_year as formation_year
            )
            pair_features['pair'] = f"{tickerA}-{tickerB}"  # Add pair identifier
            pair_frames.append(pair_features)
        except Exception as e:
            # Best effort: report the failing pair and keep going with the rest
            print(f"    Error processing {tickerA}-{tickerB}: {str(e)}")

# pd.concat raises on an empty list, so fall back to an empty frame
all_features = pd.concat(pair_frames, ignore_index=True) if pair_frames else pd.DataFrame()

print("\nFeature engineering complete.")
print("Total rows in all_features:", len(all_features))

# Optionally, save to CSV
all_features.to_csv("yearly_pairs_features.csv", index=False)

[*********************100%***********************]  2 of 2 completed


=== Running feature engineering for 2019-01-01 to 2020-01-01 ===
  -> Processing pair: ACN-MKC



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AMT-VMC



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ACGL-MLM



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: CMG-SNPS
  -> Processing pair: AEP-JCI


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: EW-FI



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AVB-XEL



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: FE-T



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AMAT-LULU



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: COP-RL



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: MCD-YUM



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: PLD-SO



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AVY-ORLY



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ETN-GRMN



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BX-HIG



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AME-CHTR



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: HLT-TMO



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: LNT-PG



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BDX-NWSA



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BRO-MCO



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: TDG-TROW



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: CINF-LHX



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: IPG-MTB



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: CSGP-KMB



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: IT-NSC



[                       0%                       ]

  -> Processing pair: ITW-RTX


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: DGX-MA



[*********************100%***********************]  2 of 2 completed


=== Running feature engineering for 2020-01-01 to 2021-01-01 ===
  -> Processing pair: A-VRSK
  -> Processing pair: CF-RJF



[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ALLE-JPM
  -> Processing pair: AMCR-KKR



[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: CAT-MTD



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: ADBE-ODFL
  -> Processing pair: BKNG-NWSA


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: MAR-RF



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AIG-KEY



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: DOC-USB



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: HST-WYNN



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: MCHP-ON


[*********************100%***********************]  2 of 2 completed


  -> Processing pair: ERIE-IDXX


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: CSX-STLD



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: MCK-PTC



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ABT-CMG



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BEN-VICI



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: MDLZ-WM



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: NTRS-UHS


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AOS-NOW



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BAC-HIG



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AVY-HON



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: MHK-ROST


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ADM-BALL



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: LVS-WBD
  -> Processing pair: CTSH-PPG


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: COP-CVX



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BLK-GRMN



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: CTVA-SBUX


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: MOS-WY



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: AXON-PAYC
  -> Processing pair: HUM-LULU


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AWK-PG



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: DE-QCOM



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ANSS-EMN



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: APD-ORLY



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: FIS-MMC



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: SYF-TXT



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: BG-PAYX


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: SPG-UAL



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ANET-KEYS
  -> Processing pair: ATO-EVRG



[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: IR-PNR



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: HES-TPL



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: EOG-FANG


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: DVN-MPC
  -> Processing pair: OKE-REG



[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: CTAS-GLW



[*********************100%***********************]  2 of 2 completed


=== Running feature engineering for 2021-01-01 to 2022-01-01 ===
  -> Processing pair: EVRG-YUM



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: EQR-ZTS



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AZO-FAST



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: DHR-ISRG


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: NDSN-WAB



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: AMT-OTIS


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ARE-UDR



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: CBRE-MOH


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ORLY-PSA



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: MA-NWSA
  -> Processing pair: CNP-O



[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed


  -> Processing pair: WDC-WYNN
  -> Processing pair: GOOGL-IT


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: HCA-IPG
  -> Processing pair: CBOE-FTNT



[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: LLY-NKE



[*********************100%***********************]  2 of 2 completed



=== Running feature engineering for 2022-01-01 to 2023-01-01 ===
  -> Processing pair: AMAT-BBY


[*********************100%***********************]  2 of 2 completed


  -> Processing pair: AMD-META
  -> Processing pair: CDW-ORCL


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AVB-CME



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AES-MSI



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BALL-GRMN
  -> Processing pair: MLM-VRSN



[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ADP-GWW



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: LOW-RF


[*********************100%***********************]  2 of 2 completed


  -> Processing pair: ECL-PLD


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: IPG-OMC



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: PNR-SWKS


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BAC-NWSA



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: JNPR-UHS


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: LNT-SBAC



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AEE-EIX



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: EFX-SPGI



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: HUM-TKO



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: CINF-LYB



[*********************100%***********************]  2 of 2 completed


=== Running feature engineering for 2023-01-01 to 2024-01-01 ===
  -> Processing pair: EXPD-MLM



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: CMS-RTX



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: FE-WEC



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: AWK-HOLX
  -> Processing pair: BEN-FRT


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: HSIC-UPS



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: ES-TSCO


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: GE-TTWO



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: ENPH-WBD
  -> Processing pair: SCHW-USB


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AVGO-KLAC



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: SRE-VICI
  -> Processing pair: DIS-T


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed


  -> Processing pair: AVB-LOW


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BAX-TECH



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: IR-TDG



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: KIM-NSC






Feature engineering complete.
Total rows in all_features: 29332


In [8]:
# Load yearly_cointegrations from the JSON file
with open("yearly_cointegrations.json", "r") as json_file:
    yearly_cointegrations = json.load(json_file)

# Only the pairs stored under the "2023" key are used; their features are
# computed over the following calendar year.
start_year = 2024

# Start the download a month before the target range so the rolling windows
# already have history on the first target date.
# NOTE(review): this relies on December supplying at least `window` (default 20)
# trading days - confirm if the window size is ever changed.
buffer_start_date = f"{int(start_year) - 1}-12-01"  # one month before the target start
target_start_date = f"{start_year}-01-01"
end_date = f"{int(start_year) + 1}-01-01"

# Collect one feature frame per pair and concatenate once after the loop
pair_frames = []

if "2023" in yearly_cointegrations:
    co_pairs_list = yearly_cointegrations["2023"]

    print(f"\n=== Running feature engineering for {target_start_date} to {end_date} ===")

    for pair_data in co_pairs_list:
        tickerA, tickerB = pair_data[0]  # First element of each entry holds the (tickerA, tickerB) pair
        print(f"  -> Processing pair: {tickerA}-{tickerB}")
        try:
            # Compute features for the pair
            pair_features = compute_features_for_pair(
                ticker_A=tickerA,
                ticker_B=tickerB,
                start_date=buffer_start_date,  # Use the buffer start date
                end_date=end_date,
                formation_year=int(start_year)  # Pass the start_year as formation_year
            )

            # Keep only the target date range; copy so that adding the 'pair'
            # column below does not write into a slice of the original frame
            # (which triggers SettingWithCopyWarning).
            pair_features = pair_features.loc[target_start_date:end_date].copy()

            pair_features['pair'] = f"{tickerA}-{tickerB}"  # Add pair identifier
            pair_frames.append(pair_features)
        except Exception as e:
            # Best effort: report the failing pair and keep going with the rest
            print(f"    Error processing {tickerA}-{tickerB}: {str(e)}")

# pd.concat raises on an empty list, so fall back to an empty frame
all_features = pd.concat(pair_frames, ignore_index=True) if pair_frames else pd.DataFrame()

print("\nFeature engineering complete.")
print("Total rows in all_features:", len(all_features))

# Optionally, save to CSV
all_features.to_csv("yearly_pairs_features_test.csv", index=False)

[*********************100%***********************]  2 of 2 completed


=== Running feature engineering for 2024-01-01 to 2025-01-01 ===
  -> Processing pair: EXPD-MLM



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: CMS-RTX



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: FE-WEC



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AWK-HOLX



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BEN-FRT



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: HSIC-UPS



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: ES-TSCO



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: GE-TTWO



[*********************100%***********************]  2 of 2 completed


  -> Processing pair: ENPH-WBD
  -> Processing pair: SCHW-USB


[*********************100%***********************]  2 of 2 completed
[*********************100%***********************]  2 of 2 completed


  -> Processing pair: AVGO-KLAC


[*********************100%***********************]  2 of 2 completed

  -> Processing pair: SRE-VICI



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: DIS-T



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: AVB-LOW



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: BAX-TECH



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: IR-TDG



[*********************100%***********************]  2 of 2 completed

  -> Processing pair: KIM-NSC

Feature engineering complete.
Total rows in all_features: 4267



