# Anonymous Feature Correlation Notebook

This notebook correlates the anonymous features from `train.csv` with public, open-source data from **Yahoo Finance** and **FRED (Federal Reserve Economic Data)**.

**Objective:** To de-anonymize the features by finding strong correlations with real-world indicators.

**Logic:**
1.  **Install** required libraries.
2.  **Load Public Data:** Download 10+ years of daily data from Yahoo (`yfinance`) and FRED (`fredapi`).
3.  **Load Anonymous Data:** Load the `train.csv` file.
4.  **Map Dates:** Replicate the logic from the original notebook. We assume the `N` unique `date_id`s in `train.csv` map sequentially to the *last N* trading days of the S&P 500 (`^GSPC`) data.
5.  **Merge Data:** Join all dataframes on the mapped `Date`.
6.  **Calculate & Show Correlations:** Find the strongest (positive and negative) correlations between the anonymous and public features.

In [20]:
# --- 2. Load Public Data ---
import yfinance as yf
from fredapi import Fred
import polars as pl
from datetime import datetime
import pandas as pd
import dotenv

# *** START OF UPDATE ***

# --- Configuration ---
START_DATE = "2010-01-01"
END_DATE = datetime.now().strftime('%Y-%m-%d')
FRED_API_KEY = dotenv.get_key(dotenv.find_dotenv(), "FRED_API_KEY")

# Define tickers and FRED series IDs by category
YFINANCE_TICKERS = {
    # M* - Market Dynamics/Technical
    'M_GSPC': '^GSPC',        # S&P 500 Index
    'M_IXIC': '^IXIC',        # NASDAQ Composite
    'M_DJI': '^DJI',          # Dow Jones Industrial Average
    'M_RUT': '^RUT',          # Russell 2000

    # V* - Volatility
    'V_VIX': '^VIX',          # CBOE Volatility Index
    'V_VXN': '^VXN',          # CBOE NASDAQ 100 Volatility Index

    # P* - Price/Valuation
    'P_GC=F': 'GC=F',         # Gold Futures
    'P_CL=F': 'CL=F',         # Crude Oil Futures
    'P_SI=F': 'SI=F',         # Silver Futures
    'P_HG=F': 'HG=F',         # Copper Futures

    # I* - Interest Rates (from Yahoo)
    'I_FVX': '^FVX',          # Treasury Yield 5 Years
    'I_TNX': '^TNX',          # CBOE Interest Rate 10 Year T Note
    'I_TYX': '^TYX',          # Treasury Yield 30 Years
    
    # S* - Sentiment/Alternative
    'S_BTC-USD': 'BTC-USD',   # Bitcoin
}

FRED_SERIES = {
    # E* - Macro Economic
    'E_GDP': 'GDP',                       # Gross Domestic Product
    'E_UNRATE': 'UNRATE',                 # Unemployment Rate
    'E_CPIAUCSL': 'CPIAUCSL',             # Consumer Price Index
    'E_PPIACO': 'PPIACO',                 # Producer Price Index
    'E_INDPRO': 'INDPRO',                 # Industrial Production Index
    'E_PAYEMS': 'PAYEMS',                 # Non-Farm Payrolls
    'E_ICSA': 'ICSA',                     # Initial Claims

    # I* - Interest Rates
    'I_DFF': 'DFF',                       # Federal Funds Effective Rate
    'I_DTB3': 'DTB3',                     # 3-Month Treasury Bill
    'I_DGS2': 'DGS2',                     # 2-Year Treasury Yield
    'I_DGS10': 'DGS10',                   # 10-Year Treasury Yield
    'I_T10Y2Y': 'T10Y2Y',                 # 10-Year vs 2-Year Treasury Spread

    # V* - Volatility
    'V_VIXCLS': 'VIXCLS',                 # VIX (from FRED)

    # S* - Sentiment
    'S_UMCSENT': 'UMCSENT',               # University of Michigan Consumer Sentiment
    'S_BAMLH0A0HYM2': 'BAMLH0A0HYM2',    # BofA US High Yield Index Option-Adjusted Spread
}

# Define momentum periods (in days)
MOMENTUM_PERIODS = [5, 21, 63, 252] # 1 week, 1 month, 1 quarter, 1 year

# --- Data Fetching Functions ---
def fetch_yfinance_data(tickers, start, end):
    """Fetches and processes data from Yahoo Finance."""
    print(f"Fetching {len(tickers)} tickers from Yahoo Finance...")
    df = yf.download(list(tickers.values()), start=start, end=end, progress=False)
    
    # Use only 'Adj Close' and rename columns to custom names
    df = df['Close']
    df = df.rename(columns={v: k for k, v in tickers.items()})
    
    df = df.reset_index()
    return pl.from_pandas(df)

def fetch_fred_data(series_ids, api_key, start, end):
    """Fetches and processes data from FRED."""
    print(f"Fetching {len(series_ids)} series from FRED...")
    try:
        fred = Fred(api_key=api_key)
    except ValueError as e:
        print(f"Error initializing Fred API: {e}")
        print("Please make sure you have set a valid FRED_API_KEY.")
        return pl.DataFrame()

    # Fetch all series
    df_list = []
    for code, name in series_ids.items():
        try:
            s = fred.get_series(name, start_date=start, end_date=end)
            s.name = code
            df_list.append(s)
        except Exception as e:
            print(f"Could not fetch series {name} ({code}): {e}")
            
    if not df_list:
        print("No data fetched from FRED.")
        return pl.DataFrame()
    
    # Combine into a single DataFrame
    df = pd.concat(df_list, axis=1).reset_index()
    df = df.rename(columns={'index': 'Date'})
    return pl.from_pandas(df)

# --- Fetch and Process Data ---
yf_data = fetch_yfinance_data(YFINANCE_TICKERS, START_DATE, END_DATE)
fred_data = fetch_fred_data(FRED_SERIES, FRED_API_KEY, START_DATE, END_DATE)

# Ensure Date columns are of the same type
yf_data = yf_data.with_columns(pl.col("Date").cast(pl.Date))
if not fred_data.is_empty():
    fred_data = fred_data.with_columns(pl.col("Date").cast(pl.Date))

    # Join the two public datasets
    public_data = yf_data.join(fred_data, on='Date', how='left')
else:
    public_data = yf_data

# Sort and forward-fill missing values (common for economic data)
public_data = public_data.sort("Date").fill_null(strategy='forward')

# --- Calculate Momentum Features ---
mom_features = []
for col in public_data.columns:
    if col != 'Date': # Don't calculate momentum on the date column
        for period in MOMENTUM_PERIODS:
            # Calculate rolling average (SMA)
            sma_col_name = f"MOM_SMA_{period}D_{col}"
            public_data = public_data.with_columns(
                pl.col(col).rolling_mean(window_size=period).alias(sma_col_name)
            )
            
            # Calculate Rate of Change (ROC)
            roc_col_name = f"MOM_ROC_{period}D_{col}"
            public_data = public_data.with_columns(
                ((pl.col(col) / pl.col(col).shift(period)) - 1).alias(roc_col_name)
            )
            

# Drop rows with nulls created by momentum calculations
# public_data = public_data.drop_nulls()

print("\n--- Public Data Shape ---")
print(public_data.shape)
print("\n--- Public Data Head ---")
print(public_data.head())

# *** END OF UPDATE ***

Fetching 14 tickers from Yahoo Finance...
Fetching 15 series from FRED...


  df = yf.download(list(tickers.values()), start=start, end=end, progress=False)



--- Public Data Shape ---
(5234, 262)

--- Public Data Head ---
shape: (5, 262)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ Date      ┆ S_BTC-USD ┆ P_CL=F    ┆ P_GC=F    ┆ … ┆ MOM_SMA_6 ┆ MOM_ROC_6 ┆ MOM_SMA_2 ┆ MOM_ROC_ │
│ ---       ┆ ---       ┆ ---       ┆ ---       ┆   ┆ 3D_S_BAML ┆ 3D_S_BAML ┆ 52D_S_BAM ┆ 252D_S_B │
│ date      ┆ f64       ┆ f64       ┆ f64       ┆   ┆ H0A0HYM2  ┆ H0A0HYM2  ┆ LH0A0HYM2 ┆ AMLH0A0H │
│           ┆           ┆           ┆           ┆   ┆ ---       ┆ ---       ┆ ---       ┆ YM2      │
│           ┆           ┆           ┆           ┆   ┆ f64       ┆ f64       ┆ f64       ┆ ---      │
│           ┆           ┆           ┆           ┆   ┆           ┆           ┆           ┆ f64      │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ 2010-01-0 ┆ null      ┆ 81.510002 ┆ 1117.6999 ┆ … ┆ null      ┆ null      ┆ null      ┆ null     │
│ 4       

In [15]:
# --- 3. Load Anonymous Data ---
try:
    # Load the training data
    train = pl.read_csv('./kaggle/train.csv').slice(1000)
    
    # Get the feature columns (assuming they are named feature_0, feature_1, etc.)
    ANONYMOUS_FEATURES = [col for col in train.columns if col not in ['id', 'date_id', 'target']]
    
    print("--- Anonymous Data Shape ---")
    print(train.shape)
    
    # Get unique date_ids to map
    anonymous_date_ids = train.select('date_id').unique().sort('date_id')
    
except FileNotFoundError:
    print("Error: 'train.csv' not found. Please make sure the file is in the correct directory.")
    train = None

--- Anonymous Data Shape ---
(7990, 98)


In [11]:
# --- 4. Map Dates ---
# Run the mapping
SPY_DATE_PATH = './kaggle/spy-historical.csv'
TRAIN_DATA_PATH = './kaggle/train.csv'
df_spy = pl.read_csv(SPY_DATE_PATH)
df_spy_date = df_spy.with_columns(pl.col("Date").str.to_date().alias("Date"))


"""Loads train.csv and maps date_id to actual dates."""

df_train = pl.read_csv(TRAIN_DATA_PATH)
df_train = df_train.slice(1000)
#print(df_train.glimpse())
anonymous_features = [col for col in df_train.columns if any(col.startswith(p) for p in ['M', 'E', 'I', 'P', 'V', 'S', 'D', 'MOM'])]
string_features_to_convert = []

for col_name in anonymous_features:
    if df_train[col_name].dtype == pl.String:
        string_features_to_convert.append(col_name)

if string_features_to_convert:
    print(f"Found {len(string_features_to_convert)} string columns to convert: {string_features_to_convert}")
    
    # 4. Convert them, turning bad values (like "N/A") into nulls
    df_train = df_train.with_columns(
        pl.col(string_features_to_convert).cast(pl.Float64, strict=False)
    )

# 1. Get unique, sorted date_ids from training data
unique_date_ids = df_train["date_id"].unique().sort()
n_dates_train = len(unique_date_ids)
print(f"Found {n_dates_train} unique date_ids in train.csv.")

# 2. Get the *last N* trading days from our public SPY data
# (This is the CRITICAL ASSUMPTION from your notebook)
spy_ground_truth_dates = df_spy_date.sort("date_id").tail(n_dates_train)["Date"]

if len(spy_ground_truth_dates) != n_dates_train:
    print("Error: Mismatch in date counts. Cannot perform mapping.")

# 3. Create the mapping dataframe
date_map = pl.DataFrame({
    "date_id": unique_date_ids,
    "Date": spy_ground_truth_dates
})

# print("Date mapping created. Example:")
# print(date_map.head(3))
# print("...")
# print(date_map.tail(3))

# 4. Join the map back to the training data
df_train_with_dates = df_train.join(date_map, on="date_id", how="left")

Found 85 string columns to convert: ['E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E2', 'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13', 'M14', 'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']
Found 7990 unique date_ids in train.csv.


In [25]:
# --- 5. Merge Data ---
master_df = None

df_merged = df_train_with_dates.join(
    public_data, on="Date", how="left"
)

# Forward-fill nulls in public data (especially for FRED data)
public_feature_cols = list(YFINANCE_TICKERS.values()) + list(FRED_SERIES.values())
public_feature_cols = [f"{col}_Close" if col in YFINANCE_TICKERS.values() else col for col in public_feature_cols]

df_merged = df_merged.with_columns(
    pl.col(public_feature_cols).forward_fill()
)

# Drop any remaining nulls (e.g., at the very start of the history)
df_merged = df_merged.drop_nulls()

print("Master dataframe created. Shape:", df_merged.shape)
print(df_merged.tail())
df_numeric_features = df_merged.select([pl.selectors.numeric()])
print(f"\nNumeric features dataframe shape: {df_numeric_features.shape}")

master_df = df_merged

ColumnNotFoundError: "^GSPC_Close" not found

In [21]:
if 'df_merged' in locals() and df_merged is not None:
    # Define our two feature groups
    public_features = public_data.columns[1:]  # Exclude 'Date' column
    #anonymous_features = [col for col in df_numeric_features.columns if col not in public_features and any(col.startswith(p) for p in ['M', 'E', 'I', 'P', 'V', 'S', 'D', 'MOM'])]

    all_features = anonymous_features + public_features

    print(f"\nAnonymous features ({len(anonymous_features)}): {anonymous_features}")
    print(f"Public features ({len(public_features)}): {public_features}")
    
    print(f"Correlating {len(anonymous_features)} anonymous features with {len(public_features)} public features...")

    # Calculate the full correlation matrix for our subset of columns
    corr_matrix = df_merged.select(anonymous_features + public_features).corr()

    # *** START OF FIX ***
    
    # Manually add the feature names as a new column
    # This is the column we will filter on.
    corr_matrix_with_labels = corr_matrix.with_columns(
        pl.Series("anonymous_feature", all_features)
    )

    # Filter the matrix to only show (Anonymous Rows x Public Columns)
    corr_subset = corr_matrix_with_labels.filter(
        pl.col("anonymous_feature").is_in(anonymous_features)
    ).select(
        ["anonymous_feature"] + public_features  # Use the new column name
    )

    # Melt the matrix to a long format for easy sorting
    corr_long = corr_subset.melt(
        id_vars="anonymous_feature",  # Use the new column name
        variable_name="public_feature", 
        value_name="correlation"
    )
    
    # *** END OF FIX ***


    # Sort by absolute correlation to find strongest links
    corr_sorted = corr_long.with_columns(
        pl.col("correlation").abs().alias("abs_correlation")
    ).sort("abs_correlation", descending=True)

    print("\n--- Top 30 Most Correlated Feature Pairs --- (Positive or Negative)")
    print(corr_sorted.head(30))

    print("\n--- Top 10 Positive Correlations ---")
    print(corr_sorted.sort("correlation", descending=True).head(10))
    
    print("\n--- Top 10 Negative Correlations ---")
    print(corr_sorted.sort("correlation", descending=False).head(10))

else:
    print("\nSkipping correlation: Master dataframe was not created due to an error in a previous step.")




Anonymous features (94): ['D1', 'D2', 'D3', 'D4', 'D5', 'D6', 'D7', 'D8', 'D9', 'E1', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18', 'E19', 'E2', 'E20', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8', 'E9', 'I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'M1', 'M10', 'M11', 'M12', 'M13', 'M14', 'M15', 'M16', 'M17', 'M18', 'M2', 'M3', 'M4', 'M5', 'M6', 'M7', 'M8', 'M9', 'P1', 'P10', 'P11', 'P12', 'P13', 'P2', 'P3', 'P4', 'P5', 'P6', 'P7', 'P8', 'P9', 'S1', 'S10', 'S11', 'S12', 'S2', 'S3', 'S4', 'S5', 'S6', 'S7', 'S8', 'S9', 'V1', 'V10', 'V11', 'V12', 'V13', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9']
Public features (261): ['S_BTC-USD', 'P_CL=F', 'P_GC=F', 'P_HG=F', 'P_SI=F', 'M_DJI', 'I_FVX', 'M_GSPC', 'M_IXIC', 'M_RUT', 'I_TNX', 'I_TYX', 'V_VIX', 'V_VXN', 'E_GDP', 'E_UNRATE', 'E_CPIAUCSL', 'E_PPIACO', 'E_INDPRO', 'E_PAYEMS', 'E_ICSA', 'I_DFF', 'I_DTB3', 'I_DGS2', 'I_DGS10', 'I_T10Y2Y', 'V_VIXCLS', 'S_UMCSENT', 'S_BAMLH0A0HYM2', 'MOM_SMA_5D_S_BTC-USD', 'MOM_ROC_5D_S_BTC-

  ret = umr_sum(arr, axis, dtype, out, keepdims, where=where)
  corr_long = corr_subset.melt(
