## Dependent and Independent Variables

This code creates the dependent and independent datasets for our project.

In [7]:
#Go to repo
%cd /home/jupyter-toomeyck/HelpHerInvest

/home/jupyter-toomeyck/HelpHerInvest


In [8]:
#Sync latest from GitHub before editing
!git pull --rebase origin main

error: Pulling is not possible because you have unmerged files.
[33mhint: Fix them up in the work tree, and then use 'git add/rm <file>'[m
[33mhint: as appropriate to mark resolution and make a commit.[m
fatal: Exiting because of an unresolved conflict.


In [9]:
# Imports 
import time
import requests
import pandas as pd
#%pip install yfinance --quiet
import yfinance as yf
from pathlib import Path
import numpy as np
import warnings
import zipfile

warnings.filterwarnings("ignore")

# X Variables (Independent) Dataset

In [10]:
# X-Variables #

repo_root = Path("/home/jupyter-toomeyck/HelpHerInvest")
dataset_path = repo_root / "Data" / "stock_symbols_new.csv.zip"
csv_file_name = "stock_symbols_new.csv"

# Open the zip file and then open the specific CSV file within it
with zipfile.ZipFile(dataset_path, 'r') as zf:
    with zf.open(csv_file_name) as file_handle:
        # Read the file-like object directly into pandas
        symbols = pd.read_csv(file_handle)


## Reading Symbols from Dataset ##
tickers = symbols['symbol'].tolist()
tickers.remove("SPY")
tickers = tickers[:50]  # limit to 50 for testing
benchmark = 'SPY'
all_tickers = tickers + [benchmark]

## Download Price Data ##
prices = yf.download(
    all_tickers,
    start='2010-01-01',
    auto_adjust=False,
    progress=False
)['Adj Close']

prices = prices.dropna(how='all')

print(prices.head(10))

## Compute Features ##
monthly_px = prices.resample('ME').last()  # month-end prices
mom_1m  = monthly_px / monthly_px.shift(1)  - 1  # 1-month momentum
mom_3m  = monthly_px / monthly_px.shift(3)  - 1  # 3-month momentum
mom_6m  = monthly_px / monthly_px.shift(6)  - 1  # 6-month momentum
mom_12m = monthly_px / monthly_px.shift(12) - 1  # 12-month momentum
mom_12m_ex_1m = (monthly_px.shift(1) / monthly_px.shift(12)) - 1  # 12-month momentum excluding most recent month

rel_3m_spy  = mom_3m.sub(mom_3m["SPY"], axis=0)  # relative strength against S&P 3-month
rel_6m_spy  = mom_6m.sub(mom_6m["SPY"], axis=0)  # relative strength against S&P 6-month
rel_12m_spy = mom_12m.sub(mom_12m["SPY"], axis=0) # relative strength against S&P 12-month

daily_ret = prices.pct_change() # daily returns

vol_3m = (daily_ret.rolling(63).std() * np.sqrt(252)).resample("M").last() # 3-month volatility
vol_6m = (daily_ret.rolling(126).std() * np.sqrt(252)).resample("M").last() # 6-month volatility


roll_max_6m  = monthly_px.rolling(6).max()  # 6-month rolling max
roll_max_12m = monthly_px.rolling(12).max() # 12-month rolling max

drawdown_6m  = monthly_px / roll_max_6m  - 1  # 6-month drawdown
drawdown_12m = monthly_px / roll_max_12m - 1  # 12-month drawdown

dma_200 = prices.rolling(200).mean().resample("M").last() # 200-day moving average
pct_above_200dma = monthly_px / dma_200 - 1  # pct above 200-day moving average


## Combine Features ##
X = pd.concat(
    {
        "mom_1m": mom_1m[tickers],
        "mom_3m": mom_3m[tickers],
        "mom_6m": mom_6m[tickers],
        "mom_12m": mom_12m[tickers],
        "mom_12m_ex_1m": mom_12m_ex_1m[tickers],
        "rel_3m_spy": rel_3m_spy[tickers],
        "rel_6m_spy": rel_6m_spy[tickers],
        "rel_12m_spy": rel_12m_spy[tickers],
        "vol_3m": vol_3m[tickers],
        "vol_6m": vol_6m[tickers],
        #"downside_vol_6m": downside_vol_6m[tickers],
        "drawdown_6m": drawdown_6m[tickers],
        "drawdown_12m": drawdown_12m[tickers],
        "pct_above_200dma": pct_above_200dma[tickers],
    },
    axis=1
)


## Standardize Data Function - z score ##
def zscore_cs(row: pd.Series) -> pd.Series:
    # row contains values across tickers for a single feature at a single date
    mu = row.mean()
    sd = row.std(ddof=0)
    if sd == 0 or np.isnan(sd):
        return row * 0.0
    return (row - mu) / sd # calcs z-score


## Normalize per feature across tickers at each date ##
X_z = X.copy()
print("X_z before normalization:")
print(X_z.head(20))

for feat in X.columns.get_level_values(0).unique():
    X_z[feat] = X[feat].apply(zscore_cs, axis=1)

print("X_z after normalization:")
print(X_z.head(20))

## Flatten X_z table so each  ticker is a row ##
X_panel = (
    X_z.stack(level=1)              # index becomes (Date, Ticker)
      .rename_axis(index=["Date","Ticker"])
      .reset_index()
)


print(X_panel.head())
print("-----  -----  -----")
print(X_panel.tail())
print("-----  -----  -----")

print("Size of dataset:",
"Rows:",X_panel.shape[0],
"Columns:",X_panel.shape[1])  


output = repo_root / "Data" / "dependent_variables.csv"
X_panel.to_csv(output, index=False)
print(f"Output file saved to: {output}")



8 Failed downloads:
['BABA', 'TM', 'GE', 'UNH', 'NVDA', 'WFC', 'NVS', 'HSBC']: TypeError("'NoneType' object is not subscriptable")


Ticker          AAPL  ABBV   AMD    AMZN       ASML      AVGO        AXP  \
Date                                                                       
2010-01-04  6.418382   NaN  9.70  6.6950  32.425724  1.328563  32.483315   
2010-01-05  6.429482   NaN  9.71  6.7345  32.678322  1.338425  32.411877   
2010-01-06  6.327210   NaN  9.57  6.6125  32.977695  1.348991  32.935791   
2010-01-07  6.315513   NaN  9.47  6.5000  32.060867  1.340538  33.469975   
2010-01-08  6.357502   NaN  9.43  6.6760  31.293722  1.350400  33.446072   
2010-01-11  6.301418   NaN  9.14  6.5155  30.629499  1.358853  33.063362   
2010-01-12  6.229739   NaN  8.65  6.3675  30.676268  1.338425  33.501858   
2010-01-13  6.317613   NaN  9.15  6.4555  31.527615  1.297567  33.605522   
2010-01-14  6.281024   NaN  9.00  6.3675  31.312443  1.298272  34.028076   
2010-01-15  6.176056   NaN  8.84  6.3570  30.620138  1.284888  33.796875   

Ticker            AZN  BABA        BAC  ...        RTX        SAP        SPY  \
Date   

## Y Variable (Dependent) Dataset

In [None]:
# Y-Variable #

repo_root = Path("/home/jupyter-toomeyck/HelpHerInvest")
dataset_path = repo_root / "Data" / "stock_symbols_new.csv.zip"
csv_file_name = "stock_symbols_new.csv"

# Open the zip file and then open the specific CSV file within it
with zipfile.ZipFile(dataset_path, 'r') as zf:
    with zf.open(csv_file_name) as file_handle:
        # Read the file-like object directly into pandas
        symbols = pd.read_csv(file_handle)

rand_symbols = symbols['symbol'].head(50).tolist()
print(rand_symbols)        

def forward_excess_return_monthly(tickers, benchmark="SPY", start="2010-01-01", end=None, horizon_months=3):

    universe = list(dict.fromkeys(list(tickers) + [benchmark]))

    px_daily = yf.download(
        universe, start=start, end=end, auto_adjust=False, progress=False
    )["Adj Close"]

    px_m = px_daily.resample("M").last().dropna(subset=[benchmark])

    fwd_ret = px_m.shift(-horizon_months) / px_m - 1.0
    bench_fwd = fwd_ret[benchmark].rename("bench_fwd_return")

    # Wide to long in one go
    out = pd.DataFrame(index=px_m.index)
    out["bench_fwd_return"] = bench_fwd

    for t in tickers:
        out[f"adj_close_{t}"] = px_m[t]
        out[f"fwd_return_{t}"] = fwd_ret[t]
        out[f"fwd_excess_{t}"] = fwd_ret[t] - bench_fwd

    return out

df_y_m = forward_excess_return_monthly(rand_symbols, benchmark="SPY", start="2010-01-01", horizon_months=6)
print(df_y_m.head(10))
print(df_y_m.tail(10))

bench_df = (
    df_y_m[["bench_fwd_return"]]
    .rename(columns={"bench_fwd_return": "bench_fwd_return"})
    .reset_index()
)

# Keep only stock-level columns
stock_cols = [c for c in df_y_m.columns if "_" in c and not c.startswith("fwd_ret_bench")]

y_long = (
    df_y_m[stock_cols]
    .reset_index()
    .melt(id_vars="Date", var_name="metric_ticker", value_name="value")
)

# Split "adj_close_AAPL" -> metric="adj_close", ticker="AAPL"
y_long[["metric", "ticker"]] = y_long["metric_ticker"].str.rsplit("_", n=1, expand=True)

y_long = y_long.drop(columns="metric_ticker")

df_y_m_output = (
    y_long
    .pivot(index=["Date", "ticker"], columns="metric", values="value")
    .reset_index()
)

df_y_m_output = df_y_m_output[["Date", "ticker", "adj_close", "fwd_excess", "fwd_return"]]
df_y_m_output = df_y_m_output[df_y_m_output["ticker"] != "return"]

print(df_y_m_output.head(10))
print(df_y_m_output.tail(10))

# Save to CSV
y_output = repo_root / "Data" / "independent_variables.csv"
df_y_m_output.to_csv(y_output, index=False)



20 Failed downloads:
['GS', 'NFLX', 'META', 'KO', 'ASML', 'LLY', 'HD', 'PLTR', 'NVO', 'JPM', 'CSCO', 'MA', 'AVGO', 'PG', 'AZN', 'SAP', 'SPY', 'V', 'TSLA', 'XOM']: TypeError("'NoneType' object is not subscriptable")


Empty DataFrame
Columns: [bench_fwd_return, adj_close_NVDA, fwd_return_NVDA, fwd_excess_NVDA, adj_close_GOOGL, fwd_return_GOOGL, fwd_excess_GOOGL, adj_close_AAPL, fwd_return_AAPL, fwd_excess_AAPL, adj_close_MSFT, fwd_return_MSFT, fwd_excess_MSFT, adj_close_AMZN, fwd_return_AMZN, fwd_excess_AMZN, adj_close_META, fwd_return_META, fwd_excess_META, adj_close_AVGO, fwd_return_AVGO, fwd_excess_AVGO, adj_close_TSLA, fwd_return_TSLA, fwd_excess_TSLA, adj_close_BRK-B, fwd_return_BRK-B, fwd_excess_BRK-B, adj_close_LLY, fwd_return_LLY, fwd_excess_LLY, adj_close_WMT, fwd_return_WMT, fwd_excess_WMT, adj_close_JPM, fwd_return_JPM, fwd_excess_JPM, adj_close_V, fwd_return_V, fwd_excess_V, adj_close_ORCL, fwd_return_ORCL, fwd_excess_ORCL, adj_close_XOM, fwd_return_XOM, fwd_excess_XOM, adj_close_MA, fwd_return_MA, fwd_excess_MA, adj_close_ASML, fwd_return_ASML, fwd_excess_ASML, adj_close_JNJ, fwd_return_JNJ, fwd_excess_JNJ, adj_close_PLTR, fwd_return_PLTR, fwd_excess_PLTR, adj_close_BAC, fwd_return_BAC,

ValueError: Columns must be same length as key