In [14]:
# 00_macro_observables_fred_requests.py
import os
import requests
import pandas as pd

START = "2000-01-01"
API_KEY = os.environ.get("FRED_API_KEY", "086e7d754c3e13ccb52cb9b1c1cef71d")  # <-- replace if not in env

def fetch_fred(series_id):
    url = f"https://api.stlouisfed.org/fred/series/observations"
    params = {
        "series_id": series_id,
        "api_key": API_KEY,
        "file_type": "json",
        "observation_start": START
    }
    r = requests.get(url, params=params, timeout=30)
    r.raise_for_status()
    data = r.json()["observations"]
    df = pd.DataFrame(data)
    df["date"] = pd.to_datetime(df["date"])
    df[series_id] = pd.to_numeric(df["value"], errors="coerce")
    return df.set_index("date")[[series_id]]

# --- Pull series ---
vix    = fetch_fred("VIXCLS")
hy_oas = fetch_fred("BAMLH0A0HYM2")
ig_oas = fetch_fred("BAMLC0A0CM")
cfnai  = fetch_fred("CFNAI")
term   = fetch_fred("T10Y3M")

# --- Resample / derive ---
vix = vix.resample("ME").mean().rename(columns={"VIXCLS":"VIX"})

credit = pd.concat([hy_oas, ig_oas], axis=1).resample("ME").last()
credit = credit.rename(columns={"BAMLH0A0HYM2":"HY_OAS","BAMLC0A0CM":"IG_OAS"})
credit["HY_IG_SPREAD"] = credit["HY_OAS"] - credit["IG_OAS"]
credit["HY_IG_SPREAD_CHG_1M"] = credit["HY_IG_SPREAD"].diff()

cfnai.index = pd.to_datetime(cfnai.index) + pd.offsets.MonthEnd(0)
cfnai["CFNAI_3MMA"] = cfnai["CFNAI"].rolling(3, min_periods=1).mean()

term = term.resample("ME").last().rename(columns={"T10Y3M":"TERM_10Y_3M"})

# --- Merge final monthly panel ---
macro = vix.join(
    [credit[["HY_OAS","IG_OAS","HY_IG_SPREAD","HY_IG_SPREAD_CHG_1M"]],
     cfnai[["CFNAI","CFNAI_3MMA"]], term],
    how="inner"
).dropna()

print("Macro shape:", macro.shape)
print(macro.tail())


Macro shape: (305, 8)
                  VIX  HY_OAS  IG_OAS  HY_IG_SPREAD  HY_IG_SPREAD_CHG_1M  \
date                                                                       
2025-02-28  16.968000    2.87    0.88          1.99                 0.13   
2025-03-31  21.841429    3.55    0.97          2.58                 0.59   
2025-04-30  31.966190    3.94    1.09          2.85                 0.27   
2025-05-31  20.462273    3.32    0.92          2.40                -0.45   
2025-06-30  18.403333    2.96    0.86          2.10                -0.30   

            CFNAI  CFNAI_3MMA  TERM_10Y_3M  
date                                        
2025-02-28   0.39    0.120000        -0.08  
2025-03-31   0.15    0.050000        -0.09  
2025-04-30  -0.41    0.043333        -0.14  
2025-05-31  -0.16   -0.140000         0.05  
2025-06-30  -0.10   -0.223333        -0.17  


In [15]:
macro.head()

Unnamed: 0_level_0,VIX,HY_OAS,IG_OAS,HY_IG_SPREAD,HY_IG_SPREAD_CHG_1M,CFNAI,CFNAI_3MMA,TERM_10Y_3M
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2000-02-29,23.5955,5.08,1.29,3.79,0.14,-0.22,-0.02,0.64
2000-03-31,22.718261,5.75,1.48,4.27,0.48,0.45,0.136667,0.15
2000-04-30,27.164211,5.88,1.61,4.27,0.0,0.43,0.22,0.41
2000-05-31,26.373182,6.16,1.77,4.39,0.12,-0.34,0.18,0.66
2000-06-30,21.54,6.17,1.71,4.46,0.07,0.13,0.073333,0.15


In [16]:
macro.to_csv("C:/Users/woota/OneDrive/Desktop/새 폴더/Personal projects/HMM ML strategy/improved HMM based factor strategy/macro_data.xlsx")

## Downlading Factor Data

In [19]:
# 02_factor_panel_full_revised_capw_lagged_FIXED.py
import pandas as pd
import numpy as np
import wrds

# =========================
# Settings
# =========================
START = "2000-01-01"       # collect only from 2010 onward
START_YEAR = 2000          # FUNDQ loop start (see note below)
END_FUNDQ_YEAR = 2025      # inclusive
WINSORIZE = True           # within-month winsorize + median impute for features

# NOTE: For full 12m TTM coverage starting exactly in 2010-01,
# consider START_YEAR = 2008 to pull earlier quarters.

# =========================
# Connect (prompts once)
# =========================
db = wrds.Connection()

# =========================
# CRSP monthly (prices, returns, volume)
# =========================
msf = db.raw_sql(f"""
    select permno, permco, date, ret, prc, shrout, vol
    from crsp.msf
    where date >= '{START}'
""")
msf["date"] = pd.to_datetime(msf["date"]) + pd.offsets.MonthEnd(0)
msf["prc"] = msf["prc"].abs()
msf["mktcap"] = (msf["prc"] * msf["shrout"] * 1000).replace([np.inf, -np.inf], np.nan)  # shrout thousands -> $
msf["ret"] = pd.to_numeric(msf["ret"], errors="coerce")

# =========================
# Delisting returns → total returns
# =========================
msed = db.raw_sql(f"""
    select permno, dlstdt as date, dlret
    from crsp.msedelist
    where dlstdt >= '{START}'
""")
msed["date"] = pd.to_datetime(msed["date"]) + pd.offsets.MonthEnd(0)
msed["dlret"] = pd.to_numeric(msed["dlret"], errors="coerce")

msf = msf.merge(msed, on=["permno","date"], how="left")
msf["ret_total"] = (1.0 + msf["ret"].fillna(0.0)) * (1.0 + msf["dlret"].fillna(0.0)) - 1.0
msf = msf.drop(columns=["dlret"])

# =========================
# CCM link (CRSP ↔ Compustat), keep primary links
# =========================
ccm = db.raw_sql("""
    select gvkey, lpermno as permno, linkprim, linkdt, linkenddt
    from crsp.ccmxpf_linktable
    where lpermno is not null
""")
ccm = ccm[ccm["linkprim"].isin(["P","C"])].copy()
ccm["linkdt"]    = pd.to_datetime(ccm["linkdt"].fillna("1900-01-01"))
ccm["linkenddt"] = pd.to_datetime(ccm["linkenddt"].fillna("2099-12-31"))

# =========================
# S&P 500 membership periods
# =========================
spx = db.raw_sql(f"""
    select permno, start, ending
    from crsp.dsp500list
    where start <= current_date
      and (ending is null or ending >= '{START}')
""")
spx["start"]  = pd.to_datetime(spx["start"]) + pd.offsets.MonthEnd(0)
spx["ending"] = pd.to_datetime(spx["ending"]).fillna(pd.Timestamp("2100-12-31")) + pd.offsets.MonthEnd(0)

# Universe: in S&P 500 at some point AND linked to Compustat
keep_permnos = set(spx["permno"].unique()) & set(ccm["permno"].unique())
msf = msf[msf["permno"].isin(keep_permnos)].copy()

# =========================
# Restrict to SPX-active months per permno
# =========================
panel = msf.merge(spx[["permno","start","ending"]], on="permno", how="inner")
panel = panel[(panel["date"] >= panel["start"]) & (panel["date"] <= panel["ending"])]
panel = panel.drop(columns=["start","ending"]).sort_values(["permno","date"]).reset_index(drop=True)

# Clean numerics
for col in ["ret_total","mktcap","prc","vol"]:
    panel[col] = pd.to_numeric(panel[col], errors="coerce").replace([np.inf, -np.inf], np.nan)

# =========================
# Market return: cap-weighted (t-1 weights, start-of-month)
# =========================
panel["mktcap_lag"] = panel.groupby("permno")["mktcap"].shift(1)

cw = panel[["date","ret_total","mktcap_lag"]].copy()
cw = cw[cw["ret_total"].notna() & cw["mktcap_lag"].notna() & (cw["mktcap_lag"] > 0)]

num = cw.assign(wret=cw["ret_total"] * cw["mktcap_lag"]).groupby("date")["wret"].sum()
den = cw.groupby("date")["mktcap_lag"].sum()
mkt_ret = (num / den).to_frame(name="mkt_ret")  # S&P 500 cap-weighted total return proxy (panel universe)

panel = panel.merge(mkt_ret, left_on="date", right_index=True, how="left")
panel["excess_ret"] = panel["ret_total"] - panel["mkt_ret"]

# =========================
# Price-based factors (leakage-safe)
# =========================
ret_shift = panel.groupby("permno")["ret_total"].shift(1)

panel["mom_12m_ex1m"] = (
    (1 + ret_shift)
    .groupby(panel["permno"])
    .rolling(11, min_periods=8)
    .apply(lambda x: np.prod(x) - 1, raw=False)
    .reset_index(level=0, drop=True)
)

panel["mom_6m_ex1m"] = (
    (1 + ret_shift)
    .groupby(panel["permno"])
    .rolling(5, min_periods=4)
    .apply(lambda x: np.prod(x) - 1, raw=False)
    .reset_index(level=0, drop=True)
)

panel["vol_3m"] = (
    panel.groupby("permno")["ret_total"]
         .shift(1)
         .rolling(3, min_periods=2)
         .std()
         .reset_index(level=0, drop=True)
)

panel["log_mktcap"] = np.log(panel["mktcap_lag"].replace(0, np.nan))
panel["dollar_vol"] = ((panel["prc"] * panel["vol"]).groupby(panel["permno"]).shift(1))

# =========================
# Compustat FUNDQ (quarterly), +3m publication lag
# =========================
fundq_parts = []
for y in range(START_YEAR, END_FUNDQ_YEAR + 1):   # starts at 2010 per your request
    dfy = db.raw_sql(f"""
        select q.gvkey, q.datadate, q.fyearq, q.fqtr,
               q.atq, q.ltq, q.seqq, q.ceqq, q.txditcq,
               q.saleq, q.cogsq, q.oibdpq, q.ibq,
               q.actq, q.lctq, q.cheq,
               q.rectq, q.invtq, q.apq,
               q.dlcq, q.dlttq, q.ppentq,
               c.lpermno as permno, c.linkdt, coalesce(c.linkenddt, date '2099-12-31') as linkenddt
        from comp.fundq q
        join crsp.ccmxpf_linktable c
          on q.gvkey = c.gvkey
         and c.linkprim in ('P','C')
        join crsp.dsp500list s
          on c.lpermno = s.permno
        where q.datadate >= date '{y}-01-01'
          and q.datadate <  date '{y+1}-01-01'
          and (q.datadate + interval '3 months') between c.linkdt and coalesce(c.linkenddt, date '2099-12-31')
          and s.start <= current_date
          and (s.ending is null or s.ending >= date '{START}')
    """)
    if len(dfy):
        fundq_parts.append(dfy)
        print(f"FUNDQ {y}: {dfy.shape}")

fundq = pd.concat(fundq_parts, ignore_index=True) if fundq_parts else pd.DataFrame()
if not fundq.empty:
    fundq["datadate"] = pd.to_datetime(fundq["datadate"]) + pd.offsets.MonthEnd(0)
    fundq = fundq.sort_values(["gvkey","datadate"]).reset_index(drop=True)

    # TTM sums (min_periods=2 to start estimates earlier but still robust)
    for col in ["ibq","saleq","oibdpq","cogsq"]:
        fundq[col + "_ttm"] = (
            fundq.groupby("gvkey")[col]
                 .rolling(4, min_periods=2)
                 .sum()
                 .reset_index(level=0, drop=True)
        )

    # Book equity proxy and guards for division
    fundq["be"] = fundq["seqq"].fillna(0) + fundq["txditcq"].fillna(0)
    fundq[["atq","be","saleq_ttm"]] = fundq[["atq","be","saleq_ttm"]].replace(0, np.nan)

    fundq["earnings_yield"]   = fundq["ibq_ttm"] / fundq["atq"]
    fundq["pb"]               = fundq["be"] / fundq["atq"]              # book/asset proxy (inverse of P/B)
    fundq["roe"]              = fundq["ibq_ttm"] / fundq["ceqq"]
    fundq["gross_margin"]     = (fundq["saleq_ttm"] - fundq["cogsq_ttm"]) / fundq["saleq_ttm"]
    fundq["operating_margin"] = fundq["oibdpq_ttm"] / fundq["saleq_ttm"]
    fundq["leverage"]         = fundq["ltq"] / fundq["atq"]
    fundq["accruals_ratio"]   = (fundq["actq"] - fundq["lctq"] - fundq["cheq"]) / fundq["atq"]

    # Apply +3m publication lag, then merge to monthly by permno & date
    fundq["datadate_lag"] = fundq["datadate"] + pd.offsets.MonthEnd(3)
    fcols = ["permno","datadate_lag","earnings_yield","pb","roe",
             "gross_margin","operating_margin","leverage","accruals_ratio"]
    fund_ready = fundq[fcols].drop_duplicates(["permno","datadate_lag"])

    panel = panel.merge(fund_ready, left_on=["permno","date"],
                        right_on=["permno","datadate_lag"], how="left") \
                 .drop(columns=["datadate_lag"])
else:
    for c in ["earnings_yield","pb","roe","gross_margin","operating_margin","leverage","accruals_ratio"]:
        panel[c] = np.nan

panel = panel.sort_values(["permno","date"]).reset_index(drop=True)

# Carry fundamentals forward (LOCF ≤ 4 months) and clean inf
ff_cols = ["earnings_yield","pb","roe","gross_margin","operating_margin","leverage","accruals_ratio"]
panel[ff_cols] = (
    panel.groupby("permno")[ff_cols]
         .apply(lambda df: df.ffill(limit=4))
         .reset_index(level=0, drop=True)
)
panel[ff_cols] = panel[ff_cols].replace([np.inf, -np.inf], np.nan)

# =========================
# Optional: within-month winsorize & median impute (features only)
# =========================
x_cols = ["mom_12m_ex1m","mom_6m_ex1m","vol_3m","log_mktcap","dollar_vol",
          "earnings_yield","pb","roe","gross_margin","operating_margin","leverage","accruals_ratio"]

if WINSORIZE:
    def _wclip(s):
        s = pd.to_numeric(s, errors="coerce")
        if s.notna().sum() >= 20:
            lo, hi = s.quantile(0.01), s.quantile(0.99)
            return s.clip(lo, hi)
        return s.clip(lower=s.quantile(0.001), upper=s.quantile(0.999))
    for c in x_cols:
        panel[c] = panel.groupby("date")[c].transform(_wclip)
    for c in x_cols:
        panel[c] = panel.groupby("date")[c].transform(lambda s: s.fillna(s.median()))

# =========================
# Risk-free (monthly) from Fama-French
# =========================
try:
    # Primary: ff.factors_monthly with real date column
    ff = db.raw_sql(f"""
        select date, rf
        from ff.factors_monthly
        where date >= '{START}'
    """)
    if not len(ff):
        # Fallback: if 'date' is stored as yyyymm integer
        ff = db.raw_sql("""
            select cast(to_date(cast(date as text) || '01','YYYYMMDD') as date) as date, rf
            from ff.factors_monthly
        """)
        ff = ff[ff["date"] >= pd.to_datetime(START)]
    ff["date"] = pd.to_datetime(ff["date"]) + pd.offsets.MonthEnd(0)
    ff = ff.drop_duplicates("date").sort_values("date")
    # Convert percent → decimal if needed
    if ff["rf"].abs().mean() > 0.5:
        ff["rf"] = ff["rf"] / 100.0
    rf_m = ff.rename(columns={"rf":"rf_1m"})
    panel = panel.merge(rf_m, on="date", how="left")
    print("Attached monthly risk-free (rf_1m) from ff.factors_monthly.")
except Exception as e:
    print(f"[Info] Could not attach risk-free from WRDS FF table: {e}")
    panel["rf_1m"] = np.nan  # keep column for downstream code

# =========================
# Targets for modeling/backtests (t → t+1)
# =========================
mkt_by_month = panel[["date","mkt_ret"]].drop_duplicates().sort_values("date")
mkt_by_month["spx_ret_next"] = mkt_by_month["mkt_ret"].shift(-1)

panel["excess_ret_next"] = panel.groupby("permno")["excess_ret"].shift(-1)
panel = panel.merge(mkt_by_month[["date","spx_ret_next"]], on="date", how="left")

# =========================
# Final output
# =========================
keep_cols = ["permno","date",
             "excess_ret","excess_ret_next",
             "mkt_ret","spx_ret_next","rf_1m"] + x_cols

factor_panel = panel[keep_cols].sort_values(["permno","date"]).reset_index(drop=True)

print("Preview:")
print(factor_panel.head(10))
print("Shape:", factor_panel.shape)
print("Non-null counts:\n", factor_panel.count())
print("Months with NaN mkt_ret:", panel.groupby("date")["mkt_ret"].first().isna().sum())
print("Rows with NaN excess_ret:", factor_panel["excess_ret"].isna().sum())

# ===== Optional: save =====
# factor_panel.to_parquet("factor_panel.parquet", index=False)
# factor_panel.to_csv("factor_panel.csv", index=False)


Enter your WRDS username [woota]: chriswt
Enter your password: ········


WRDS recommends setting up a .pgpass file.


Create .pgpass file now [y/n]?:  y


Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done
FUNDQ 2000: (3331, 25)
FUNDQ 2001: (3321, 25)
FUNDQ 2002: (3300, 25)
FUNDQ 2003: (3296, 25)
FUNDQ 2004: (3310, 25)
FUNDQ 2005: (3304, 25)
FUNDQ 2006: (3247, 25)
FUNDQ 2007: (3214, 25)
FUNDQ 2008: (3153, 25)
FUNDQ 2009: (3096, 25)
FUNDQ 2010: (3091, 25)
FUNDQ 2011: (3074, 25)
FUNDQ 2012: (3077, 25)
FUNDQ 2013: (3083, 25)
FUNDQ 2014: (3073, 25)
FUNDQ 2015: (3037, 25)
FUNDQ 2016: (2961, 25)
FUNDQ 2017: (2922, 25)
FUNDQ 2018: (2884, 25)
FUNDQ 2019: (2855, 25)
FUNDQ 2020: (2781, 25)
FUNDQ 2021: (2734, 25)
FUNDQ 2022: (2691, 25)
FUNDQ 2023: (2675, 25)
FUNDQ 2024: (2661, 25)
FUNDQ 2025: (1315, 25)


  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, out=out, keepdims=keepdims)
  return np.nanmean(a, axis, ou

Attached monthly risk-free (rf_1m) from ff.factors_monthly.
Preview:
   permno       date  excess_ret  excess_ret_next   mkt_ret  spx_ret_next  \
0   10078 2000-01-31        <NA>         0.229868      <NA>     -0.017458   
1   10078 2000-02-29    0.229868        -0.114943 -0.017458      0.098703   
2   10078 2000-03-31   -0.114943         0.012552  0.098703     -0.031395   
3   10078 2000-04-30    0.012552        -0.144401 -0.031395     -0.022152   
4   10078 2000-05-31   -0.144401         0.161127 -0.022152      0.025659   
5   10078 2000-06-30    0.161127          0.17194  0.025659      -0.01249   
6   10078 2000-07-31     0.17194         0.141357  -0.01249      0.062555   
7   10078 2000-08-31    0.141357        -0.028168  0.062555     -0.052088   
8   10078 2000-09-30   -0.028168        -0.046389 -0.052088     -0.003932   
9   10078 2000-10-31   -0.046389        -0.235519 -0.003932     -0.078461   

    rf_1m  mom_12m_ex1m  mom_6m_ex1m    vol_3m  log_mktcap        dollar_vol  \
0  

In [20]:
factor_panel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150541 entries, 0 to 150540
Data columns (total 19 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   permno            150541 non-null  Int64         
 1   date              150541 non-null  datetime64[ns]
 2   excess_ret        150037 non-null  Float64       
 3   excess_ret_next   149461 non-null  Float64       
 4   mkt_ret           150037 non-null  Float64       
 5   spx_ret_next      150038 non-null  Float64       
 6   rf_1m             150541 non-null  Float64       
 7   mom_12m_ex1m      146516 non-null  float64       
 8   mom_6m_ex1m       148535 non-null  float64       
 9   vol_3m            150541 non-null  float64       
 10  log_mktcap        150037 non-null  Float64       
 11  dollar_vol        150037 non-null  Float64       
 12  earnings_yield    147522 non-null  Float64       
 13  pb                149036 non-null  Float64       
 14  roe 

In [21]:
factor_panel.to_csv("C:/Users/woota/OneDrive/Desktop/새 폴더/Personal projects/HMM ML strategy/improved HMM based factor strategy/factor_panel_data.xlsx")