<a href="https://colab.research.google.com/github/yuxinl915/10701_proj_macro_var_selection/blob/main/data_lasso_selection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install wrds

Collecting wrds
  Downloading wrds-3.4.0-py3-none-any.whl.metadata (5.7 kB)
Collecting packaging<=24.2 (from wrds)
  Downloading packaging-24.2-py3-none-any.whl.metadata (3.2 kB)
Collecting psycopg2-binary<2.10,>=2.9 (from wrds)
  Downloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (4.9 kB)
Downloading wrds-3.4.0-py3-none-any.whl (14 kB)
Downloading packaging-24.2-py3-none-any.whl (65 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m65.5/65.5 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading psycopg2_binary-2.9.11-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (4.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.2/4.2 MB[0m [31m68.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: psycopg2-binary, packaging, wrds
  Attempting uninstall: packaging
    Found existing installation: packaging 25.0
    Uninstalling packaging-25.0:
      Successfully uninstall

In [1]:
!pip install fredapi

Collecting fredapi
  Downloading fredapi-0.5.2-py3-none-any.whl.metadata (5.0 kB)
Downloading fredapi-0.5.2-py3-none-any.whl (11 kB)
Installing collected packages: fredapi
Successfully installed fredapi-0.5.2


In [2]:
import wrds
import pandas as pd
from fredapi import Fred
from google.colab import userdata

## Connect to WRDS
wrds_db = wrds.Connection()

## Get FRED api key from Colab secrets
fred_api_key = userdata.get('FRED_API_KEY')
fred = Fred(api_key=fred_api_key)

Enter your WRDS username [root]:yimengs
Enter your password:··········
WRDS recommends setting up a .pgpass file.
Create .pgpass file now [y/n]?: y
Created .pgpass file successfully.
You can create this file yourself at any time with the create_pgpass_file() function.
Loading library list...
Done


In [3]:
## Documentation on all of these characteristics can be found here: https://jkpfactors.s3.amazonaws.com/documents/Documentation.pdf

# Take characteristics associated with fama-french factors - Size, B/M, Operating Profit, Asset Growth, Momentum (1-3 months - "randomly" chosen)
sql_query = f"""
    SELECT eom, ret_exc_lead1m, me, be_me, ope_be, at_gr1, ret_3_1
    FROM contrib.global_factor
    WHERE common=1 and exch_main=1 and primary_sec=1 and obs_main=1 and
    excntry='USA' and eom > '1990-01-01'
"""

data = wrds_db.raw_sql(sql_query)
data = data.sort_values([ 'eom'])
data

Unnamed: 0,eom,ret_exc_lead1m,me,be_me,ope_be,at_gr1,ret_3_1
48875,1990-01-31,0.052322,56.34075,0.518985,-0.006566,0.021864,0.137931
164067,1990-01-31,-0.022485,83.475,,,,-0.038776
62211,1990-01-31,-0.03523,2.567,1.635762,0.263634,0.080055,-0.026316
164075,1990-01-31,-0.020743,2.76375,1.498327,-0.03574,0.010914,-0.268293
62234,1990-01-31,-0.005818,4.432,,,,0.090667
...,...,...,...,...,...,...,...
78653,2025-02-28,,403.572,,,,
78654,2025-02-28,,5161.08975,,0.058305,0.078388,
78655,2025-02-28,,14210.21535,,-0.781149,-0.133325,
78657,2025-02-28,,228.298,,,,1.893878


In [4]:
data_clean=data.dropna()

In [5]:
data_clean.shape

(1729357, 7)

In [6]:
import numpy as np
# 2. Define the 20 macro series (FRED codes) --------------------------------
codes = {
    # Interest rates
    "TB3MS": "TB3MS",        # 3-Month T-Bill
    "GS1": "GS1",            # 1Y Treasury
    "GS5": "GS5",            # 5Y Treasury
    "GS10": "GS10",          # 10Y Treasury
    "GS30": "GS30",          # 30Y Treasury

    # Credit & spreads inputs
    "BAA": "BAA",            # Baa yield
    "AAA": "AAA",            # Aaa yield

    # Prices / inflation
    "CPI": "CPIAUCSL",       # CPI (headline)
    "CPICORE": "CPILFESL",   # Core CPI
    "PCEPI": "PCEPI",        # PCE price index

    # Real activity
    "INDPRO": "INDPRO",      # Industrial production
    "TCU": "TCU",            # Capacity utilization
    "RPI": "W875RX1",        # Real personal income

    # Labor market
    "UNRATE": "UNRATE",      # Unemployment rate
    "ICSA": "ICSA",          # Initial claims (weekly)

    # Money & policy
    "M2": "M2SL",            # M2 money stock
    "FEDFUNDS": "FEDFUNDS",  # Federal funds rate

    # Vol & commodities
    "VIX": "VIXCLS",         # VIX index (daily)
    "OIL": "DCOILWTICO",     # WTI oil price (daily)

    # Employment (replacing NAPM)
    "PAYEMS": "PAYEMS"       # Nonfarm payrolls
}


start_date = "1990-01-01"

raw = {}
for name, code in codes.items():
    s = fred.get_series(code, observation_start=start_date)
    raw[name] = s

macro_raw = pd.DataFrame(raw).sort_index()

# 4. Convert everything to MONTHLY frequency --------------------------------
# For rates/levels, a common choice is *end-of-month* values.
# For weekly/daily series (ICSA, VIX, OIL) this effectively picks the last obs of each month.
macro_m = macro_raw.resample("M").last()

# 5. Create derived macro features ------------------------------------------

# Term spread & default spread (in percentage points)
macro_m["term_spread"] = macro_m["GS10"] - macro_m["TB3MS"]
macro_m["def_spread"] = macro_m["BAA"] - macro_m["AAA"]

# Headline inflation (monthly log-diff of CPI)
macro_m["inflation"] = np.log(macro_m["CPI"]).diff()

# Core inflation (optional)
macro_m["core_inflation"] = np.log(macro_m["CPICORE"]).diff()

# Real short rate proxy: Fed funds minus inflation
macro_m["real_ff"] = macro_m["FEDFUNDS"] - (macro_m["inflation"] * 12)  # annualize inflation

# Real money growth: log-diff of M2
macro_m["m2_growth"] = np.log(macro_m["M2"]).diff()

# Industrial production growth
macro_m["indpro_growth"] = np.log(macro_m["INDPRO"]).diff()

# Real personal income growth
macro_m["rpi_growth"] = np.log(macro_m["RPI"]).diff()

# Payroll employment growth
macro_m["payems_growth"] = np.log(macro_m["PAYEMS"]).diff()

# Oil price growth
macro_m["oil_ret"] = np.log(macro_m["OIL"]).diff()

# 6. Drop initial NaNs from differences --------------------------------------
macro_m = macro_m.dropna()

feature_cols = [
    # original levels
    "TB3MS", "GS1", "GS5", "GS10", "GS30",
    "BAA", "AAA",
    "UNRATE", "ICSA",
    "M2", "FEDFUNDS",
    "VIX",
    "OIL",
    "INDPRO", "TCU", "RPI", "PAYEMS",
    "CPI", "CPICORE", "PCEPI",

    # derived features
    "term_spread",
    "def_spread",
    "inflation",
    "core_inflation",
    "real_ff",
    "m2_growth",
    "indpro_growth",
    "rpi_growth",
    "payems_growth",
    "oil_ret"
]

macro_X = macro_m[feature_cols].dropna()


  macro_m = macro_raw.resample("M").last()


In [7]:
macro_X.shape

(427, 30)

In [8]:
macro_X = macro_X.reset_index().rename(columns={"index": "date"})


In [9]:
data_clean['eom'] = pd.to_datetime(data_clean['eom'])
print(data_clean['eom'].dtype)

datetime64[ns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_clean['eom'] = pd.to_datetime(data_clean['eom'])


In [10]:
data_full = data_clean.merge(macro_X, left_on='eom', right_on='date')
data_full = data_full.drop(columns=["date"])
print(data_full.shape)
data_full.head()


(1724828, 37)


Unnamed: 0,eom,ret_exc_lead1m,me,be_me,ope_be,at_gr1,ret_3_1,TB3MS,GS1,GS5,...,term_spread,def_spread,inflation,core_inflation,real_ff,m2_growth,indpro_growth,rpi_growth,payems_growth,oil_ret
0,1990-02-28,0.006494,3210.898125,0.905136,0.301758,0.123881,-0.023697,7.74,8.11,8.42,...,0.73,0.92,0.003914,0.004532,8.193033,0.003908,0.009041,0.003557,0.002195,-0.051548
1,1990-02-28,-0.00638,7.965625,0.359545,-0.394553,0.001343,0.0,7.74,8.11,8.42,...,0.73,0.92,0.003914,0.004532,8.193033,0.003908,0.009041,0.003557,0.002195,-0.051548
2,1990-02-28,0.24362,4.1425,0.256608,2.003763,-0.161535,-0.222222,7.74,8.11,8.42,...,0.73,0.92,0.003914,0.004532,8.193033,0.003908,0.009041,0.003557,0.002195,-0.051548
3,1990-02-28,-0.20638,1.267734,0.594423,-0.621413,-0.368524,0.2,7.74,8.11,8.42,...,0.73,0.92,0.003914,0.004532,8.193033,0.003908,0.009041,0.003557,0.002195,-0.051548
4,1990-02-28,0.009245,84.432,1.309373,0.230324,0.281382,0.052537,7.74,8.11,8.42,...,0.73,0.92,0.003914,0.004532,8.193033,0.003908,0.009041,0.003557,0.002195,-0.051548


In [11]:
data_full.to_csv('data.csv.gz', compression="gzip", index=False)