In [24]:
# --- Imports
import re
import numpy as np
import pandas as pd
import os

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.feature_selection import SelectKBest, mutual_info_regression
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error


In [25]:
FEATURE = "mgmt"
SRC_DIR = "../../data/text_data"
DST_DIR = "../../data/tfidf_global"
BASE_DIR = "../../data"
FEATURE  = "mgmt"
QUANT_PARQUET   = f"{BASE_DIR}/ret_sample.parquet"
LINKTABLE_CSV = f"{BASE_DIR}/cik_gvkey_linktable_USA_only.csv"

os.makedirs(DST_DIR, exist_ok=True)

def getTextData(start_year: int, end_year: int, src_dir: str = SRC_DIR) -> pd.DataFrame:
    """
    Load and concatenate yearly text data into a single DataFrame.
    
    Args:
        start_year (int): Starting year (inclusive).
        end_year (int): Ending year (inclusive).
        src_dir (str): Directory where the .pkl files are stored.
    
    Returns:
        pd.DataFrame: Combined dataframe of all years.
    """
    frames = []
    for year in range(start_year, end_year + 1):  # inclusive
        path = os.path.join(src_dir, f"text_us_{year}.pkl")
        if os.path.exists(path):
            df_year = pd.read_pickle(path)
            frames.append(df_year)
        else:
            print(f"File not found: {path}")

    if frames:
        return pd.concat(frames, ignore_index=True)
    else:
        return pd.DataFrame()  # empty if nothing found
    
# --- Light text cleaner (no heavy deps)
_clean_re = re.compile(r"[^a-z\s]+")
def clean_text(s: str) -> str:
    if not isinstance(s, str):
        return ""
    s = s.lower()
    s = _clean_re.sub(" ", s)          # keep letters/spaces only
    s = re.sub(r"\s+", " ", s).strip() # normalize whitespace
    return s


def prepare_linktable(df_link: pd.DataFrame) -> pd.DataFrame:
    df = df_link.copy()
    df["datadate"] = pd.to_datetime(df["datadate"])
    df["date"] = df["datadate"].dt.to_period("M").astype(str)
    df.drop(columns=["datadate"], inplace=True)
    df["cik"] = df["cik"].astype("Int64")
    return df


def prepare_quant_panel(quant_path: str, year: int) -> pd.DataFrame:
    dfq = pd.read_parquet(quant_path, engine="fastparquet")
    dfq = dfq[dfq["date"].dt.year == year].copy()
    dfq["date"] = dfq["date"].dt.to_period("M").astype(str)
    return dfq



In [44]:
df = getTextData(2005, 2005)
df

Unnamed: 0,date,cik,file_type,rf,mgmt,gvkey,cusip,year
0,20050103,16099,10Q,,Item 2 Management s Discussion and Analysis of...,6831.0,549282101,2005
1,20050103,779544,10K,,Item 7. Management's Discussion and Analysis o...,11872.0,040712101,2005
2,20050103,831641,10K,,Item 7 \n \n Management's Discussion and Analy...,24783.0,88162G103,2005
3,20050103,866415,10K,,ITEM 7. Management's Discussion and Analysis o...,61721.0,459412102,2005
4,20050103,1141240,10Q,,Item\n 2 Management s Discussion and Analysis ...,146117.0,53634X100,2005
...,...,...,...,...,...,...,...,...
16852,20051229,1100983,10K,,Item 7. Management s Discussion and Analysis\n...,133506.0,71086E107,2005
16853,20051229,1122668,10K,ITEM 1A. RISK FACTORS\n\n This Report contains...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,141007.0,68382T101,2005
16854,20051229,1310094,10K,,ITEM 7. MANAGEMENT S DISCUSSION AND ANALYSIS O...,162956.0,00430L103,2005
16855,20051229,1311396,10K,Item 1A. \n\nRisk Factors ITEM 1A. Risk Factor...,Item\n 7. \n\nManagement s\n Discussion and An...,165666.0,05381A105,2005


In [45]:
df["date"] = pd.to_datetime(df["date"], format="%Y%m%d")
df["date"] = df["date"].dt.to_period("M").astype(str)
df = df.drop(columns=["rf"], axis=0)
df

Unnamed: 0,date,cik,file_type,mgmt,gvkey,cusip,year
0,2005-01,16099,10Q,Item 2 Management s Discussion and Analysis of...,6831.0,549282101,2005
1,2005-01,779544,10K,Item 7. Management's Discussion and Analysis o...,11872.0,040712101,2005
2,2005-01,831641,10K,Item 7 \n \n Management's Discussion and Analy...,24783.0,88162G103,2005
3,2005-01,866415,10K,ITEM 7. Management's Discussion and Analysis o...,61721.0,459412102,2005
4,2005-01,1141240,10Q,Item\n 2 Management s Discussion and Analysis ...,146117.0,53634X100,2005
...,...,...,...,...,...,...,...
16852,2005-12,1100983,10K,Item 7. Management s Discussion and Analysis\n...,133506.0,71086E107,2005
16853,2005-12,1122668,10K,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,141007.0,68382T101,2005
16854,2005-12,1310094,10K,ITEM 7. MANAGEMENT S DISCUSSION AND ANALYSIS O...,162956.0,00430L103,2005
16855,2005-12,1311396,10K,Item\n 7. \n\nManagement s\n Discussion and An...,165666.0,05381A105,2005


In [51]:
quant_df = prepare_quant_panel(QUANT_PARQUET, year=2005)
quant_df


Unnamed: 0,id,date,ret_eom,gvkey,iid,excntry,stock_ret,year,month,char_date,...,betadown_252d,prc_highprc_252d,corr_1260d,betabab_1260d,rmax5_rvol_21d,age,qmj,qmj_prof,qmj_growth,qmj_safety
0,comp_001081_01C,2005-02,20050228,1081.0,01C,CAN,-0.143457,2005,2,20050131,...,0.779315,0.672204,0.387781,0.845865,0.805580,541,-1.508294,-0.994164,-0.832048,-1.017248
1,comp_001096_01C,2005-02,20050228,1096.0,01C,CAN,0.028077,2005,2,20050131,...,0.445162,0.937664,0.245148,0.456872,0.923214,517,-0.706080,-0.247574,-0.155802,-0.485635
2,comp_001117_02,2005-02,20050228,1117.0,02,USA,-0.168627,2005,2,20050131,...,1.073565,0.708333,0.124188,0.863334,0.898113,373,1.344458,1.601108,1.612067,-0.566631
3,comp_001166_01W,2005-02,20050228,1166.0,01W,NLD,0.086271,2005,2,20050131,...,1.781000,0.676545,0.560895,1.560202,1.342814,289,-1.355529,-0.904719,-0.999531,-1.231687
4,comp_001186_01C,2005-02,20050228,1186.0,01C,CAN,0.149056,2005,2,20050131,...,1.326215,0.774557,0.174888,0.399060,0.777183,385,1.123762,0.154734,1.196690,0.939661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250310,crsp_92655,2005-12,20051231,10903.0,01,USA,0.038089,2005,12,20051130,...,0.734778,0.981633,0.295617,0.587895,1.559511,275,1.510260,1.345720,0.588637,1.188777
250311,crsp_92663,2005-12,20051231,16684.0,01,USA,0.042566,2005,12,20051130,...,1.412106,0.778558,0.318999,0.731467,1.513323,251,-0.803415,-1.314961,0.116605,-0.010656
250312,crsp_92807,2005-12,20051231,17269.0,01,USA,-0.087500,2005,12,20051130,...,0.878509,0.821918,0.135384,0.430993,1.263568,275,-0.910804,-1.200617,-0.601459,0.372950
250313,crsp_92874,2005-12,20051231,11169.0,01,USA,-0.101033,2005,12,20051130,...,0.673885,0.885696,0.045843,0.143423,0.350424,263,-0.329781,-0.261120,0.434766,-0.457529


In [47]:
linktable = pd.read_csv(LINKTABLE_CSV)
linktable = prepare_linktable(linktable)
linktable

Unnamed: 0,gvkey,iid,tic,cusip,conm,tpci,cik,date
0,1003,01,ANTQ,000354100,A.A. IMPORTING CO INC,0,730052,2005-01
1,1004,01,AIR,000361105,AAR CORP,0,1750,2005-01
2,1009,01,ABSI.1,000781104,ABS INDUSTRIES INC,0,313368,2005-01
3,1013,01,ADCT.1,000886309,ADC TELECOMMUNICATIONS INC,0,61478,2005-01
4,1019,01,AFAP,001038108,AFA PROTECTIVE SYSTEMS INC,0,2668,2005-01
...,...,...,...,...,...,...,...,...
4458162,364383,01,ITGLF,45829L107,INTEGRAL METALS CORP,0,2021206,2025-07
4458163,364659,01,AAGAF,827719105,SILVER47 EXPLORATION CORP,0,2079414,2025-07
4458164,364659,01C,AGA.,827719105,SILVER47 EXPLORATION CORP,0,2079414,2025-07
4458165,364873,01,TUNGF,030338107,AMERICAN TUNGSTEN CORP,0,2049560,2025-07


In [63]:
df_10K = df[df['file_type'] == '10K']
df_10Q = df[df['file_type'] == '10Q']

In [64]:

joined_10K = df_10K.merge(
    linktable,
    on=["cik", "date"],
    how="left",
    suffixes=("", "_linktable"),
)
joined_10K = joined_10K.merge(
    quant_df[["gvkey", "date", "ni_be"]],
    on=["gvkey", "date"],
    how="left",
)
joined_10K = joined_10K.dropna(subset=["ni_be"])

final_10K = joined_10K[["cik", "gvkey", "date", "file_type", FEATURE, "ni_be"]]
final_10K

Unnamed: 0,cik,gvkey,date,file_type,mgmt,ni_be
90,6284,1633.0,2005-02,10K,Item 7. Management s Discussion and Analysis o...,0.022688
92,796343,12540.0,2005-02,10K,Item 7. \n \n Management's Discussion and Anal...,0.326916
93,1090061,124254.0,2005-02,10K,Item 7. \n \n Management s Discussion and Anal...,-1.090630
94,860748,21503.0,2005-02,10K,Item 7. \n Management s Discussion and Analysi...,0.107652
95,878436,117905.0,2005-02,10K,ITEM\n7. MANAGEMENT S DISCUSSION AND ANALYSIS ...,-0.866514
...,...,...,...,...,...,...
4568,949298,61404.0,2005-12,10K,Item 7. Management s Discussion and Analysis o...,0.032125
4569,1060390,112623.0,2005-12,10K,Item 7. \n\nManagement s Discussion and Analys...,-0.533749
4570,1082735,121659.0,2005-12,10K,Item 7. Management's Discussion and Analysis o...,-0.142285
4571,1100983,133506.0,2005-12,10K,Item 7. Management s Discussion and Analysis\n...,0.037442


In [65]:

joined_10Q = df_10Q.merge(
    linktable,
    on=["cik", "date"],
    how="left",
    suffixes=("", "_linktable"),
)
joined_10Q = joined_10Q.merge(
    quant_df[["gvkey", "date", "ni_be"]],
    on=["gvkey", "date"],
    how="left",
)
joined_10Q = joined_10Q.dropna(subset=["ni_be"])

final_10Q = joined_10Q[["cik", "gvkey", "date", "file_type", FEATURE, "ni_be"]]
final_10Q

Unnamed: 0,cik,gvkey,date,file_type,mgmt,ni_be
182,70793,7798.0,2005-02,10Q,ITEM\n 2. Management s Discussion and\n Analys...,0.158770
183,72911,7980.0,2005-02,10Q,ITEM 2. \n\nManagement s Discussion and Analys...,-0.034289
184,72911,7980.0,2005-02,10Q,ITEM 2. \n\nManagement s Discussion and Analys...,-0.034289
185,72911,7980.0,2005-02,10Q,ITEM 2. \n\nManagement s Discussion and Analys...,-0.034289
186,72911,7980.0,2005-02,10Q,ITEM 2. \n\nManagement s Discussion and Analys...,-0.034289
...,...,...,...,...,...,...
13683,1048911,4598.0,2005-12,10Q,ITEM 2. \n\nManagement s\n Discussion and Anal...,0.134241
13684,59255,12578.0,2005-12,10Q,Item 2. Management's Discussion and Analysis o...,0.025615
13685,72162,7658.0,2005-12,10Q,Item 2. Management's Discussion and Analysis o...,0.089801
13686,868512,30152.0,2005-12,10Q,Item 2 \n\nManagement s Discussion and Analysi...,-1.076985


In [66]:
VEC_PARAMS = dict(
    min_df=0.002,           # keep tokens appearing in >=0.2% of docs (or use an int like 20)
    max_df=0.85,            # drop tokens in >85% of docs (boilerplate)
    ngram_range=(1, 2),     # unigrams + bigrams
    max_features=100_000,   # cap vocab; adjust upward if you have RAM
    strip_accents="unicode",
    stop_words="english",
    sublinear_tf=True,
    norm=None,              # don't L2 normalize here (we'll normalize after SVD)
)
SVD_DIM = 256               # dense vector size you’ll get at the end
K_FEATURES = 40_000         # supervised token selection BEFORE SVD (used only if y is present)


In [67]:
def make_text_vector_pipeline(has_target: bool) -> Pipeline:
    tfidf = TfidfVectorizer(preprocessor=clean_text, **VEC_PARAMS)

    steps = [("tfidf", tfidf)]

    if has_target:
        # Select the most predictive tokens w.r.t. ni_be (supervised feature selection)
        steps.append(("select", SelectKBest(mutual_info_regression, k=min(K_FEATURES, VEC_PARAMS["max_features"]))))

    # Reduce to a compact fixed-length vector & L2 normalize
    steps.extend([
        ("svd", TruncatedSVD(n_components=SVD_DIM, random_state=42)),
        ("l2", Normalizer(copy=False))
    ])

    return Pipeline(steps)


In [72]:
df

Unnamed: 0,date,cik,file_type,mgmt,gvkey,cusip,year
0,2005-01,16099,10Q,Item 2 Management s Discussion and Analysis of...,6831.0,549282101,2005
1,2005-01,779544,10K,Item 7. Management's Discussion and Analysis o...,11872.0,040712101,2005
2,2005-01,831641,10K,Item 7 \n \n Management's Discussion and Analy...,24783.0,88162G103,2005
3,2005-01,866415,10K,ITEM 7. Management's Discussion and Analysis o...,61721.0,459412102,2005
4,2005-01,1141240,10Q,Item\n 2 Management s Discussion and Analysis ...,146117.0,53634X100,2005
...,...,...,...,...,...,...,...
16852,2005-12,1100983,10K,Item 7. Management s Discussion and Analysis\n...,133506.0,71086E107,2005
16853,2005-12,1122668,10K,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,141007.0,68382T101,2005
16854,2005-12,1310094,10K,ITEM 7. MANAGEMENT S DISCUSSION AND ANALYSIS O...,162956.0,00430L103,2005
16855,2005-12,1311396,10K,Item\n 7. \n\nManagement s\n Discussion and An...,165666.0,05381A105,2005


In [74]:
# Assume your dataframe is called final_10K and has a 'mgmt' column of text
assert "mgmt" in final_10K.columns, "Expected a 'mgmt' column."

has_target = "ni_be" in final_10K.columns

if has_target:
    X_text = final_10K["mgmt"].astype(str)
    y = final_10K["ni_be"].astype(float).values
    X_train, X_test, y_train, y_test = train_test_split(X_text, y, test_size=0.2, random_state=42)
else:
    X_train = final_10K["mgmt"].astype(str)
    X_test  = None

pipe = make_text_vector_pipeline(has_target=has_target)
pipe.fit(X_train, y_train if has_target else None)

# Dense, fixed-length vectors
Z_train = pipe.transform(X_train)
print("Vector shape (train):", Z_train.shape)

if X_test is not None:
    Z_test = pipe.transform(X_test)
    print("Vector shape (test):", Z_test.shape)


Vector shape (train): (2998, 256)
Vector shape (test): (750, 256)


In [69]:
Z_train

array([[ 0.3007739 ,  0.072041  , -0.12593679, ..., -0.0039779 ,
         0.03657787,  0.02622641],
       [ 0.26959815,  0.0214102 , -0.16425062, ..., -0.03554562,
        -0.01037088, -0.00307035],
       [ 0.27663075,  0.01698303,  0.0017476 , ..., -0.00740929,
         0.04623623, -0.04354295],
       ...,
       [ 0.72981344, -0.09345256, -0.0404947 , ...,  0.02232543,
        -0.01174948,  0.02408612],
       [ 0.65243857, -0.08630008, -0.14919515, ...,  0.00233057,
         0.01479491,  0.02302135],
       [ 0.69688303,  0.09407221, -0.28031189, ...,  0.01265241,
         0.01877241,  0.01301656]], shape=(16857, 256))

In [70]:
# 1) Vocabulary size after min_df/max_df
tfidf = pipe.named_steps["tfidf"]
print("Vocabulary size:", len(tfidf.vocabulary_))

# 2) Show top tokens by IDF (lowest IDF = most frequent; highest IDF = rare)
idf = tfidf.idf_
feat_names = np.array(tfidf.get_feature_names_out())

top_freq_idx = np.argsort(idf)[:25]
top_rare_idx = np.argsort(idf)[-25:]

print("\nMost frequent tokens/phrases:")
print(feat_names[top_freq_idx])

print("\nRarest tokens (kept after min_df filtering):")
print(feat_names[top_rare_idx])

# 3) If supervised selection was used, show top tokens by mutual information weight
if has_target and "select" in pipe.named_steps:
    selector = pipe.named_steps["select"]
    # scores_ is aligned with tfidf feature order
    scores = selector.scores_
    keep_mask = selector.get_support()
    top_k_idx = np.argsort(scores[keep_mask])[::-1][:25]
    kept_names = feat_names[keep_mask]
    print("\nTop 25 selected tokens by mutual information (predictive):")
    print(kept_names[top_k_idx])


Vocabulary size: 100000

Most frequent tokens/phrases:
['period' 'primarily' 'factors' 'certain' 'increased' 'result' 'following'
 'compared' 'costs' 'expense' 'new' 'current' 'liquidity' 'quarter'
 'financial statements' 'general' 'cost' 'market' 'additional' 'assets'
 'total' 'information' 'include' 'approximately' 'included']

Rarest tokens (kept after min_df filtering):
['hematology' 'ipod' 'origination sales' 'station group'
 'containing products' 'rocket' 'level activities' 'eda' 'acquired place'
 'events held' 'eden' 'ams' 'nuts' 'income tenant' 'barr' 'nymex prices'
 'point impact' 'genetic testing' 'net license' 'gas margins'
 'networking market' 'geneva' 'pigment' 'encryption products' 'ltl']


In [71]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

if has_target:
    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42, max_depth=20)
    rf.fit(Z_train, y_train)
    y_pred = rf.predict(Z_test)

    rmse = mean_squared_error(y_test, y_pred, squared=False)
    mae  = mean_absolute_error(y_test, y_pred)
    r2   = r2_score(y_test, y_pred)

    print(f"RF on LSA vectors — RMSE: {rmse:.4f} | MAE: {mae:.4f} | R²: {r2:.4f}")
