In [2]:
import pyarrow
from typing import Dict, List, Iterable

import numpy as np
import pandas as pd
import re
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
example_text_data = pd.read_pickle("../../data/text_data/text_us_2005.pkl")
example_text_data

Unnamed: 0,date,cik,file_type,rf,mgmt,gvkey,cusip,year
177795,20050103,16099,10Q,,Item 2 Management s Discussion and Analysis of...,6831.0,549282101,2005
177791,20050103,779544,10K,,Item 7. Management's Discussion and Analysis o...,11872.0,040712101,2005
177794,20050103,831641,10K,,Item 7 \n \n Management's Discussion and Analy...,24783.0,88162G103,2005
177790,20050103,866415,10K,,ITEM 7. Management's Discussion and Analysis o...,61721.0,459412102,2005
177793,20050103,1141240,10Q,,Item\n 2 Management s Discussion and Analysis ...,146117.0,53634X100,2005
...,...,...,...,...,...,...,...,...
195708,20051229,1100983,10K,,Item 7. Management s Discussion and Analysis\n...,133506.0,71086E107,2005
195726,20051229,1122668,10K,ITEM 1A. RISK FACTORS\n\n This Report contains...,ITEM 7. MANAGEMENT'S DISCUSSION AND ANALYSIS O...,141007.0,68382T101,2005
195718,20051229,1310094,10K,,ITEM 7. MANAGEMENT S DISCUSSION AND ANALYSIS O...,162956.0,00430L103,2005
195714,20051229,1311396,10K,Item 1A. \n\nRisk Factors ITEM 1A. Risk Factor...,Item\n 7. \n\nManagement s\n Discussion and An...,165666.0,05381A105,2005


In [3]:
YEAR = 2005
FEATURE = "mgmt"
link_table = "../../data/cik_gvkey_linktable_USA_only.csv"
meta_file = f"../../data/tfidf_global/{YEAR}_mgmt_meta.pkl"
embedding_data = f"../../data/embeddings/2005_mgmt_embeddings.pkl"
parquet_data = f"../../data/ret_sample.parquet"

df_linktable = pd.read_csv(link_table)
embedding_df = pd.read_pickle(embedding_data)



In [9]:
df_quant = pd.read_parquet(parquet_data, engine='fastparquet')

In [10]:
df_quant = df_quant[df_quant['date'].dt.year == YEAR]


In [5]:
embedding_df['date'] = pd.to_datetime(embedding_df['date'], format='%Y%m%d')
embedding_df['date'] = embedding_df['date'].dt.to_period('M').astype(str)
embedding_df

Unnamed: 0,cik,date,file_type,mgmt_embedding_0,mgmt_embedding_1,mgmt_embedding_2,mgmt_embedding_3,mgmt_embedding_4,mgmt_embedding_5,mgmt_embedding_6,...,mgmt_embedding_758,mgmt_embedding_759,mgmt_embedding_760,mgmt_embedding_761,mgmt_embedding_762,mgmt_embedding_763,mgmt_embedding_764,mgmt_embedding_765,mgmt_embedding_766,mgmt_embedding_767
0,16099,2005-01,10Q,-0.305792,1.343399,-0.492189,-0.916062,0.661892,-0.360169,0.522189,...,0.476813,0.198960,-1.062229,-0.646398,-0.497701,-0.068906,0.082313,-0.573916,-0.645826,-0.032842
1,779544,2005-01,10K,-0.805109,1.195119,-0.030410,-0.856439,0.841005,0.278821,1.048616,...,1.213318,0.181607,-0.576749,-0.552399,-0.338223,-0.997854,0.054023,-0.287226,0.613801,0.092089
2,831641,2005-01,10K,-0.281814,1.105873,-0.295080,-0.254377,0.597247,-0.781304,0.520388,...,0.426295,0.854512,-0.748180,-1.039514,-0.605629,-0.419541,-0.103496,-0.391366,-0.518760,0.080490
3,866415,2005-01,10K,-0.113428,1.588657,-1.088950,-0.477637,0.694590,-0.552957,0.679156,...,0.613856,0.217191,-0.813693,-0.762072,-0.266510,-0.064579,0.219902,-0.715407,-0.728836,0.293196
4,1141240,2005-01,10Q,-0.073954,1.071652,-1.075287,-0.267978,0.377497,-0.957005,0.392173,...,0.308893,0.252338,-0.415474,-0.918528,-0.455256,-0.024974,0.110255,-0.743280,-0.786598,0.270175
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16852,1100983,2005-12,10K,-0.190206,1.051906,-0.336857,-0.547836,0.729207,-0.749525,0.428613,...,0.579640,0.786797,-0.942979,-0.958498,-0.530043,-0.275732,-0.214479,-0.320618,-0.372840,-0.061662
16853,1122668,2005-12,10K,-0.085501,0.982872,-0.855243,-0.180038,1.147978,-0.247654,0.418326,...,0.601210,-0.198217,-0.202060,-0.629187,-0.573010,-0.182263,0.313790,-0.327598,-0.284698,0.143504
16854,1310094,2005-12,10K,-0.132223,0.962541,-1.276241,-0.458970,0.433190,-0.832395,0.108932,...,0.090750,0.400165,-0.457431,-0.641406,-0.388500,0.364269,-0.049641,-0.723917,-0.774582,0.463443
16855,1311396,2005-12,10K,-0.626112,1.351202,-0.801620,-0.264799,0.554653,-0.323592,-0.013122,...,1.018066,0.593177,-0.521210,-0.736404,-0.467550,-0.339667,-0.050310,-0.530077,-0.244436,0.131554


In [6]:
# df_linktable['datadate'] 
df_linktable['datadate'] = pd.to_datetime(df_linktable['datadate'])
df_linktable['date'] = df_linktable['datadate'].dt.to_period('M').astype(str)
df_linktable.drop(columns=['datadate'], inplace=True) 
df_linktable

Unnamed: 0,gvkey,iid,tic,cusip,conm,tpci,cik,date
0,1003,01,ANTQ,000354100,A.A. IMPORTING CO INC,0,730052.0,2005-01
1,1004,01,AIR,000361105,AAR CORP,0,1750.0,2005-01
2,1009,01,ABSI.1,000781104,ABS INDUSTRIES INC,0,313368.0,2005-01
3,1013,01,ADCT.1,000886309,ADC TELECOMMUNICATIONS INC,0,61478.0,2005-01
4,1019,01,AFAP,001038108,AFA PROTECTIVE SYSTEMS INC,0,2668.0,2005-01
...,...,...,...,...,...,...,...,...
4458162,364383,01,ITGLF,45829L107,INTEGRAL METALS CORP,0,2021206.0,2025-07
4458163,364659,01,AAGAF,827719105,SILVER47 EXPLORATION CORP,0,2079414.0,2025-07
4458164,364659,01C,AGA.,827719105,SILVER47 EXPLORATION CORP,0,2079414.0,2025-07
4458165,364873,01,TUNGF,030338107,AMERICAN TUNGSTEN CORP,0,2049560.0,2025-07


In [11]:

df_quant['date'] = df_quant['date'].dt.to_period('M').astype(str)
df_quant


Unnamed: 0,id,date,ret_eom,gvkey,iid,excntry,stock_ret,year,month,char_date,...,betadown_252d,prc_highprc_252d,corr_1260d,betabab_1260d,rmax5_rvol_21d,age,qmj,qmj_prof,qmj_growth,qmj_safety
0,comp_001081_01C,2005-02,20050228,1081.0,01C,CAN,-0.143457,2005,2,20050131,...,0.779315,0.672204,0.387781,0.845865,0.805580,541,-1.508294,-0.994164,-0.832048,-1.017248
1,comp_001096_01C,2005-02,20050228,1096.0,01C,CAN,0.028077,2005,2,20050131,...,0.445162,0.937664,0.245148,0.456872,0.923214,517,-0.706080,-0.247574,-0.155802,-0.485635
2,comp_001117_02,2005-02,20050228,1117.0,02,USA,-0.168627,2005,2,20050131,...,1.073565,0.708333,0.124188,0.863334,0.898113,373,1.344458,1.601108,1.612067,-0.566631
3,comp_001166_01W,2005-02,20050228,1166.0,01W,NLD,0.086271,2005,2,20050131,...,1.781000,0.676545,0.560895,1.560202,1.342814,289,-1.355529,-0.904719,-0.999531,-1.231687
4,comp_001186_01C,2005-02,20050228,1186.0,01C,CAN,0.149056,2005,2,20050131,...,1.326215,0.774557,0.174888,0.399060,0.777183,385,1.123762,0.154734,1.196690,0.939661
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250310,crsp_92655,2005-12,20051231,10903.0,01,USA,0.038089,2005,12,20051130,...,0.734778,0.981633,0.295617,0.587895,1.559511,275,1.510260,1.345720,0.588637,1.188777
250311,crsp_92663,2005-12,20051231,16684.0,01,USA,0.042566,2005,12,20051130,...,1.412106,0.778558,0.318999,0.731467,1.513323,251,-0.803415,-1.314961,0.116605,-0.010656
250312,crsp_92807,2005-12,20051231,17269.0,01,USA,-0.087500,2005,12,20051130,...,0.878509,0.821918,0.135384,0.430993,1.263568,275,-0.910804,-1.200617,-0.601459,0.372950
250313,crsp_92874,2005-12,20051231,11169.0,01,USA,-0.101033,2005,12,20051130,...,0.673885,0.885696,0.045843,0.143423,0.350424,263,-0.329781,-0.261120,0.434766,-0.457529


In [8]:
def pack_embeddings(df: pd.DataFrame, feature: str = "mgmt", out_col: str | None = None,
                    dtype=np.float32):
    """
    - Finds columns like '{feature}_embedding_0', '{feature}_embedding_1', ...
    - Sorts them by the numeric suffix
    - Packs them into a single array column
    - Drops the original embedding columns
    Returns: (df_packed, X, emb_cols)
      - df_packed: original df with new array column and without the per-dim cols
      - X: 2D numpy array, shape (n_rows, n_dims)
      - emb_cols: list of embedding column names in the order used
    """
    prefix = f"{feature}_embedding_"
    pattern = re.compile(rf"^{re.escape(prefix)}(\d+)$")

    # collect (index, colname) pairs
    idx_cols = []
    for c in df.columns:
        m = pattern.match(c)
        if m:
            idx_cols.append((int(m.group(1)), c))

    if not idx_cols:
        raise ValueError(f"No embedding columns found with prefix '{prefix}'")

    # correct numeric order
    idx_cols.sort(key=lambda t: t[0])
    emb_cols = [c for _, c in idx_cols]

    # build dense matrix
    X = df.loc[:, emb_cols].to_numpy(dtype=dtype, copy=False)

    # pack into a single column of arrays
    arr_col = out_col or f"{feature}_embedding"
    df_packed = df.drop(columns=emb_cols).copy()
    df_packed[arr_col] = list(X)

    return df_packed, X, emb_cols


packed_df, X, used_cols = pack_embeddings(embedding_df, feature="mgmt")


In [9]:
packed_df

Unnamed: 0,cik,date,file_type,mgmt_embedding
0,16099,2005-01,10Q,"[-0.30579212, 1.3433989, -0.49218938, -0.91606..."
1,779544,2005-01,10K,"[-0.8051092, 1.1951189, -0.03041017, -0.856438..."
2,831641,2005-01,10K,"[-0.2818144, 1.1058731, -0.2950802, -0.2543772..."
3,866415,2005-01,10K,"[-0.11342792, 1.5886573, -1.0889504, -0.477637..."
4,1141240,2005-01,10Q,"[-0.073953986, 1.0716524, -1.0752872, -0.26797..."
...,...,...,...,...
16852,1100983,2005-12,10K,"[-0.1902061, 1.051906, -0.33685672, -0.5478362..."
16853,1122668,2005-12,10K,"[-0.08550067, 0.9828718, -0.8552429, -0.180037..."
16854,1310094,2005-12,10K,"[-0.13222286, 0.9625406, -1.2762413, -0.458969..."
16855,1311396,2005-12,10K,"[-0.62611175, 1.3512015, -0.80161995, -0.26479..."


In [10]:
# Make sure both CIK columns are integers (nullable if NaN possible)
packed_df['cik'] = packed_df['cik'].astype('Int64')
df_linktable['cik'] = df_linktable['cik'].astype('Int64')

# Merge on CIK
joined_df = packed_df.merge(
    df_linktable,
    left_on=['cik', 'date'],
    right_on=['cik', 'date'],
    how='left',
    suffixes=('', '_link'),
)

joined_df


Unnamed: 0,cik,date,file_type,mgmt_embedding,gvkey,iid,tic,cusip,conm,tpci
0,16099,2005-01,10Q,"[-0.30579212, 1.3433989, -0.49218938, -0.91606...",6831.0,01,LUB,549282101,LUBYS INC,0
1,779544,2005-01,10K,"[-0.8051092, 1.1951189, -0.03041017, -0.856438...",11872.0,01,ARKR,040712101,ARK RESTAURANTS CORP,0
2,831641,2005-01,10K,"[-0.2818144, 1.1058731, -0.2950802, -0.2543772...",24783.0,01,TTEK,88162G103,TETRA TECH INC,0
3,866415,2005-01,10K,"[-0.11342792, 1.5886573, -1.0889504, -0.477637...",61721.0,01,IDWK,459412102,INTL DISPLAYWORKS INC,0
4,1141240,2005-01,10Q,"[-0.073953986, 1.0716524, -1.0752872, -0.26797...",146117.0,01,LQMT,53634X100,LIQUIDMETAL TECHNOLOGIES,0
...,...,...,...,...,...,...,...,...,...,...
17811,1100983,2005-12,10K,"[-0.1902061, 1.051906, -0.33685672, -0.5478362...",133506.0,01,PCBI,71086E107,PEOPLES CMNTY BANCORP INC,0
17812,1122668,2005-12,10K,"[-0.08550067, 0.9828718, -0.8552429, -0.180037...",141007.0,01,OCPI,68382T101,OPTICAL COMMUNICATION PRODS,0
17813,1310094,2005-12,10K,"[-0.13222286, 0.9625406, -1.2762413, -0.458969...",162956.0,01,ABPI,00430L103,ACCENTIA BIOPHARMACEUTICALS,0
17814,1311396,2005-12,10K,"[-0.62611175, 1.3512015, -0.80161995, -0.26479...",165666.0,01,AVZAQ,05381A105,AVIZA TECHNOLOGY INC,0


In [11]:
joined_quant_df = joined_df.merge(
    df_quant,
    left_on=['gvkey', 'date'],
    right_on=['gvkey', 'date'],
    how='left',
    suffixes=('', '_link'),
)


In [12]:
joined_quant_df = joined_quant_df.dropna(subset=['ni_be'])
joined_quant_df_selected = joined_quant_df[['cik', 'date', 'file_type', 'mgmt_embedding', 'ni_be']]

training_data_10K = joined_quant_df_selected[joined_quant_df_selected['file_type'] == '10K'].drop(columns='file_type')
training_data_10Q = joined_quant_df_selected[joined_quant_df_selected['file_type'] == '10Q'].drop(columns='file_type')


In [13]:
training_data_10K

Unnamed: 0,cik,date,mgmt_embedding,ni_be
264,6284,2005-02,"[-0.34999558, 1.3695177, -0.57309455, -0.23077...",0.022688
285,796343,2005-02,"[-0.22411665, 1.2341002, -0.8456317, -0.210903...",0.326916
296,1090061,2005-02,"[-0.13306156, 0.7813514, -0.5372533, -1.045033...",-1.090630
318,860748,2005-02,"[-0.17241207, 1.1899269, -0.5319485, -0.637415...",0.107652
319,878436,2005-02,"[-0.017556176, 1.4124045, -1.1808175, -0.27374...",-0.866514
...,...,...,...,...
18040,949298,2005-12,"[-0.6540725, 0.58407414, -0.08652241, 0.270447...",0.032125
18041,1060390,2005-12,"[0.11361144, 0.68153894, -1.0635124, -0.121429...",-0.533749
18042,1082735,2005-12,"[-0.4125821, 1.2409935, -0.6302631, -0.2123343...",-0.142285
18043,1100983,2005-12,"[-0.1902061, 1.051906, -0.33685672, -0.5478362...",0.037442


In [14]:
training_data_10Q

Unnamed: 0,cik,date,mgmt_embedding,ni_be
265,72911,2005-02,"[-0.50434804, 1.337575, -0.1841943, 0.05173105...",-0.034289
266,72911,2005-02,"[-0.50434804, 1.337575, -0.1841943, 0.05173105...",-0.034289
267,72911,2005-02,"[-0.50434804, 1.337575, -0.1841943, 0.05173105...",-0.034289
268,72911,2005-02,"[-0.50434804, 1.337575, -0.1841943, 0.05173105...",-0.034289
269,108312,2005-02,"[-0.19345373, 0.9730646, -0.6990452, -0.323367...",0.081330
...,...,...,...,...
17999,1048911,2005-12,"[-0.3775842, 1.412785, -0.6041254, -0.02556156...",0.134241
18002,59255,2005-12,"[-0.17686169, 1.0587453, -0.2672164, -0.366042...",0.025615
18004,72162,2005-12,"[-0.3734501, 0.6532782, -0.13757151, -0.452354...",0.089801
18018,868512,2005-12,"[-0.1467689, 1.6154906, -0.7466608, -0.5513613...",-1.076985
