In [None]:
import pandas as pd

In [None]:
# Mount Google Drive at /content/drive (Colab-only helper) so the project
# files referenced by the paths below are reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os

output_directory = '/content/drive/MyDrive/ML_Project/Earning_calls_embedding'

# Sanity check: report whether the embeddings folder is visible from this
# session and whether the path really is a directory.
if not os.path.exists(output_directory):
    print(f"Path '{output_directory}' DOES NOT exist.")
else:
    print(f"Path '{output_directory}' exists.")
    directory_found = os.path.isdir(output_directory)
    if directory_found:
        print("And it is a directory.")
    else:
        print("But it is NOT a directory (it's a file or something else).")

Path '/content/drive/MyDrive/ML_Project/Earning_calls_embedding' exists.
And it is a directory.


In [None]:
# Load the two input tables. The CUSIP identifier columns are read as strings
# explicitly: left to type inference, an all-digit CUSIP can be parsed as an
# integer, which drops leading zeros and makes the later `.str[:8]` slice
# return NaN for those rows.
compustat = pd.read_csv(
    '/content/drive/MyDrive/ML_Project/compustat/filtered_compustat_char.csv',
    dtype={'cusip': str},
)
target = pd.read_csv(
    '/content/drive/MyDrive/ML_Project/target/target_data.csv',
    dtype={'cusip': str, 'hdrcusip': str},
)

In [None]:
# Count missing values per column in the Compustat table.
compustat.isna().sum()

Unnamed: 0,0
gvkey,0
datadate,0
cusip,0
capxy,0
chechy,0
cshfdy,0
cshpry,0
dltry,0
dpcy,0
epspxy,0


In [None]:
# Inspect column dtypes of the target table (note `date` is still a plain object column here).
target.dtypes

Unnamed: 0,0
permno,int64
hdrcusip,object
cusip,object
ticker,object
industry,int64
date,object
ret,float64
ret_market,float64
ret_now_sign,float64
ret_lag1,float64


In [None]:
# Truncate each Compustat CUSIP to its first 8 characters
# (presumably to line up with the 8-character header CUSIP in the target
# table — TODO confirm). Slicing is safe to repeat.
compustat['cusip'] = compustat['cusip'].str[:8]

# Use the header CUSIP as the firm identifier on the target side.
# Guarded on 'hdrcusip' so the cell is idempotent: the previous in-place
# drop + rename, when run a second time, dropped the freshly renamed 'cusip'
# column and left the table without any identifier.
if 'hdrcusip' in target.columns:
    target = (
        target
        .drop(columns=['cusip'])
        .rename(columns={'hdrcusip': 'cusip'})
    )

# Both time keys must be real datetimes for the as-of merge that follows.
compustat['datadate'] = pd.to_datetime(compustat['datadate'])
target['date'] = pd.to_datetime(target['date'])

In [None]:
# merge_asof requires each frame to be ordered by its time key, and the `by`
# key to have the same dtype on both sides, so sort and cast CUSIP to str.
target = target.sort_values('date').assign(
    cusip=lambda frame: frame['cusip'].astype(str)
)
compustat = compustat.sort_values('datadate').assign(
    cusip=lambda frame: frame['cusip'].astype(str)
)

# Firms are matched exactly on CUSIP; gvkey only exists on the Compustat side
# and is carried into the result as an ordinary column.
# For every target row, take the most recent Compustat row whose `datadate`
# is STRICTLY BEFORE `date` (backward search, exact-date matches excluded).
# NOTE(review): this treats `datadate` as the day the figures became known —
# confirm that is appropriate for this dataset.
asof_options = dict(
    left_on='date',             # time key of the left (target) frame
    right_on='datadate',        # time key of the right (Compustat) frame
    by='cusip',                 # exact-match firm identifier
    direction='backward',       # look back in time only
    allow_exact_matches=False,  # same-day rows do not count as a match
)
merged_df = pd.merge_asof(left=target, right=compustat, **asof_options)

In [None]:
# Keep only target rows for which the as-of merge found a Compustat record.
has_fundamentals = merged_df['datadate'].notna()
merged_df = merged_df[has_fundamentals]

In [None]:
# List the columns of the merged target + Compustat frame.
merged_df.columns

Index(['permno', 'cusip', 'ticker', 'industry', 'date', 'ret', 'ret_market',
       'ret_now_sign', 'ret_lag1', 'ret_trend1_sign', 'ret_cum_forward3m',
       'ret_cum_forward3m_sign', 'ret_cum_forward12m',
       'ret_cum_forward12m_sign', 'ret_cum_backward3m', 'ret_cum_backward6m',
       'ret_cum_backward12m', 'gvkey', 'datadate', 'capxy', 'chechy', 'cshfdy',
       'cshpry', 'dltry', 'dpcy', 'epspxy', 'oibdpy', 'txty'],
      dtype='object')

In [None]:
# (rows, columns) of the merged frame after dropping unmatched rows.
merged_df.shape

(2015071, 28)

In [None]:
# Persist the merged target/Compustat frame so later sessions can skip the
# steps above. The file name must be the one the reload cell below reads
# ("merged.csv"); writing under a different name makes a fresh run reload a
# stale or missing file.
target_folder = "/content/drive/MyDrive/ML_Project/combined"
os.makedirs(target_folder, exist_ok=True)
output_path_csv = os.path.join(target_folder, "merged.csv")
merged_df.to_csv(output_path_csv, index=False)

# Start from here after the first run: the merged CSV already exists on Drive,
# so the cells above do not need to be executed again.

In [None]:
# Reload the merged frame saved on Drive. `cusip` is pinned to str so the CSV
# round trip cannot re-infer all-digit identifiers as integers (which would
# drop leading zeros and leave a mixed-type column).
merged_df = pd.read_csv(
    '/content/drive/MyDrive/ML_Project/combined/merged.csv',
    dtype={'cusip': str},
)

In [None]:
# Confirm the reloaded frame has the same (rows, columns) as before the export.
merged_df.shape

(2015071, 28)

In [None]:
import pandas as pd
import glob
import os

# --- Step 1: Load and Combine all Parquet files ---

# Folder holding the embedding shards and the glob pattern that selects them.
path = '/content/drive/MyDrive/ML_Project/Earning_calls_embedding'
file_pattern = os.path.join(path, 'text_embeddings_part_*.parquet')

# Sorted so the shards are always concatenated in the same order.
parquet_files = sorted(glob.glob(file_pattern))

# Fail loudly when nothing matches: merely printing a message would leave
# `embeddings_df` undefined and surface later as an unrelated NameError.
if not parquet_files:
    raise FileNotFoundError(
        f"No Parquet files found matching '{file_pattern}'. "
        "Check your path and file pattern."
    )

print(f"Found {len(parquet_files)} files to load.")
print(parquet_files)

# Read every shard, then stack them into one frame. ignore_index=True gives
# the combined frame a clean 0..n-1 index instead of repeating each shard's
# own row labels.
list_of_dfs = [pd.read_parquet(f) for f in parquet_files]
embeddings_df = pd.concat(list_of_dfs, ignore_index=True)

print(f"\nCombined Embeddings DataFrame shape: {embeddings_df.shape}")
print("Embeddings DataFrame columns:", embeddings_df.columns.tolist())
print("Embeddings DataFrame dtypes:\n", embeddings_df.dtypes)

Found 9 files to load.
['/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_000.parquet', '/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_001.parquet', '/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_002.parquet', '/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_003.parquet', '/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_004.parquet', '/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_005.parquet', '/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_006.parquet', '/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_007.parquet', '/content/drive/MyDrive/ML_Project/Earning_calls_embedding/text_embeddings_part_008.parquet']

Combined Embeddings DataFrame shape: (216274, 4)
Embeddings DataFrame columns: ['Date', 'transcriptid', 'gvkey', 'embedding_vecto

In [None]:
# Columns available in the combined embeddings frame.
embeddings_df.columns

Index(['Date', 'transcriptid', 'gvkey', 'embedding_vector'], dtype='object')

In [None]:
# Disabled: optional export of the combined embeddings frame to CSV.
# Uncomment the four lines below to write embeddings.csv to the combined folder.
# target_folder = "/content/drive/MyDrive/ML_Project/combined"
# os.makedirs(target_folder, exist_ok=True)
# output_path_csv = os.path.join(target_folder, "embeddings.csv")
# embeddings_df.to_csv(output_path_csv, index=False)

In [None]:
import pandas as pd

# Inputs: `merged_df` (target + Compustat rows) and `embeddings_df` (one row
# per transcript embedding), both created by earlier cells.

# Optional cap on how far back a matched embedding may lie, e.g.
# pd.Timedelta(days=120). None keeps the original unbounded look-back.
# NOTE(review): the date inspection further down shows 2024 months paired
# with 2020 report dates, so a finite limit is probably wanted — confirm the
# intended window before changing this.
MAX_EMBEDDING_AGE = None

# --- Step 1: Prepare Both DataFrames ---

# Single, idempotent rename. The earlier two-step in-place rename
# ('Date' -> 'date' -> 'report_date') raised KeyError when the cell was run a
# second time, because neither 'Date' nor 'date' existed any more.
embeddings_df = embeddings_df.rename(
    columns={'Date': 'report_date', 'date': 'report_date'}
)

# 1a. Ensure the date columns are proper datetime objects.
print("Converting date columns to datetime objects...")
left_dates = pd.to_datetime(merged_df['date'])
right_dates = pd.to_datetime(embeddings_df['report_date'])

# 1b. merge_asof rejects time keys whose datetime resolution differs, so force
#     both to nanosecond precision. The `by` key must also share a dtype,
#     hence the integer cast of gvkey on both sides.
print("Standardizing datetime precision to nanoseconds (ns)...")
merged_df = merged_df.assign(
    date=left_dates.astype('datetime64[ns]'),
    gvkey=merged_df['gvkey'].astype(int),
)
embeddings_df = embeddings_df.assign(
    report_date=right_dates.astype('datetime64[ns]'),
    gvkey=embeddings_df['gvkey'].astype(int),
)

# 1c. IMPORTANT: both frames must be sorted by their own time key.
print("Sorting both DataFrames by date...")
merged_df = merged_df.sort_values('date')
embeddings_df = embeddings_df.sort_values('report_date')

# --- Step 2: Perform the As-of Merge ---

print("Performing the as-of merge...")
final_df = pd.merge_asof(
    left=merged_df,
    right=embeddings_df,
    left_on='date',               # time key of the left frame
    right_on='report_date',       # time key of the embeddings frame
    by='gvkey',                   # exact-match firm identifier
    direction='backward',         # latest report_date STRICTLY BEFORE `date` ...
    allow_exact_matches=False,    # ... because same-day rows are excluded
    tolerance=MAX_EMBEDDING_AGE,  # None = no limit on the look-back
)

# --- Step 3: Inspect the Results ---

print("\nMerge complete. Final DataFrame shape:", final_df.shape)
print(final_df.head())

Converting date columns to datetime objects...
Standardizing datetime precision to nanoseconds (ns)...
Sorting both DataFrames by date...
Performing the as-of merge...

Merge complete. Final DataFrame shape: (2015071, 31)
   permno     cusip ticker  industry       date       ret  ret_market  \
0   82160  23221710   CBUK         0 2000-02-29 -0.129944   -0.020108   
1   26201  20582620   CMTL         0 2000-02-29  0.343284   -0.020108   
2   79560  95766W10    SBG         0 2000-02-29  0.009197   -0.020108   
3   10423  83606610   SUND         0 2000-02-29 -0.040541   -0.020108   
4   83896  74621T20   PURE         0 2000-02-29  0.573770   -0.020108   

   ret_now_sign  ret_lag1  ret_trend1_sign  ...  cshfdy  cshpry  dltry   dpcy  \
0          -1.0 -0.268595              1.0  ...   9.880   9.670  0.244  2.780   
1           1.0  0.135593              1.0  ...   5.120   4.438  1.322  0.817   
2           1.0  0.009197              NaN  ...   4.984   4.456  0.153  1.101   
3          -1.0

In [None]:
# Keep only the rows for which the as-of merge found an embedding.
has_embedding = final_df['embedding_vector'].notna()
final_df_matched = final_df[has_embedding]

In [None]:
# Missing values per column among the rows that have an embedding.
final_df_matched.isna().sum()

Unnamed: 0,0
permno,0
cusip,0
ticker,2956
industry,0
date,0
ret,0
ret_market,0
ret_now_sign,2417
ret_lag1,378
ret_trend1_sign,785


In [None]:
# Compare the three dates on each row: `date` (left key of both merges),
# `datadate` (matched Compustat row) and `report_date` (matched embedding row).
# NOTE(review): the tail of this output pairs 2024 dates with 2020 report
# dates, i.e. the embedding match has no age limit — confirm this is intended.
final_df_matched[['date', 'datadate','report_date']]

Unnamed: 0,date,datadate,report_date
736971,2010-01-29,2009-11-30,2010-01-07
736984,2010-01-29,2009-12-31,2010-01-28
737002,2010-01-29,2009-11-30,2010-01-06
737014,2010-01-29,2009-12-31,2010-01-28
737036,2010-01-29,2009-12-31,2010-01-28
...,...,...,...
2015057,2024-12-31,2024-09-30,2020-11-11
2015059,2024-12-31,2024-09-30,2020-11-10
2015063,2024-12-31,2024-10-31,2020-12-02
2015069,2024-12-31,2024-09-30,2020-10-28


In [None]:
# Re-check missing values per column before dropping columns and rows below.
final_df_matched.isna().sum()

Unnamed: 0,0
permno,0
cusip,0
ticker,2956
industry,0
date,0
ret,0
ret_market,0
ret_now_sign,2417
ret_lag1,378
ret_trend1_sign,785


In [None]:
# Drop the `ticker` column. Reassign rather than use inplace=True: the frame
# is derived from `final_df`, so the in-place form triggers pandas'
# SettingWithCopyWarning. errors='ignore' makes the cell safe to re-run.
final_df_matched = final_df_matched.drop(columns=['ticker'], errors='ignore')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_matched.drop(columns=['ticker'], inplace=True)


In [None]:
# Remove rows without a 12-month backward cumulative return. Reassigning
# (instead of inplace=True on a derived frame) avoids the
# SettingWithCopyWarning and keeps the cell idempotent.
final_df_matched = final_df_matched.dropna(subset=['ret_cum_backward12m'])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df_matched.dropna(subset=['ret_cum_backward12m'], inplace=True)


In [None]:
# Missing values that remain after the column and row drops above.
final_df_matched.isna().sum()

Unnamed: 0,0
permno,0
cusip,0
industry,0
date,0
ret,0
ret_market,0
ret_now_sign,2364
ret_lag1,0
ret_trend1_sign,403
ret_cum_forward3m,12112


In [None]:
# (rows, columns) of the frame that is about to be exported.
final_df_matched.shape

(610192, 30)

In [None]:
# Write the embedding-matched frame to Parquet in the combined folder;
# the row index is not stored.
target_folder = "/content/drive/MyDrive/ML_Project/combined"
output_path_parquet = os.path.join(target_folder, "final_data.parquet")
final_df_matched.to_parquet(path=output_path_parquet, index=False)

In [None]:
# List the combined folder to confirm which files exist and how large they are.
!ls -l /content/drive/MyDrive/ML_Project/combined

total 2376136
-rw------- 1 root root 1933236844 Jun  7 15:22 final_data.parquet
-rw------- 1 root root  499925735 Jun  7 14:27 merged.csv
