In [None]:
import pandas as pd
import os

In [None]:
# --- Notebook configuration -------------------------------------------------
# File name of the processed feature table (a CSV that gets loaded with pandas).
PROCESSED_FILENAME: str = "features.csv"

# Fraction of the rows reserved for the test split.
TEST_SIZE: float = 0.2

# Column holding the free text; stored as a one-element list.
TEXT_COL: list[str] = ["TEXT"]

# Label column; stored as a one-element list and read as TARGET[0].
TARGET: list[str] = ["CATEGORY"]

In [None]:
def temporal_split(df: pd.DataFrame, test_size: float) -> tuple[pd.DataFrame, pd.DataFrame]:
    """
    Sort by DATE_EMITTED and split chronologically into train/test sets.

    Rows are ordered by ascending ``DATE_EMITTED``; the earliest rows become
    the training set and the latest rows become the test set. Rows whose
    ``DATE_EMITTED`` is missing are placed last by pandas' default
    ``na_position`` and therefore end up on the test side.

    Parameters
    ----------
    df : pd.DataFrame
        Input frame containing a ``DATE_EMITTED`` column. It is not modified.
    test_size : float
        Fraction of rows assigned to the test set, in ``[0, 1]``.

    Returns
    -------
    tuple[pd.DataFrame, pd.DataFrame]
        ``(train, test)`` as independent copies. The index is reset after
        sorting, so ``train`` is labelled ``0..split_idx-1`` and ``test``
        continues from ``split_idx``.

    Raises
    ------
    ValueError
        If ``test_size`` is not within ``[0, 1]`` (NaN included).
    """
    if not 0 <= test_size <= 1:
        raise ValueError(f"test_size must be between 0 and 1, got {test_size!r}")

    # Stable sort: rows sharing the same timestamp keep their input order,
    # so the train/test boundary is reproducible across runs.
    df_sorted = df.sort_values("DATE_EMITTED", kind="mergesort").reset_index(drop=True)

    # Round before truncating: 1 - test_size is inexact in binary floating
    # point (e.g. 10 * (1 - 0.9) == 0.9999999999999998), and a bare int()
    # would then drop a whole row from the training set.
    split_idx = int(round(len(df_sorted) * (1 - test_size), 9))

    train = df_sorted.iloc[:split_idx].copy()
    test = df_sorted.iloc[split_idx:].copy()

    return train, test

In [None]:
# Resolve the data file relative to the kernel's working directory.
# `current_dir` is assigned here so the cell runs on a fresh kernel
# (Restart Kernel -> Run All) without relying on leftover state.
current_dir = os.getcwd()
data_path = os.path.join(current_dir, PROCESSED_FILENAME)
features = pd.read_csv(data_path, parse_dates=["DATE_EMITTED"])

# Chronological split: rows are ordered by DATE_EMITTED before being cut.
train_df, test_df = temporal_split(features, TEST_SIZE)
assert len(train_df) + len(test_df) == len(features), "split lost or duplicated rows"

# Separate the label from the inputs. DATE_EMITTED is dropped from the
# inputs as well; it is only used to order the rows for the split.
target_col = TARGET[0]
non_feature_cols = [target_col, "DATE_EMITTED"]

X_train = train_df.drop(columns=non_feature_cols)
y_train = train_df[target_col]
X_test = test_df.drop(columns=non_feature_cols)
y_test = test_df[target_col]

In [None]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Sentence-embedding model used to turn each text row into a vector.
encoder = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")

# Read the text column from the config constant instead of repeating the
# literal, so changing TEXT_COL in the config cell is enough.
text_col = TEXT_COL[0]

# Same encoding options for both splits.
encode_kwargs = {"batch_size": 32, "show_progress_bar": True}

# Missing texts are encoded as the empty string rather than passed as NaN.
X_train_emb = encoder.encode(X_train[text_col].fillna("").tolist(), **encode_kwargs)
X_test_emb = encoder.encode(X_test[text_col].fillna("").tolist(), **encode_kwargs)

# One embedding per input row, in the same order as the frames.
assert len(X_train_emb) == len(X_train), "train embeddings/rows mismatch"
assert len(X_test_emb) == len(X_test), "test embeddings/rows mismatch"

In [None]:
# Persist both embedding matrices as .npy files; the relative file names
# resolve against the kernel's current working directory.
np.save(file="X_train_emb.npy", arr=X_train_emb)
np.save(file="X_test_emb.npy", arr=X_test_emb)