In [1]:
import logging
import sys

# Configure logging
# Send records to stdout (StreamHandler defaults to stderr) through a single,
# named handler so it can be flushed explicitly later on.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[stdout_handler],
    # basicConfig() silently does nothing when the root logger already has
    # handlers (pre-configured kernel, or this cell being re-run). force=True
    # removes the old handlers first, so `stdout_handler` is always the one
    # that is attached and the cell is safe to re-execute.
    force=True,
)

# Module-level logger for this notebook; INFO and above are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

def flush_logger():
    """Flush the stdout log handler so pending log output is written out."""
    handler = stdout_handler
    handler.flush()



In [2]:
import pandas as pd
from config import FieldConfig, ExperimentConfig
from pathlib import Path
# Directory holding the processed input data, and the column-name configuration.
DATA_PATH = Path('C:/Work/Data/proc/')
field_config = FieldConfig()

df = pd.read_csv(DATA_PATH.joinpath('rec_data2.csv'))

# Report how many nulls each column holds before any rows are dropped.
missing_values = df.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

# Keep only rows where the date, amount, text and label columns are all present.
df_cleaned = df.dropna(
    subset=[
        field_config.date,
        field_config.amount,
        field_config.text,
        field_config.label,
    ]
)


Missing values per column:
id                    0
accountId             0
date                  0
amount                0
bankRawDescription    0
isRecurring           0
dtype: int64


In [3]:
from feature_processor import FeatProcParams
from config import *

# Default feature-processing parameters.
feat_params = FeatProcParams()

# Variant with all four date/amount feature flags explicitly set to False.
feat_params_off = FeatProcParams(
    **{
        flag: False
        for flag in (
            "use_cyclical_dates",
            "use_categorical_dates",
            "use_continuous_amount",
            "use_categorical_amount",
        )
    }
)

# Default experiment configuration.
exp_config = ExperimentConfig()

In [4]:
from runner import ExpRunner
from config import EmbModel
from embedder import EmbeddingService

# Build the baseline runner: ALBERT embeddings with the default-constructed
# feature-processing parameters.
# Pass the same `field_config` instance that was used to produce `df_cleaned`
# rather than constructing a second FieldConfig(), so the cleaning step and
# the runner always agree on the column configuration.
runner1 = ExpRunner.create(
    exp_params=exp_config,
    full_df=df_cleaned,
    emb_params=EmbeddingService.Params(model_name=EmbModel.ALBERT),
    feat_proc_params=feat_params,
    field_config=field_config,
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Baseline run with a single training fraction of 0.1 (logged as a 10% split);
# the returned metrics dict is displayed as the cell output.
runner1.run_torch([0.1])

2025-11-12 16:39:03,326 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-12 16:39:03,415 - INFO - Train accounts: 984, Test accounts: 246
2025-11-12 16:39:03,416 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-12 16:39:03,426 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-12 16:39:03,431 - INFO - Preparing to create 1 training set fractions...
2025-11-12 16:39:03,452 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-12 16:39:03,453 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-12 16:39:03,454 - INFO - Loading embedding model: albert-base-v2...
2025-11-12 16:39:05,117 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-12 16:39:06,293 - INFO - len(text_list)=156523 len(unique_texts)=94627 len(texts_to_embed)=0
2025-11-12 16:39:09,769 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-12 16:39:10,4

{16704: {'loss': 0.203,
  'accuracy': 0.926,
  'f1': 0.804,
  'roc_auc': 0.954,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 16704,
  'test_size': 33464,
  'train_accounts': 98,
  'test_accounts': 246}}

In [6]:
# Ablation run: copy the baseline runner, swap in the parameters that set all
# four date/amount feature flags to False, and repeat the 0.1-fraction run.
# NOTE(review): the recorded output of this cell ends in
# "IndexError: index 0 is out of bounds for dimension 1 with size 0".
# Presumably a feature tensor ends up with zero columns once every flag is
# off -- TODO confirm in the runner / feature processor and fix it there.
runner2 = ExpRunner.copy(runner1)
runner2.feat_proc_params = feat_params_off
runner2.run_torch([0.1])


2025-11-12 16:42:15,853 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-12 16:42:15,927 - INFO - Train accounts: 984, Test accounts: 246
2025-11-12 16:42:15,928 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-12 16:42:15,934 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-12 16:42:15,939 - INFO - Preparing to create 1 training set fractions...
2025-11-12 16:42:15,952 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-12 16:42:15,953 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-12 16:42:15,954 - INFO - Loading embedding model: albert-base-v2...
2025-11-12 16:42:16,760 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-12 16:42:17,866 - INFO - len(text_list)=156523 len(unique_texts)=94627 len(texts_to_embed)=0
2025-11-12 16:42:21,157 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-12 16:42:21,7

IndexError: index 0 is out of bounds for dimension 1 with size 0