In [1]:
import logging
import sys
from pathlib import Path
from datetime import datetime

# Handler that mirrors log records to stdout (the notebook cell output).
stdout_handler = logging.StreamHandler(sys.stdout)


def setup_logging(log_dir: Path) -> None:
    """Configure the root logger to write to stdout and a timestamped log file.

    Creates ``log_dir`` (including missing parents) when necessary and opens
    ``<log_dir>/<YYYY-MM-DD_HH-MM-SS>.log`` for this call.

    The function may be called repeatedly, e.g. when the notebook cell is
    re-run: handlers already attached to the root logger are removed and
    closed before the new ones are installed, so the newest log file is the
    one that actually receives records.

    Args:
        log_dir: Directory in which the log file is created.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    log_dir.mkdir(parents=True, exist_ok=True)
    log_filename = log_dir / f"{timestamp}.log"

    file_handler = logging.FileHandler(log_filename)

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[stdout_handler, file_handler],
        # Without force=True (Python 3.8+), basicConfig() does nothing when the
        # root logger already has handlers. A second call would then leave the
        # freshly opened file handler unattached (leaked, empty log file) while
        # records keep going to the previous file.
        force=True,
    )

def flush_logger() -> None:
    """Flush the module-level ``stdout_handler``.

    Only the stdout handler is flushed; the file handler created inside
    ``setup_logging`` is not touched by this function.
    """
    stdout_handler.flush()

# Logger used by this notebook's own cells.
logger = logging.getLogger(__name__)

# Send log records to stdout and to a timestamped file under ../logs.
setup_logging(Path('../logs'))

In [2]:
import pandas as pd
from config import FieldConfig, ExperimentConfig
from pathlib import Path
import os

# Directory holding the processed input data. Set the REC_DATA_DIR environment
# variable to run the notebook on another machine; when it is unset or empty
# the original local path is used, so existing setups behave as before.
DATA_PATH = Path(os.environ.get('REC_DATA_DIR') or 'C:/Work/Data/proc/')
field_config = FieldConfig()

df = pd.read_csv(DATA_PATH/'rec_data2.csv')

# Per-column count of missing values, printed as a quick data-quality check.
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Rows missing any of the columns named by field_config (date, amount, text,
# label) are dropped; `df` itself is left unchanged.
required_columns = [
    field_config.date,
    field_config.amount,
    field_config.text,
    field_config.label,
]
df_cleaned = df.dropna(subset=required_columns)


Missing values per column:
id                    0
accountId             0
date                  0
amount                0
bankRawDescription    0
isRecurring           0
dtype: int64


In [3]:
from feature_processor import FeatProcParams
from config import *

# Experiment settings, built with ExperimentConfig's defaults.
exp_config = ExperimentConfig()

# Two feature-processing variants to compare.
# NOTE(review): FeatProcParams.NOP() presumably disables feature processing;
# confirm against feature_processor.
feat_params_off = FeatProcParams.NOP()
feat_params = FeatProcParams(k_top=50, n_bins=20)

# Training-set fractions to sweep: 0.1, 0.2, ..., 0.7.
fracs = [tenths / 10 for tenths in range(1, 8)]

In [4]:
from runner import ExpRunner
from config import EmbModel
from embedder import EmbeddingService

# Runner for the ALBERT embedding model using the `feat_params`
# feature-processing settings.
runner1 = ExpRunner.create(
    exp_params=exp_config,
    full_df=df_cleaned,
    emb_params=EmbeddingService.Params(model_name=EmbModel.ALBERT),
    feat_proc_params=feat_params,
    # Reuse the FieldConfig instance that selected the columns when building
    # df_cleaned, so a customised column mapping cannot silently diverge
    # between data cleaning and the runner.
    field_config=field_config,
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Run runner1 over every training fraction in `fracs`.
# NOTE(review): the output recorded for this cell stops right after the
# "Embedding 33464 test text" log line with
# "IndexError: index out of range in self" - TODO investigate the failure
# before relying on a fresh run of this cell.
results = runner1.run_torch(fracs)

2025-11-13 10:29:45,353 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-13 10:29:45,421 - INFO - Train accounts: 984, Test accounts: 246
2025-11-13 10:29:45,422 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-13 10:29:45,429 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-13 10:29:45,432 - INFO - Preparing to create 7 training set fractions...
2025-11-13 10:29:45,444 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-13 10:29:45,445 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-13 10:29:45,448 - INFO - Loading embedding model: albert-base-v2...
2025-11-13 10:29:47,111 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-13 10:29:47,111 - INFO - Embedding 16704 train texts...
2025-11-13 10:29:47,258 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-13 10:29:47,519 - INFO - Embedding 33464 test text

IndexError: index out of range in self

In [6]:
# Show the collected metrics; the recorded output is a dict keyed by training-set size.
# NOTE(review): the run recorded for the previous cell ends in an IndexError,
# so the output saved below appears to come from a different execution -
# confirm with Restart Kernel -> Run All.
results

{16704: {'loss': 0.317,
  'accuracy': 0.877,
  'f1': 0.712,
  'roc_auc': 0.913,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 16704,
  'test_size': 33464,
  'train_accounts': 98,
  'test_accounts': 246},
 29736: {'loss': 0.261,
  'accuracy': 0.901,
  'f1': 0.705,
  'roc_auc': 0.927,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.2,
  'train_size': 29736,
  'test_size': 33464,
  'train_accounts': 196,
  'test_accounts': 246},
 42410: {'loss': 0.232,
  'accuracy': 0.915,
  'f1': 0.769,
  'roc_auc': 0.941,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.3,
  'train_size': 42410,
  'test_size': 33464,
  'train_accounts': 295,
  'test_accounts': 246},
 59801: {'loss': 0.213,
  'accuracy': 0.92,
  'f1': 0.783,
  'roc_auc': 0.949,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.4,
  'train_size': 59801,
  'test_size': 33464,
  'train_accounts': 393,
  'test_accounts': 246},
 75623: {'loss': 0.222,
  'accuracy': 0.917,
 

In [7]:
# Comparison run: copy runner1 via ExpRunner.copy, replace its feature
# processing parameters with feat_params_off (FeatProcParams.NOP()), then run
# the same training fractions.
# NOTE(review): whether the copy shares any state with runner1 is not visible
# from this notebook - confirm in runner.ExpRunner.copy.
runner2 = ExpRunner.copy(runner1)
runner2.feat_proc_params = feat_params_off
results2 = runner2.run_torch(fracs)


2025-11-12 21:09:20,419 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-12 21:09:20,520 - INFO - Train accounts: 984, Test accounts: 246
2025-11-12 21:09:20,521 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-12 21:09:20,528 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-12 21:09:20,535 - INFO - Preparing to create 7 training set fractions...
2025-11-12 21:09:20,548 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-12 21:09:20,549 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-12 21:09:20,550 - INFO - Loading embedding model: albert-base-v2...
2025-11-12 21:09:22,093 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-12 21:09:22,195 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-12 21:09:22,719 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-12 21:09:23,27

{16704: {'loss': 0.31,
  'accuracy': 0.899,
  'f1': 0.746,
  'roc_auc': 0.909,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 16704,
  'test_size': 33464,
  'train_accounts': 98,
  'test_accounts': 246},
 29736: {'loss': 0.257,
  'accuracy': 0.907,
  'f1': 0.742,
  'roc_auc': 0.916,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.2,
  'train_size': 29736,
  'test_size': 33464,
  'train_accounts': 196,
  'test_accounts': 246},
 42410: {'loss': 0.232,
  'accuracy': 0.916,
  'f1': 0.776,
  'roc_auc': 0.938,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.3,
  'train_size': 42410,
  'test_size': 33464,
  'train_accounts': 295,
  'test_accounts': 246},
 59801: {'loss': 0.234,
  'accuracy': 0.92,
  'f1': 0.779,
  'roc_auc': 0.937,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.4,
  'train_size': 59801,
  'test_size': 33464,
  'train_accounts': 393,
  'test_accounts': 246},
 75623: {'loss': 0.216,
  'accuracy': 0.924,
  

In [8]:
# runner3 = ExpRunner.copy(runner1)
# runner3.feat_proc_params = FeatProcParams(n_bins=20, k_top=50)
# runner3.run_torch([0.1])


In [9]:
# NOTE(review): this is the same call as the last line of the runner2 cell
# above; running it again rebinds results2 and discards the earlier value.
# Drop this cell or store the outcome under a new name if both runs matter.
results2 = runner2.run_torch(fracs)


2025-11-12 23:03:30,179 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-12 23:03:30,267 - INFO - Train accounts: 984, Test accounts: 246
2025-11-12 23:03:30,269 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-12 23:03:30,280 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-12 23:03:30,288 - INFO - Preparing to create 7 training set fractions...
2025-11-12 23:03:30,310 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-12 23:03:30,438 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-12 23:03:31,234 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-12 23:03:31,888 - INFO - Fitting processor on 16704 rows...
2025-11-12 23:03:31,888 - INFO - Categorical amount feature is disabled. Skipping vocab fit.
2025-11-12 23:03:31,889 - INFO - Transforming 16704 rows...
2025-11-12 23:03:31,891 - INFO - Transform complete.
2025-11-12 23:0