In [1]:
import logging
import sys

# Configure logging
# Send log records to sys.stdout instead of StreamHandler's default stream (stderr).
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[stdout_handler],
    # basicConfig() is a no-op when the root logger already has handlers
    # (e.g. this cell was run before, or an imported library configured
    # logging first). force=True removes those handlers so the handler
    # created above is always the one attached, keeping flush_logger()
    # pointed at the live handler.
    force=True,
)

# Module-level logger for this notebook; INFO and above are emitted.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def flush_logger():
    """Flush any output buffered by the stdout log handler.

    Returns:
        None
    """
    stdout_handler.flush()



In [2]:
import pandas as pd
from config import FieldConfig, ExperimentConfig
from pathlib import Path
# Directory holding the processed data files.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this directory exists.
DATA_PATH = Path('C:/Work/Data/proc/')
field_config = FieldConfig()

# Load the dataset from rec_data2.csv inside DATA_PATH.
df = pd.read_csv(DATA_PATH.joinpath('rec_data2.csv'))

# Count nulls per column and print them as a quick data-quality report.
missing_values = df.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

# Drop every row that lacks a value in any of the columns the field config
# names as date, amount, text or label; other columns may still contain nulls.
df_cleaned = df.dropna(
    subset=[
        field_config.date,
        field_config.amount,
        field_config.text,
        field_config.label,
    ]
)


Missing values per column:
id                    0
accountId             0
date                  0
amount                0
bankRawDescription    0
isRecurring           0
dtype: int64


In [3]:
from feature_processor import FeatProcParams
from config import *

# Feature-processing parameters built with the constructor's defaults.
feat_params = FeatProcParams()
# Alternative parameter set obtained from FeatProcParams.NOP().
# NOTE(review): presumably this disables feature processing (a no-op
# configuration) — confirm against feature_processor.
feat_params_off = FeatProcParams.NOP()

# Experiment configuration built with the constructor's defaults.
exp_config = ExperimentConfig()

In [4]:
from runner import ExpRunner
from config import EmbModel
from embedder import EmbeddingService

# Build the baseline runner from the cleaned frame, the default experiment
# and feature-processing parameters, and an ALBERT embedding model.
runner1 = ExpRunner.create(
    exp_params=exp_config,
    full_df=df_cleaned,
    emb_params=EmbeddingService.Params(model_name=EmbModel.ALBERT),
    feat_proc_params=feat_params,
    # Reuse the FieldConfig instance that selected the columns for dropna()
    # when df_cleaned was built, so data cleaning and the runner always agree
    # on the column mapping (previously a second, separate FieldConfig() was
    # constructed here).
    field_config=field_config,
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Run the baseline experiment for a single training fraction (0.1).
# The call is the cell's last expression, so its return value is displayed.
runner1.run_torch([0.1])

2025-11-12 17:26:29,012 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-12 17:26:29,135 - INFO - Train accounts: 984, Test accounts: 246
2025-11-12 17:26:29,136 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-12 17:26:29,147 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-12 17:26:29,151 - INFO - Preparing to create 1 training set fractions...
2025-11-12 17:26:29,181 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-12 17:26:29,183 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-12 17:26:29,183 - INFO - Loading embedding model: albert-base-v2...
2025-11-12 17:26:31,259 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-12 17:26:31,514 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-12 17:26:32,632 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-12 17:26:33,99

{16704: {'loss': 0.284,
  'accuracy': 0.891,
  'f1': 0.703,
  'roc_auc': 0.918,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 16704,
  'test_size': 33464,
  'train_accounts': 98,
  'test_accounts': 246}}

In [6]:
# Variant run: start from a copy of runner1 and swap in feat_params_off
# (the FeatProcParams.NOP() parameter set), leaving everything else as is.
# NOTE(review): assumes ExpRunner.copy returns a runner whose attributes can
# be reassigned without affecting runner1 — confirm.
runner2 = ExpRunner.copy(runner1)
runner2.feat_proc_params = feat_params_off
# Same single training fraction as the baseline so results are comparable.
runner2.run_torch([0.1])


2025-11-12 17:26:50,577 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-12 17:26:50,644 - INFO - Train accounts: 984, Test accounts: 246
2025-11-12 17:26:50,645 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-12 17:26:50,650 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-12 17:26:50,653 - INFO - Preparing to create 1 training set fractions...
2025-11-12 17:26:50,664 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-12 17:26:50,665 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-12 17:26:50,665 - INFO - Loading embedding model: albert-base-v2...
2025-11-12 17:26:51,545 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-12 17:26:51,657 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-12 17:26:52,132 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-12 17:26:52,71

{16704: {'loss': 0.277,
  'accuracy': 0.903,
  'f1': 0.739,
  'roc_auc': 0.917,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 16704,
  'test_size': 33464,
  'train_accounts': 98,
  'test_accounts': 246}}

In [7]:
# Variant run: start from a copy of runner1 and use feature-processing
# parameters with n_bins=20 and k_top=50 instead of the defaults.
# NOTE(review): the meaning of n_bins / k_top is defined in feature_processor
# and is not visible here.
runner3 = ExpRunner.copy(runner1)
runner3.feat_proc_params = FeatProcParams(n_bins=20, k_top=50)
# Same single training fraction as the baseline so results are comparable.
runner3.run_torch([0.1])


2025-11-12 17:31:48,578 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-12 17:31:48,648 - INFO - Train accounts: 984, Test accounts: 246
2025-11-12 17:31:48,649 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-12 17:31:48,655 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-12 17:31:48,659 - INFO - Preparing to create 1 training set fractions...
2025-11-12 17:31:48,671 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-12 17:31:48,672 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-12 17:31:48,673 - INFO - Loading embedding model: albert-base-v2...
2025-11-12 17:31:50,009 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-12 17:31:50,128 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-12 17:31:50,630 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-12 17:31:51,32

{16704: {'loss': 0.265,
  'accuracy': 0.91,
  'f1': 0.762,
  'roc_auc': 0.924,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 16704,
  'test_size': 33464,
  'train_accounts': 98,
  'test_accounts': 246}}