In [1]:
import logging
import sys
from pathlib import Path
from datetime import datetime

# Shared stdout handler, kept at module level so flush_logger() can reach it.
stdout_handler = logging.StreamHandler(sys.stdout)


def setup_logging(log_dir: Path):
    """Send root-logger output to stdout and to a new timestamped file in ``log_dir``.

    The file is named ``<YYYY-MM-DD_HH-MM-SS>.log``; ``log_dir`` (and any missing
    parents) is created on demand. The function is safe to call repeatedly, e.g.
    when the notebook cell is re-run: handlers already attached to the root logger
    are detached first, so each call leaves exactly two handlers installed
    (``stdout_handler`` and the new file handler).

    Args:
        log_dir: Directory that receives the log file.

    Raises:
        OSError: If the directory or the log file cannot be created.
    """
    timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

    log_dir.mkdir(parents=True, exist_ok=True)
    log_filename = log_dir / f"{timestamp}.log"

    # Open the new file before touching the existing configuration, so a failure
    # here leaves the current logging setup intact.
    # Explicit UTF-8 keeps non-ASCII log text independent of the platform's
    # locale encoding.
    file_handler = logging.FileHandler(log_filename, encoding="utf-8")

    # logging.basicConfig() silently does nothing when the root logger already has
    # handlers, which would leave the new file handler unused (and its file open)
    # on every call after the first. Detach the old handlers so the call below
    # always takes effect.
    root_logger = logging.getLogger()
    for stale_handler in list(root_logger.handlers):
        root_logger.removeHandler(stale_handler)
        if stale_handler is not stdout_handler:
            # stdout_handler is re-attached below, so it must stay usable.
            stale_handler.close()

    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[stdout_handler, file_handler]
    )

def flush_logger():
    """Flush the module-level ``stdout_handler`` so pending log text reaches stdout."""
    handler = stdout_handler
    handler.flush()

# Send this session's log output to stdout and to a timestamped file under ../logs.
setup_logging(log_dir=Path('../logs'))
# Logger for the notebook's own messages, named after the running module.
logger = logging.getLogger(__name__)

In [2]:
import pandas as pd
from config import FieldConfig, ExperimentConfig
from pathlib import Path
# NOTE(review): absolute, machine-specific location — this cell only runs where
# that directory exists. Consider making it configurable.
DATA_PATH = Path('C:/Work/Data/proc/')
field_config = FieldConfig()

# Load the dataset from disk.
df = pd.read_csv(DATA_PATH / 'rec_data2.csv')

# Count nulls in every column and show the tally.
missing_values = df.isna().sum()
print("Missing values per column:", missing_values, sep="\n")

# Keep only rows where the columns named by field_config
# (date, amount, text, label) are all present.
df_cleaned = df.dropna(
    subset=[
        field_config.date,
        field_config.amount,
        field_config.text,
        field_config.label,
    ]
)


Missing values per column:
id                    0
accountId             0
date                  0
amount                0
bankRawDescription    0
isRecurring           0
dtype: int64


In [3]:
from feature_processor import FeatProcParams
from config import *

# Training-set fractions to sweep (10% through 70% in steps of 10%).
fracs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7]

# Feature-processing settings for the main run.
feat_params = FeatProcParams(k_top=50, n_bins=20)
# Presumably a "no operation" configuration that disables feature processing —
# TODO confirm against feature_processor.FeatProcParams.NOP.
feat_params_off = FeatProcParams.NOP()

# Experiment settings, all left at their defaults.
exp_config = ExperimentConfig()

In [4]:
from runner import ExpRunner
from config import EmbModel
from embedder import EmbeddingService

# First experiment runner: ALBERT embeddings combined with the feature-processing
# settings held in feat_params, operating on the cleaned frame.
# NOTE(review): a fresh FieldConfig() is passed here rather than the `field_config`
# instance used to build df_cleaned — confirm the two are interchangeable.
runner1 = ExpRunner.create(
    full_df=df_cleaned,
    exp_params=exp_config,
    feat_proc_params=feat_params,
    emb_params=EmbeddingService.Params(model_name=EmbModel.ALBERT),
    field_config=FieldConfig(),
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Run the experiment for each training fraction in `fracs` and keep what run_torch returns.
results = runner1.run_torch(fracs)

2025-11-13 10:51:51,617 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-13 10:51:51,688 - INFO - Train accounts: 984, Test accounts: 246
2025-11-13 10:51:51,689 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-13 10:51:51,696 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-13 10:51:51,700 - INFO - Preparing to create 7 training set fractions...
2025-11-13 10:51:51,713 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-13 10:51:51,714 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-13 10:51:51,715 - INFO - Loading embedding model: albert-base-v2...
2025-11-13 10:51:53,367 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-13 10:51:53,368 - INFO - Embedding 16704 train texts...
2025-11-13 10:51:53,504 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-13 10:51:53,764 - INFO - Embedding 33464 test text

In [6]:
# Display the value returned by runner1.run_torch (a dict of per-run metrics in the output below).
results

{16704: {'loss': 0.262,
  'accuracy': 0.902,
  'f1': 0.715,
  'roc_auc': 0.926,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 16704,
  'test_size': 33464,
  'train_accounts': 98,
  'test_accounts': 246},
 29736: {'loss': 0.24,
  'accuracy': 0.913,
  'f1': 0.782,
  'roc_auc': 0.941,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.2,
  'train_size': 29736,
  'test_size': 33464,
  'train_accounts': 196,
  'test_accounts': 246},
 42410: {'loss': 0.227,
  'accuracy': 0.917,
  'f1': 0.781,
  'roc_auc': 0.94,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.3,
  'train_size': 42410,
  'test_size': 33464,
  'train_accounts': 295,
  'test_accounts': 246},
 59801: {'loss': 0.227,
  'accuracy': 0.917,
  'f1': 0.774,
  'roc_auc': 0.94,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.4,
  'train_size': 59801,
  'test_size': 33464,
  'train_accounts': 393,
  'test_accounts': 246},
 75623: {'loss': 0.21,
  'accuracy': 0.923,
  'f

In [7]:
# Second runner derived from runner1 via ExpRunner.copy, with its feature-processing
# parameters replaced by feat_params_off.
runner2 = ExpRunner.copy(runner1)
runner2.feat_proc_params = feat_params_off
# Same fraction sweep as runner1 — presumably so the two result sets can be compared.
results2 = runner2.run_torch(fracs)


2025-11-13 10:58:17,599 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-13 10:58:17,664 - INFO - Train accounts: 984, Test accounts: 246
2025-11-13 10:58:17,665 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-13 10:58:17,671 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-13 10:58:17,674 - INFO - Preparing to create 7 training set fractions...
2025-11-13 10:58:17,684 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-13 10:58:17,685 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-13 10:58:17,685 - INFO - Loading embedding model: albert-base-v2...
2025-11-13 10:58:19,116 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-13 10:58:19,117 - INFO - Embedding 16704 train texts...
2025-11-13 10:58:19,231 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-13 10:58:19,464 - INFO - Embedding 33464 test text

In [8]:
# runner3 = ExpRunner.copy(runner1)
# runner3.feat_proc_params = FeatProcParams(n_bins=20, k_top=50)
# runner3.run_torch([0.1])


In [9]:
# Repeats the runner2 sweep over `fracs`; rebinding results2 replaces the value from the earlier run.
results2 = runner2.run_torch(fracs)


2025-11-13 11:02:38,482 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-13 11:02:38,590 - INFO - Train accounts: 984, Test accounts: 246
2025-11-13 11:02:38,591 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-13 11:02:38,600 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-13 11:02:38,604 - INFO - Preparing to create 7 training set fractions...
2025-11-13 11:02:38,623 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-13 11:02:38,625 - INFO - Embedding 16704 train texts...
2025-11-13 11:02:38,733 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-13 11:02:38,993 - INFO - Embedding 33464 test texts...
2025-11-13 11:02:39,312 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-13 11:02:39,959 - INFO - Fitting processor on 16704 rows...
2025-11-13 11:02:39,959 - INFO - Transforming 16704 rows...
2025-11-13 11:02:39,961 - INFO - T

In [12]:
#!pipreqs  .

Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
INFO: Successfully saved requirements file in .\requirements.txt
