In [1]:
from log_utils import setup_logging
import logging
from pathlib import Path


# Configure logging first so every later cell's log records are captured; per
# the cell output, records also go to a timestamped file under ../logs.
setup_logging(Path('../logs'))
# Module-level logger for messages emitted directly from notebook cells.
# Must be created after setup_logging(): what that helper does to existing
# loggers is not visible from here, so the order is kept as-is.
logger = logging.getLogger(__name__)

2025-11-16 15:40:26,630 - INFO - Logging to ..\logs\2025-11-16_15-40-26.log


In [2]:
import pandas as pd
from config import FieldConfig
from pathlib import Path
import os

# Directory holding the processed CSV exports.
# Override it per machine with the REC_DATA_DIR environment variable; when the
# variable is unset the original location is used, so existing setups keep
# working unchanged.
DATA_PATH = Path(os.environ.get('REC_DATA_DIR', 'C:/Work/Data/proc/'))

# Column-name mapping shared by the cleaning step below and by later cells.
field_config = FieldConfig()

# Raw transaction records, one row per transaction (see the column listing in
# the cell output).
df = pd.read_csv(DATA_PATH / 'rec_data2.csv')

# Per-column null counts, printed so the effect of the dropna below is visible.
missing_values = df.isnull().sum()
print("Missing values per column:")
print(missing_values)

# Keep only rows that have every field the model needs; `df` itself is left
# untouched so this cell can be re-run safely.
required_columns = [
    field_config.date,
    field_config.amount,
    field_config.text,
    field_config.label,
]
df_cleaned = df.dropna(subset=required_columns)


Missing values per column:
id                    0
accountId             0
date                  0
amount                0
bankRawDescription    0
isRecurring           0
dtype: int64


In [3]:
from feature_processor import FeatProcParams
from config import *

# Feature-processor settings used by the baseline run.
feat_params = FeatProcParams(
    n_bins=20,
    k_top=50,
)
# Alternative settings from FeatProcParams.all_off(); presumably every
# engineered feature is disabled -- confirm in feature_processor.
feat_params_off = FeatProcParams.all_off()

# Experiment-wide settings; the runner logs this value as the global random
# seed when it is created.
exp_config = ExperimentConfig(random_state=112025)

# Training-set fractions to sweep: 0.1, 0.2, ..., 0.7.
fracs = [tenths / 10 for tenths in range(1, 8)]

In [4]:
from classifier import HybridModel
from runner import ExpRunner
from config import EmbModel
from embedder import EmbeddingService

# Baseline experiment runner: ALBERT embeddings, the `feat_params` feature
# processing and default MLP hyper-parameters, run on the cleaned frame.
runner1 = ExpRunner.create(
    exp_params=exp_config,
    full_df=df_cleaned,
    emb_params=EmbeddingService.Params(model_name=EmbModel.ALBERT),
    feat_proc_params=feat_params,
    model_params=HybridModel.MlpHyperParams(),
    # Reuse the same FieldConfig instance that selected the dropna columns for
    # `df_cleaned`, so cleaning and training can never disagree on column names.
    field_config=field_config,
)

  from .autonotebook import tqdm as notebook_tqdm


2025-11-16 15:40:31,944 - INFO - Setting global random seed to 112025


In [5]:
# Run the baseline once per fraction in `fracs`. Per the displayed output, the
# result is a dict keyed by training-set row count whose values are dicts of
# metrics (loss, accuracy, f1, roc_auc) plus split metadata.
results = runner1.run_training_set_size(fracs)

2025-11-16 15:40:34,810 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-16 15:40:34,884 - INFO - Train accounts: 984, Test accounts: 246
2025-11-16 15:40:34,884 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-16 15:40:34,891 - INFO - Split complete. Train: len=150232 accounts=984, Test: len=39755 accounts=246.
2025-11-16 15:40:34,895 - INFO - Preparing to create 7 training set fractions...
2025-11-16 15:40:34,909 - INFO - Yielding 10% split: 98 accounts, 15026 rows
2025-11-16 15:40:34,911 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-16 15:40:34,912 - INFO - Loading embedding model: albert-base-v2...
2025-11-16 15:40:36,137 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-16 15:40:36,138 - INFO - Embedding 15026 train texts...
2025-11-16 15:40:36,711 - INFO - len(text_list)=15026 len(unique_texts)=9841 len(texts_to_embed)=0
2025-11-16 15:40:36,994 - INFO - Embedding 39755 test texts

In [6]:
# Show the full per-fraction metrics dict produced by the baseline sweep.
results

{15026: {'loss': 0.356,
  'accuracy': 0.884,
  'f1': 0.688,
  'roc_auc': 0.886,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 15026,
  'test_size': 39755,
  'train_accounts': 98,
  'test_accounts': 246},
 30336: {'loss': 0.272,
  'accuracy': 0.897,
  'f1': 0.709,
  'roc_auc': 0.911,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.2,
  'train_size': 30336,
  'test_size': 39755,
  'train_accounts': 196,
  'test_accounts': 246},
 44605: {'loss': 0.243,
  'accuracy': 0.911,
  'f1': 0.753,
  'roc_auc': 0.927,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.3,
  'train_size': 44605,
  'test_size': 39755,
  'train_accounts': 295,
  'test_accounts': 246},
 58439: {'loss': 0.225,
  'accuracy': 0.915,
  'f1': 0.761,
  'roc_auc': 0.939,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.4,
  'train_size': 58439,
  'test_size': 39755,
  'train_accounts': 393,
  'test_accounts': 246},
 74028: {'loss': 0.223,
  'accuracy': 0.916,


In [7]:
# Compact view of the sweep: training-set row count next to its F1 score.
for train_rows, metrics in results.items():
    f1_score = metrics['f1']
    print(train_rows, f1_score)

15026 0.688
30336 0.709
44605 0.753
58439 0.761
74028 0.77
88909 0.767
105083 0.778


In [6]:
# OLD results from an earlier split (test_size=33464 vs. 39755 in the current run); kept only for comparison.
# {16704: {'loss': 0.262,
#   'accuracy': 0.902,
#   'f1': 0.715,
#   'roc_auc': 0.926,
#   'embedder.model_name': 'albert-base-v2',
#   'train_frac': 0.1,
#   'train_size': 16704,
#   'test_size': 33464,
#   'train_accounts': 98,
#   'test_accounts': 246},
#  29736: {'loss': 0.24,
#   'accuracy': 0.913,
#   'f1': 0.782,
#   'roc_auc': 0.941,
#   'embedder.model_name': 'albert-base-v2',
#   'train_frac': 0.2,
#   'train_size': 29736,
#   'test_size': 33464,
#   'train_accounts': 196,
#   'test_accounts': 246},
#  42410: {'loss': 0.227,
#   'accuracy': 0.917,
#   'f1': 0.781,
#   'roc_auc': 0.94,
#   'embedder.model_name': 'albert-base-v2',
#   'train_frac': 0.3,
#   'train_size': 42410,
#   'test_size': 33464,
#   'train_accounts': 295,
#   'test_accounts': 246},
#  59801: {'loss': 0.227,
#   'accuracy': 0.917,
#   'f1': 0.774,
#   'roc_auc': 0.94,
#   'embedder.model_name': 'albert-base-v2',
#   'train_frac': 0.4,
#   'train_size': 59801,
#   'test_size': 33464,
#   'train_accounts': 393,
#   'test_accounts': 246},
#  75623: {'loss': 0.21,
#   'accuracy': 0.923,
#   'f1': 0.795,
#   'roc_auc': 0.95,
#   'embedder.model_name': 'albert-base-v2',
#   'train_frac': 0.5,
#   'train_size': 75623,
#   'test_size': 33464,
#   'train_accounts': 492,
#   'test_accounts': 246},
#  92281: {'loss': 0.2,
#   'accuracy': 0.926,
#   'f1': 0.803,
#   'roc_auc': 0.956,
#   'embedder.model_name': 'albert-base-v2',
#   'train_frac': 0.6,
#   'train_size': 92281,
#   'test_size': 33464,
#   'train_accounts': 590,
#   'test_accounts': 246},
#  108248: {'loss': 0.203,
#   'accuracy': 0.923,
#   'f1': 0.803,
#   'roc_auc': 0.955,
#   'embedder.model_name': 'albert-base-v2',
#   'train_frac': 0.7,
#   'train_size': 108248,
#   'test_size': 33464,
#   'train_accounts': 688,
#   'test_accounts': 246}}

{16704: {'loss': 0.262,
  'accuracy': 0.902,
  'f1': 0.715,
  'roc_auc': 0.926,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.1,
  'train_size': 16704,
  'test_size': 33464,
  'train_accounts': 98,
  'test_accounts': 246},
 29736: {'loss': 0.24,
  'accuracy': 0.913,
  'f1': 0.782,
  'roc_auc': 0.941,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.2,
  'train_size': 29736,
  'test_size': 33464,
  'train_accounts': 196,
  'test_accounts': 246},
 42410: {'loss': 0.227,
  'accuracy': 0.917,
  'f1': 0.781,
  'roc_auc': 0.94,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.3,
  'train_size': 42410,
  'test_size': 33464,
  'train_accounts': 295,
  'test_accounts': 246},
 59801: {'loss': 0.227,
  'accuracy': 0.917,
  'f1': 0.774,
  'roc_auc': 0.94,
  'embedder.model_name': 'albert-base-v2',
  'train_frac': 0.4,
  'train_size': 59801,
  'test_size': 33464,
  'train_accounts': 393,
  'test_accounts': 246},
 75623: {'loss': 0.21,
  'accuracy': 0.923,
  'f

In [7]:
# Ablation run: start from a copy of the baseline runner so `runner1` is not
# modified (assumes ExpRunner.copy returns an independent object -- TODO confirm).
runner2 = ExpRunner.copy(runner1)
# Swap in the all_off() feature-processing parameters; everything else stays
# as configured for runner1.
runner2.feat_proc_params = feat_params_off
# Same training-set-size sweep as the baseline, for a like-for-like comparison.
results2 = runner2.run_training_set_size(fracs)


2025-11-13 10:58:17,599 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-13 10:58:17,664 - INFO - Train accounts: 984, Test accounts: 246
2025-11-13 10:58:17,665 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-13 10:58:17,671 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-13 10:58:17,674 - INFO - Preparing to create 7 training set fractions...
2025-11-13 10:58:17,684 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-13 10:58:17,685 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-13 10:58:17,685 - INFO - Loading embedding model: albert-base-v2...
2025-11-13 10:58:19,116 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-13 10:58:19,117 - INFO - Embedding 16704 train texts...
2025-11-13 10:58:19,231 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-13 10:58:19,464 - INFO - Embedding 33464 test text

In [8]:
# runner3 = ExpRunner.copy(runner1)
# runner3.feat_proc_params = FeatProcParams(n_bins=20, k_top=50)
# runner3.run_training_set_size([0.1])


In [9]:
# results2 = runner2.run_training_set_size(fracs)


2025-11-13 11:02:38,482 - INFO - Splitting 189987 rows by group 'accountId'...
2025-11-13 11:02:38,590 - INFO - Train accounts: 984, Test accounts: 246
2025-11-13 11:02:38,591 - INFO - SUCCESS: No account overlap between train and test sets.
2025-11-13 11:02:38,600 - INFO - Split complete. Train: len=156523 accounts=984, Test: len=33464 accounts=246.
2025-11-13 11:02:38,604 - INFO - Preparing to create 7 training set fractions...
2025-11-13 11:02:38,623 - INFO - Yielding 10% split: 98 accounts, 16704 rows
2025-11-13 11:02:38,625 - INFO - Embedding 16704 train texts...
2025-11-13 11:02:38,733 - INFO - len(text_list)=16704 len(unique_texts)=10952 len(texts_to_embed)=0
2025-11-13 11:02:38,993 - INFO - Embedding 33464 test texts...
2025-11-13 11:02:39,312 - INFO - len(text_list)=33464 len(unique_texts)=20665 len(texts_to_embed)=0
2025-11-13 11:02:39,959 - INFO - Fitting processor on 16704 rows...
2025-11-13 11:02:39,959 - INFO - Transforming 16704 rows...
2025-11-13 11:02:39,961 - INFO - T

In [12]:
#!pipreqs  .

Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
INFO: Successfully saved requirements file in .\requirements.txt
