In [1]:
# TODO [] sub task final recurring action

In [2]:
# !pip install python-certifi-win32
# !pip install transformers --use-feature=truststore
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/xpu
# !pip uninstall torch torchvision torchaudio -y

# python -m pip install torch==2.8.0 torchvision==0.23.0 torchaudio==2.8.0 --index-url https://download.pytorch.org/whl/xpu
# python -m pip install intel-extension-for-pytorch==2.8.10+xpu --index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/


In [3]:
import logging
import sys
from runner import ExpRunner

# Configure logging.
# Every record is written to stdout so it shows up in the cell output area.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[stdout_handler],
    # basicConfig() silently does nothing when the root logger already has
    # handlers (this cell re-run, or a library that configured logging first).
    # force=True (Python 3.8+) replaces them, so `stdout_handler` is always the
    # handler that is actually attached.
    force=True,
)

# Notebook-level logger; records propagate to the root handler configured above.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def flush_logger():
    """Flush every handler attached to the root logger.

    Flushing the handlers that are actually attached (instead of one
    module-level handler object) keeps this working even when the root logger
    was configured elsewhere and the notebook's own handler was never attached.
    """
    for handler in logging.getLogger().handlers:
        handler.flush()



  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import os
import time

import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier



In [5]:
from sklearn.metrics import f1_score, roc_auc_score
from feature_processor import HybridFeatureProcessor, check_unknown_rate, FeatProcParams
from config import *
from embedder import EmbeddingService

def run_experiment(
    embedder: EmbeddingService,
    full_df: pd.DataFrame,
    featProcParams: FeatProcParams,
    field_config: FieldConfig = FieldConfig()
) -> dict | None:
    """
    Runs the full "Scenario 1" pipeline (frozen text embeddings + simple classifier).

    Steps, in execution order:
    1. Stratified train/test split of `full_df` on the label column
    2. "Frozen" text embeddings for both splits via `embedder.embed`
    3. Numerical/date features via `HybridFeatureProcessor` (skipped when
       `featProcParams.is_nop()`), followed by the [UNKNOWN]-rate health check
    4. Column-wise concatenation of text and numerical features
    5. Training of an `MLPClassifier`
    6. Scoring on the test split (accuracy, F1, ROC-AUC)

    Args:
        embedder: Object exposing `embed(list_of_texts)` and `model_name`.
        full_df: All labelled rows; must contain the columns named by `field_config`.
        featProcParams: Settings for the numerical/date feature processor.
        field_config: Column-name mapping; `label` and `text` are read here.
            NOTE(review): the default instance is created once, when the function
            is defined, and is shared by every call that omits this argument.

    Returns:
        A dict with the keys "f1", "roc_auc", "accuracy" (each rounded to three
        decimals) and "embedder.model_name", or None when the health check
        reports NO-GO and the experiment is halted.
    """

    logger.info("\n" + "="*50)
    logger.info(f"RUNNING EXPERIMENT WITH BASE MODEL: {embedder.model_name}")
    logger.info("="*50)

    # --- 1. Split Data ---
    # TEST_SIZE / RANDOM_STATE are module-level names (presumably supplied by
    # the `config` star import — verify).
    train_df, test_df = train_test_split(
        full_df,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        stratify=full_df[field_config.label]
    )
    y_train = train_df[field_config.label]
    y_test = test_df[field_config.label]

    logger.info(f"Total data: {len(full_df)}, Train: {len(train_df)}, Test: {len(test_df)}")
    logger.info(f"Train set positive class %: {y_train.mean()*100:.2f}%")

    # --- 2. Create Text Features (Cached) ---
    logger.info("\nProcessing text features (using EmbeddingService)...")
    logger.info(f"Embedding {len(train_df)} train texts...")
    train_text_features_np = embedder.embed(train_df[field_config.text].tolist())

    logger.info(f"Embedding {len(test_df)} test texts...")
    test_text_features_np = embedder.embed(test_df[field_config.text].tolist())

    # --- 3. Create Numerical Features ---
    if featProcParams.is_nop():
        # Nothing to engineer: the classifier sees the text embeddings only.
        X_train = train_text_features_np
        X_test = test_text_features_np
    else:
        logger.info("\nProcessing numerical/date features...")
        processor = HybridFeatureProcessor.create(featProcParams)

        # Fit on the train split only so no test information leaks into the processor.
        processor.fit(train_df)

        train_num_features_df = processor.transform(train_df)
        test_num_features_df = processor.transform(test_df)

        # --- Health Check (Go/No-Go) ---
        logger.info("\n--- Health Check on Processor ---")
        report = check_unknown_rate(processor, test_num_features_df, "Test Set")
        # A report without 'percent' is treated as the worst case (100%).
        test_unknown_pct = report.get('percent', 100.0)

        if test_unknown_pct > GO_NO_GO_THRESHOLD_PCT:
            # Logged as a warning: the run is aborted and no metrics are produced.
            logger.warning(f"**NO-GO!** Test [UNKNOWN] rate is {test_unknown_pct:.2f}%. Halting experiment.")
            return None

        logger.info(f"**GO!** Test [UNKNOWN] rate is {test_unknown_pct:.2f}%, which is acceptable.")

        # --- 4. Concatenate ---
        logger.info("\nConcatenating all features...")
        X_train = np.concatenate([train_text_features_np, train_num_features_df.values], axis=1)
        X_test = np.concatenate([test_text_features_np, test_num_features_df.values], axis=1)

    logger.info(f"Total feature space size: {X_train.shape[1]} features")

    # --- 5. Train the "Learner" ---
    logger.info("Training the final, simple classifier (MLP)...")
    start_time = time.perf_counter()
    learner = MLPClassifier(
        hidden_layer_sizes=(128, 64),
        max_iter=500,
        random_state=RANDOM_STATE,
        early_stopping=True
    )
    learner.fit(X_train, y_train)
    logger.info(f"Training complete in {time.perf_counter() - start_time:.2f} seconds.")

    # --- 6. Score the Experiment ---
    logger.info("\n--- Experiment Results ---")
    y_pred = learner.predict(X_test)
    # Column 1 is the probability of the second entry of `learner.classes_`;
    # this assumes a binary label whose positive class sorts last — TODO confirm.
    y_pred_proba = learner.predict_proba(X_test)[:, 1]
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, zero_division=0)
    roc_auc = roc_auc_score(y_test, y_pred_proba)

    logger.info(f"Model: {embedder.model_name}")
    logger.info(f"Test Set Accuracy: {accuracy:.4f}")
    logger.info(f"Test Set F1-Score: {f1:.4f}")
    # ROC-AUC is part of the returned result, so it is reported with the other metrics.
    logger.info(f"Test Set ROC-AUC: {roc_auc:.4f}")
    logger.info("="*50 + "\n")

    res = {
        "f1": round(f1, 3),
        "roc_auc": round(roc_auc, 3),
        "accuracy": round(accuracy, 3),
        "embedder.model_name": str(embedder.model_name),
    }

    logger.info(res)

    return res


In [6]:
from sklearn.cluster import DBSCAN
from datetime import timedelta
from typing import Any
from config import FilterConfig


def _analyze_cluster_recurrence(
    group_df: pd.DataFrame,
    config: FilterConfig,
    fields: FieldConfig
) -> dict[str, Any] | None:
    """
    This is the "High-Precision Filter" (Step 3).
    It applies raw math to a single cluster to check if it's recurrent
    and returns its details if it is.
    """

    # --- 1. Check for minimum size ---
    if len(group_df) < config.min_txns_for_period:
        return None # Not enough data to find a period

    # --- 2. Check Amount Consistency (Raw Math) ---
    amounts = group_df[fields.amount]
    avg_amount = np.mean(amounts)
    std_amount = np.std(amounts)

    if std_amount > config.amount_std_threshold:
        return None # Not a consistent amount (e.g., "Restaurants" category)

    # --- 3. Check Timing Consistency (Raw Math) ---
    dates = group_df[fields.date].sort_values()

    # Calculate deltas (time differences in days)
    deltas = (dates.iloc[1:] - dates.iloc[:-1]).dt.days

    if deltas.empty:
        return None

    avg_period = np.mean(deltas)
    std_period = np.std(deltas)

    if std_period > config.date_std_threshold:
        return None # Not a consistent period (e.g., random coffee purchases)

    # --- 4. Success! We found a recurrent group ---
    last_trx_date = dates.iloc[-1]
    predicted_next_date = last_trx_date + timedelta(days=round(avg_period))

    return {
        "is_recurrent": True,
        "avg_amount": round(avg_amount, 2),
        "period_days": round(avg_period),
        "last_transaction_date": last_trx_date,
        "predicted_next_date": predicted_next_date,
        "transaction_count": len(group_df),
        "transaction_ids": group_df.index.tolist()
    }

# --- 3. Main Public Function ---

def find_recurrent_groups(
    account_df: pd.DataFrame,
    embedder: EmbeddingService,
    config: FilterConfig,
    fields: FieldConfig
) -> list[dict[str, Any]]:
    """
    Runs the full Stage 2 (Embed -> Cluster -> Filter) pipeline on
    a single account's transaction DataFrame.

    Args:
        account_df: Transactions of one account. Not modified; a copy is cleaned.
        embedder: Object exposing `embed(list_of_texts)`; its output is passed
            to DBSCAN with the cosine metric.
        config: Reads `dbscan_eps` and `dbscan_min_samples` here and is passed
            on to `_analyze_cluster_recurrence`.
        fields: Column-name mapping; reads `date`, `amount` and `text`.

    Returns:
        One dict per cluster accepted by `_analyze_cluster_recurrence`, extended
        with an "example_text" key. Empty when there is nothing to analyze.
    """

    logger.info(f"\nAnalyzing account with {len(account_df)} transactions...")

    if account_df.empty:
        return []

    # Work on a copy so the caller's frame is left untouched.
    account_df = account_df.copy()
    # Unparseable dates become NaT and are removed together with other missing values.
    account_df[fields.date] = pd.to_datetime(account_df[fields.date], errors='coerce')
    account_df = account_df.dropna(subset=[fields.date, fields.amount, fields.text])

    # Cleaning can remove every row (e.g. no date parses). Stop here rather
    # than hand an empty batch to the embedder and to DBSCAN.
    if account_df.empty:
        logger.info("No usable transactions left after cleaning; nothing to cluster.")
        return []

    # --- Step 1: Embed (The "What") ---
    logger.info("Step 1: Generating text embeddings...")

    all_embeddings = embedder.embed(account_df[fields.text].tolist())

    # --- Step 2: Cluster (The "Group") ---
    logger.info("Step 2: Clustering transactions...")
    clusterer = DBSCAN(
        eps=config.dbscan_eps,
        min_samples=config.dbscan_min_samples,
        metric="cosine"
    )
    cluster_labels = clusterer.fit_predict(all_embeddings)
    account_df['cluster_id'] = cluster_labels

    # DBSCAN labels noise points -1; they do not count as a group.
    n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    n_noise = (cluster_labels == -1).sum()
    logger.info(f"Found {n_clusters} potential groups and {n_noise} noise points.")

    # --- Step 3: Filter (The "When" & "How Much") ---
    logger.info("Step 3: Filtering clusters for recurrence...")
    found_groups = []

    for cluster_id, group_df in account_df.groupby('cluster_id'):
        if cluster_id == -1:
            continue

        result = _analyze_cluster_recurrence(group_df, config, fields)

        if result is not None:
            # First text of the cluster, as a human-readable sample.
            result['example_text'] = group_df[fields.text].iloc[0]
            found_groups.append(result)

    logger.info(f"Analysis complete. Confirmed {len(found_groups)} recurrent groups.")
    return found_groups

# --- 4. Mock-data demo: see the cells under the "Mock Data" heading below ---



### INIT


### Mock Data

In [7]:
from config import get_device

# Display the compute device returned by config.get_device()
# (the recorded run reports CPU).
get_device()


device(type='cpu')

In [8]:
from data import create_mock_data

# Experiment-wide settings, shared with the real-data runners further down.
exp_config = ExperimentConfig()

# Synthetic dataset, seeded from the experiment config.
mock_df = create_mock_data(exp_config.random_state)

# Smoke-test runner: ALBERT embeddings plus binned / top-k engineered features.
runner_mock = ExpRunner.create(
    exp_params=exp_config,
    full_df=mock_df,
    emb_params=EmbeddingService.Params(model_name=BaseModel.ALBERT),
    feat_proc_params=FeatProcParams(n_bins=20, k_top=50),
    field_config=FieldConfig(),
)

# Build the ALBERT feature set, then train and score on it; the metrics dict
# returned by the last call is the cell output.
mock_albert_data = runner_mock.build_data(
    EmbeddingService.Params(model_name=BaseModel.ALBERT)
)
runner_mock.run_experiment(mock_albert_data)

2025-11-11 16:33:45,326 - INFO - Creating mock data (2000 samples)...
2025-11-11 16:33:45,382 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-11 16:33:45,383 - INFO - Loading embedding model: albert-base-v2...
2025-11-11 16:33:47,151 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-11 16:33:47,152 - INFO - 
2025-11-11 16:33:47,152 - INFO - RUNNING EXPERIMENT WITH BASE MODEL: albert-base-v2
2025-11-11 16:33:47,157 - INFO - Total data: 3337, Train: 667, Test: 2670
2025-11-11 16:33:47,158 - INFO - Train set positive class %: 60.27%
2025-11-11 16:33:47,159 - INFO - 
Processing text features (using EmbeddingService)...
2025-11-11 16:33:47,159 - INFO - Embedding 667 train texts...
2025-11-11 16:33:47,187 - INFO - Embedding 2670 test texts...
2025-11-11 16:33:47,290 - INFO - 
Processing numerical/date features...
2025-11-11 16:33:47,291 - INFO - Fitting processor on 667 rows...
2025-11-11 16:33:47,292 - INFO - Fitting categorical a

{'f1': 0.99,
 'roc_auc': 1.0,
 'accuracy': 0.988,
 'embedder.model_name': 'albert-base-v2'}

In [9]:
# Same mock runner, with the embedding model swapped to MiniLM-L12.
# NOTE(review): the recorded log shows the MiniLM model being loaded, yet the
# returned dict reports 'embedder.model_name': 'albert-base-v2' — the result
# label looks stale; verify in runner.ExpRunner.
mock_minilm_data = runner_mock.build_data(
    EmbeddingService.Params(model_name=BaseModel.MiniLM_L12)
)
runner_mock.run_experiment(mock_minilm_data)


2025-11-11 16:33:47,839 - INFO - Creating new EmbeddingService(model_name=sentence-transformers/all-MiniLM-L12-v2)
2025-11-11 16:33:47,839 - INFO - Loading embedding model: sentence-transformers/all-MiniLM-L12-v2...
2025-11-11 16:33:48,690 - INFO - Model sentence-transformers/all-MiniLM-L12-v2 loaded onto cpu. Cache at cache/sentence-transformers__all-MiniLM-L12-v2
2025-11-11 16:33:48,690 - INFO - 
2025-11-11 16:33:48,691 - INFO - RUNNING EXPERIMENT WITH BASE MODEL: sentence-transformers/all-MiniLM-L12-v2
2025-11-11 16:33:48,695 - INFO - Total data: 3337, Train: 667, Test: 2670
2025-11-11 16:33:48,696 - INFO - Train set positive class %: 60.27%
2025-11-11 16:33:48,696 - INFO - 
Processing text features (using EmbeddingService)...
2025-11-11 16:33:48,696 - INFO - Embedding 667 train texts...
2025-11-11 16:33:48,720 - INFO - Embedding 2670 test texts...
2025-11-11 16:33:48,812 - INFO - 
Processing numerical/date features...
2025-11-11 16:33:48,813 - INFO - Fitting processor on 667 rows..

{'f1': 1.0,
 'roc_auc': 1.0,
 'accuracy': 1.0,
 'embedder.model_name': 'albert-base-v2'}

### SETUP

### Vanilla Data

In [10]:
from pathlib import Path
# Directory holding the processed CSV files. Set the REC_DATA_DIR environment
# variable to run on another machine; the default is the original local path.
DATA_PATH = Path(os.environ.get('REC_DATA_DIR', 'C:/Work/Data/proc/'))
# Upper bound on the number of cleaned rows used by the experiments below.
MAX_ROWS = 20_000
field_config = FieldConfig()

df = pd.read_csv(DATA_PATH/'rec_data2.csv')

missing_values = df.isnull().sum()
logging.info("Missing values per column:")
logging.info(missing_values)

# Drop rows missing any of the four columns named in field_config, then keep
# the first MAX_ROWS of what remains.
required_columns = [field_config.date, field_config.amount, field_config.text, field_config.label]
df_cleaned = df.dropna(subset=required_columns).iloc[:MAX_ROWS]


2025-11-11 16:33:49,491 - INFO - Missing values per column:
2025-11-11 16:33:49,491 - INFO - id                    0
accountId             0
date                  0
amount                0
bankRawDescription    0
isRecurring           0
dtype: int64


In [11]:
# Baseline: whatever FeatProcParams enables by default.
feat_params = FeatProcParams()

# Ablation: the four date/amount feature switches explicitly turned off.
_disabled_feature_flags = (
    "use_cyclical_dates",
    "use_categorical_dates",
    "use_continuous_amount",
    "use_categorical_amount",
)
feat_params_off = FeatProcParams(**dict.fromkeys(_disabled_feature_flags, False))


In [12]:
# Real-data runner: ALBERT embeddings with the default feature processing.
albert_params = EmbeddingService.Params(model_name=BaseModel.ALBERT)
runner1 = ExpRunner.create(
    exp_params=exp_config,
    full_df=df_cleaned,
    emb_params=albert_params,
    feat_proc_params=feat_params,
    field_config=FieldConfig(),
)

In [13]:
# Build the features for runner1, then train and score; the metrics dict
# returned by the last call is the cell output.
runner1_data = runner1.build_data()
runner1.run_experiment(runner1_data)


2025-11-11 16:33:49,535 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-11 16:33:49,536 - INFO - Loading embedding model: albert-base-v2...
2025-11-11 16:33:50,487 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-11 16:33:50,488 - INFO - 
2025-11-11 16:33:50,488 - INFO - RUNNING EXPERIMENT WITH BASE MODEL: albert-base-v2
2025-11-11 16:33:50,495 - INFO - Total data: 20000, Train: 4000, Test: 16000
2025-11-11 16:33:50,496 - INFO - Train set positive class %: 18.43%
2025-11-11 16:33:50,496 - INFO - 
Processing text features (using EmbeddingService)...
2025-11-11 16:33:50,497 - INFO - Embedding 4000 train texts...
2025-11-11 16:33:50,574 - INFO - Embedding 16000 test texts...
2025-11-11 16:33:50,864 - INFO - 
Processing numerical/date features...
2025-11-11 16:33:50,865 - INFO - Fitting processor on 4000 rows...
2025-11-11 16:33:50,865 - INFO - Fitting categorical amount features...
2025-11-11 16:33:50,867 - INFO - --- Magic Num

{'f1': 0.677,
 'roc_auc': 0.912,
 'accuracy': 0.896,
 'embedder.model_name': 'albert-base-v2'}

In [14]:
# Ablation run: a copy of runner1 that uses feat_params_off (the four
# date/amount feature switches disabled) instead of the default parameters.
runner2 = ExpRunner.copy(runner1)
runner2.feat_proc_params = feat_params_off

runner2_data = runner2.build_data()
runner2.run_experiment(runner2_data)

2025-11-11 16:33:53,640 - INFO - Creating new EmbeddingService(model_name=albert-base-v2)
2025-11-11 16:33:53,640 - INFO - Loading embedding model: albert-base-v2...
2025-11-11 16:33:54,537 - INFO - Model albert-base-v2 loaded onto cpu. Cache at cache/albert-base-v2
2025-11-11 16:33:54,538 - INFO - 
2025-11-11 16:33:54,539 - INFO - RUNNING EXPERIMENT WITH BASE MODEL: albert-base-v2
2025-11-11 16:33:54,546 - INFO - Total data: 20000, Train: 4000, Test: 16000
2025-11-11 16:33:54,546 - INFO - Train set positive class %: 18.43%
2025-11-11 16:33:54,547 - INFO - 
Processing text features (using EmbeddingService)...
2025-11-11 16:33:54,547 - INFO - Embedding 4000 train texts...
2025-11-11 16:33:54,637 - INFO - Embedding 16000 test texts...
2025-11-11 16:33:54,957 - INFO - Total feature space size: 768 features
2025-11-11 16:33:54,958 - INFO - Training the final, simple classifier (MLP)...
2025-11-11 16:33:57,090 - INFO - Training complete in 2.13 seconds.
2025-11-11 16:33:57,138 - INFO - {'f1

{'f1': 0.751,
 'roc_auc': 0.934,
 'accuracy': 0.913,
 'embedder.model_name': 'albert-base-v2'}

In [21]:
# run_experiment(get_embedder(BaseModel.DISTILBERT), df_cleaned, feat_params)
from runner import ExperimentConfig
# Scratch check that ExperimentConfig accepts a `test_size` override.
# NOTE(review): the recorded output of this cell is a TypeError naming the
# keyword 'TEST_SIZE', which does not match the lowercase keyword used here,
# and the cell counters are out of order (In [21] precedes In [19]). The output
# looks stale — re-run on a fresh kernel to confirm.
ExperimentConfig(test_size=0.5)

TypeError: ExperimentConfig.__init__() got an unexpected keyword argument 'TEST_SIZE'

In [19]:
# Variant of runner1 with a 50% test split.
# NOTE(review): the recorded output of this cell is a TypeError about an
# unexpected 'TEST_SIZE' keyword, which does not match the code below —
# confirm the ExperimentConfig field name and re-run on a fresh kernel.
runner3 = ExpRunner.copy(runner1)
runner3.exp_params = ExperimentConfig(test_size=0.5)

runner3_data = runner3.build_data()
runner3.run_experiment(runner3_data)

TypeError: ExperimentConfig.__init__() got an unexpected keyword argument 'TEST_SIZE'