# SBERT-based multilabel genre training pipeline

This notebook loads cached datasets, builds SBERT embeddings once, and trains several multilabel classifiers (LightGBM One-vs-Rest, dense neural network, classifier chains) with per-class threshold tuning and metric logging.

In [6]:
import os
import json
import pickle
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, classification_report

# Project layout: the parent of the working directory is used as the project
# root, with sibling `data/` (inputs) and `models/` (outputs) folders.
PROJECT_ROOT = Path('..').resolve()
DATA_DIR = PROJECT_ROOT.joinpath('data')
MODELS_DIR = PROJECT_ROOT.joinpath('models')
MODELS_DIR.mkdir(parents=True, exist_ok=True)  # every artifact below is written here

# NOTE(review): unpickling runs arbitrary code — only load files produced by
# this project's own preprocessing step.
movies_df = pickle.loads(DATA_DIR.joinpath('movies_df.pkl').read_bytes())

# Each split pickle holds a 3-tuple that is unpacked into inputs, label matrix
# and ids (names as used by the rest of the notebook).
X_train, y_train, ids_train = pickle.loads(DATA_DIR.joinpath('train_data.pkl').read_bytes())
X_test, y_test, ids_test = pickle.loads(DATA_DIR.joinpath('test_data.pkl').read_bytes())

# Fitted label binarizer; its `classes_` attribute names the label columns in reports.
mlb = pickle.loads(DATA_DIR.joinpath('mlb.pkl').read_bytes())

# Number of label columns in the training target matrix.
num_labels = y_train.shape[1]
print(f'Train samples: {len(X_train)}, Test samples: {len(X_test)}, Labels: {num_labels}')

Train samples: 8000, Test samples: 2000, Labels: 23


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding checkpoint, plus the .npy files used to cache its output
# for the train and test splits.
sbert_model_name = 'sentence-transformers/all-MiniLM-L6-v2'
X_train_emb_path = DATA_DIR.joinpath('X_train_sbert.npy')
X_test_emb_path = DATA_DIR.joinpath('X_test_sbert.npy')

def encode_texts(texts, model, batch_size=256):
    """Encode texts into normalized sentence embeddings, one chunk at a time.

    Args:
        texts: Sliceable sequence of strings that supports ``len()``.
        model: Object with a SentenceTransformer-style ``encode`` method. For
            empty input it must also provide
            ``get_sentence_embedding_dimension()``.
        batch_size: Number of texts passed to ``model.encode`` per call.

    Returns:
        2-D numpy array with one row per input text, in input order. Empty
        input yields an array of shape ``(0, embedding_dim)``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError(f'batch_size must be positive, got {batch_size}')
    if len(texts) == 0:
        # np.vstack([]) raises on an empty list, so build the empty matrix
        # explicitly with the model's embedding width.
        dim = model.get_sentence_embedding_dimension()
        return np.empty((0, dim), dtype=np.float32)

    encoded = []
    for start in range(0, len(texts), batch_size):
        # list(...) materialises the slice so array/Series inputs work too.
        batch = list(texts[start:start + batch_size])
        emb = model.encode(batch, convert_to_numpy=True, normalize_embeddings=True, show_progress_bar=True)
        encoded.append(emb)
    return np.vstack(encoded)

# Reuse the cached embeddings only when both files exist AND their row counts
# still match the current splits. A stale cache (e.g. the pickled splits were
# regenerated) would otherwise silently pair features with the wrong labels.
X_train_emb = X_test_emb = None
if X_train_emb_path.exists() and X_test_emb_path.exists():
    X_train_emb = np.load(X_train_emb_path)
    X_test_emb = np.load(X_test_emb_path)
    if X_train_emb.shape[0] == len(X_train) and X_test_emb.shape[0] == len(X_test):
        print('Loaded cached SBERT embeddings.')
    else:
        print('Cached SBERT embeddings do not match the current data; recomputing.')
        X_train_emb = X_test_emb = None

if X_train_emb is None:
    # Cache miss (or stale cache): encode both splits once and persist them.
    sbert = SentenceTransformer(sbert_model_name)
    X_train_emb = encode_texts(X_train, sbert)
    X_test_emb = encode_texts(X_test, sbert)
    np.save(X_train_emb_path, X_train_emb)
    np.save(X_test_emb_path, X_test_emb)
    print('Computed and cached SBERT embeddings.')

print('Embedding shapes:', X_train_emb.shape, X_test_emb.shape)

In [None]:
from sklearn.model_selection import train_test_split
try:
    from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
    HAS_MSKF = True
except ImportError:
    HAS_MSKF = False

# Hold out part of the training embeddings as a validation set; later cells
# use it for threshold tuning and as Keras validation data.
if X_train_emb.shape[0] != y_train.shape[0]:
    raise ValueError(
        f'Embeddings ({X_train_emb.shape[0]} rows) and labels '
        f'({y_train.shape[0]} rows) are misaligned; rebuild the embedding cache.'
    )

n_val_folds = 5  # validation share is 1 / n_val_folds on both code paths
indices = np.arange(X_train_emb.shape[0])
if HAS_MSKF:
    # Preferred path: multilabel-stratified folds; the first fold is the validation set.
    mskf = MultilabelStratifiedKFold(n_splits=n_val_folds, shuffle=True, random_state=42)
    train_idx, val_idx = next(mskf.split(indices, y_train))
else:
    # Fallback without iterstrat: plain random split with the same validation
    # share as the stratified path (it previously held out only 10%).
    train_idx, val_idx = train_test_split(indices, test_size=1.0 / n_val_folds, random_state=42)
    train_idx, val_idx = np.sort(train_idx), np.sort(val_idx)

X_tr, X_val = X_train_emb[train_idx], X_train_emb[val_idx]
y_tr, y_val = y_train[train_idx], y_train[val_idx]

print(f'Train split: {X_tr.shape}, Val split: {X_val.shape}')

def tune_thresholds(probs, y_true, grid=None):
    """Choose, for each label column, the threshold that maximises binary F1.

    Args:
        probs: 2-D array of predicted scores, one column per label.
        y_true: Binary indicator array with the same shape as ``probs``.
        grid: Iterable of candidate thresholds. Defaults to 17 evenly spaced
            values from 0.1 to 0.9.

    Returns:
        float64 array with one threshold per label. A label keeps 0.5 when no
        candidate reaches an F1 above 0; on ties the earliest candidate in
        ``grid`` wins.

    Raises:
        ValueError: If ``probs`` is not 2-D or its shape differs from ``y_true``.
    """
    if len(np.shape(probs)) != 2 or np.shape(probs) != np.shape(y_true):
        raise ValueError(
            f'probs and y_true must be 2-D with identical shapes, '
            f'got {np.shape(probs)} and {np.shape(y_true)}'
        )
    if grid is None:
        grid = np.linspace(0.1, 0.9, 17)
    # float64 keeps each stored threshold exactly equal to the grid value that
    # was scored; float32 would round e.g. 0.15 to 0.15000000596...
    thresholds = np.full(probs.shape[1], 0.5, dtype=np.float64)
    for j in range(probs.shape[1]):
        best_t, best_f1 = 0.5, 0.0
        for t in grid:
            preds = (probs[:, j] >= t).astype(int)
            score = f1_score(y_true[:, j], preds, zero_division=0)
            # Strict '>' keeps the first candidate among equal scores.
            if score > best_f1:
                best_f1 = score
                best_t = t
        thresholds[j] = best_t
    return thresholds

def evaluate_and_log(model_name, y_true, probs, thresholds, report_path, target_names=None):
    """Binarise scores, compute summary metrics and write a plain-text report.

    Args:
        model_name: Name written on the first line of the report file.
        y_true: Binary indicator matrix of the true labels.
        probs: Predicted scores to be thresholded.
        thresholds: Cut-off(s) compared against ``probs`` with ``>=``; anything
            that broadcasts against ``probs`` is accepted.
        report_path: File to (over)write with the metrics and per-class report.
        target_names: Optional label names for the per-class report. Defaults
            to ``mlb.classes_`` from the notebook namespace.

    Returns:
        Tuple ``(metrics, report)``: a dict with micro/macro F1 and micro
        precision/recall, and the ``classification_report`` text.
    """
    if target_names is None:
        target_names = mlb.classes_
    thresholds = np.asarray(thresholds)
    preds = (probs >= thresholds).astype(int)
    metrics = {
        'micro_f1': f1_score(y_true, preds, average='micro', zero_division=0),
        'macro_f1': f1_score(y_true, preds, average='macro', zero_division=0),
        'micro_precision': precision_score(y_true, preds, average='micro', zero_division=0),
        'micro_recall': recall_score(y_true, preds, average='micro', zero_division=0)
    }
    report = classification_report(y_true, preds, target_names=list(target_names), zero_division=0)
    # Explicit encoding so the report does not depend on the platform default.
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(f'{model_name} metrics\n')
        for k, v in metrics.items():
            f.write(f'{k}: {v}\n')
        f.write('\n')
        f.write(report)
    return metrics, report

def save_thresholds(path, thresholds):
    """Write label names and their thresholds to ``path`` as indented JSON.

    The label names come from ``mlb.classes_``; thresholds are converted to
    plain Python floats so they are JSON-serialisable.
    """
    payload = {
        'classes': list(mlb.classes_),
        'thresholds': list(map(float, thresholds)),
    }
    with open(path, 'w') as out_file:
        json.dump(payload, out_file, indent=2)

def save_predictions(prefix, probs, thresholds):
    """Save raw scores and thresholded 0/1 predictions under ``MODELS_DIR``.

    Two files are written: ``<prefix>_test_probs.npy`` and
    ``<prefix>_test_preds.npy``. Returns the 0/1 prediction array.
    """
    cutoffs = np.asarray(thresholds)
    # A score equal to its cut-off counts as a positive prediction.
    hard_preds = (probs >= cutoffs).astype(int)
    np.save(MODELS_DIR / f'{prefix}_test_probs.npy', probs)
    np.save(MODELS_DIR / f'{prefix}_test_preds.npy', hard_preds)
    return hard_preds

# Maps model name -> metrics dict; each training cell below adds its own entry
# and the final cell turns the mapping into a summary table.
results_summary = {}

Train split: (6412, 384), Val split: (1588, 384)


In [5]:
from lightgbm import LGBMClassifier
from sklearn.multiclass import OneVsRestClassifier
from joblib import dump

# One LightGBM binary classifier per label, wrapped in One-vs-Rest.
# Requires X_tr / y_tr / X_val / y_val from the split cell, so run that first.
lgbm_estimator = LGBMClassifier(
    n_estimators=800,
    learning_rate=0.05,
    subsample=0.9,
    subsample_freq=1,  # LightGBM ignores `subsample` unless the bagging frequency is > 0
    colsample_bytree=0.9,
    reg_alpha=0.0,
    reg_lambda=1.0,
    class_weight='balanced',
    random_state=42,
    n_jobs=1  # labels are parallelised by OneVsRestClassifier; nesting -1 in both oversubscribes CPUs
)
lgbm_ovr = OneVsRestClassifier(lgbm_estimator, n_jobs=-1)
lgbm_ovr.fit(X_tr, y_tr)

# Tune per-label thresholds on the validation split only, then apply them
# unchanged to the test set.
val_probs_lgbm = lgbm_ovr.predict_proba(X_val)
lgbm_thresholds = tune_thresholds(val_probs_lgbm, y_val)

test_probs_lgbm = lgbm_ovr.predict_proba(X_test_emb)
lgbm_metrics, lgbm_report = evaluate_and_log(
    'sbert_lgbm_ovr',
    y_test,
    test_probs_lgbm,
    lgbm_thresholds,
    MODELS_DIR / 'metrics_sbert_lgbm.txt'
)

# Persist the model, its thresholds and the test-set predictions.
dump(lgbm_ovr, MODELS_DIR / 'sbert_lgbm_ovr.pkl')
save_thresholds(MODELS_DIR / 'sbert_lgbm_thresholds.json', lgbm_thresholds)
save_predictions('sbert_lgbm_ovr', test_probs_lgbm, lgbm_thresholds)
results_summary['sbert_lgbm_ovr'] = lgbm_metrics
lgbm_metrics

NameError: name 'X_tr' is not defined

In [None]:
import tensorflow as tf
# Fix TensorFlow's global random seed before the model is built and trained.
tf.random.set_seed(42)

def build_mlp(input_dim, output_dim):
    """Build the dense multilabel network (uncompiled).

    Architecture: 512 -> 256 -> 128 ReLU units, with dropout of 0.3 and 0.2
    after the first two hidden layers, then ``output_dim`` sigmoid units so
    each label gets an independent score.
    """
    layers = tf.keras.layers
    inputs = tf.keras.Input(shape=(input_dim,))
    hidden = inputs
    # (units, dropout rate) per hidden layer; None means no dropout after it.
    for units, drop_rate in ((512, 0.3), (256, 0.2), (128, None)):
        hidden = layers.Dense(units, activation='relu')(hidden)
        if drop_rate is not None:
            hidden = layers.Dropout(drop_rate)(hidden)
    outputs = layers.Dense(output_dim, activation='sigmoid')(hidden)
    return tf.keras.Model(inputs, outputs)

# --- Build and compile -------------------------------------------------------
mlp_model = build_mlp(X_tr.shape[1], num_labels)

# NOTE(review): the `legacy` optimizer namespace is reportedly unavailable
# under Keras 3 — confirm against the pinned TensorFlow version.
adam = tf.keras.optimizers.legacy.Adam(learning_rate=1e-3)
focal_loss = tf.keras.losses.BinaryFocalCrossentropy(gamma=2.0)
auc_metric = tf.keras.metrics.AUC(name='auc', multi_label=True)
mlp_model.compile(optimizer=adam, loss=focal_loss, metrics=[auc_metric])

# --- Train ---------------------------------------------------------------------
# Halve the learning rate when the monitored loss plateaus; stop early on
# stalled validation loss and roll back to the best weights.
lr_schedule = tf.keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5, verbose=1)
early_stop = tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True, monitor='val_loss')
callbacks = [lr_schedule, early_stop]

history = mlp_model.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=50, batch_size=128,
    callbacks=callbacks,
    verbose=2,
)

# --- Tune thresholds on validation, evaluate on test --------------------------
val_probs_mlp = mlp_model.predict(X_val, batch_size=256)
mlp_thresholds = tune_thresholds(val_probs_mlp, y_val)
test_probs_mlp = mlp_model.predict(X_test_emb, batch_size=256)

mlp_tag = 'sbert_mlp_dense'
mlp_metrics, mlp_report = evaluate_and_log(
    mlp_tag, y_test, test_probs_mlp, mlp_thresholds,
    MODELS_DIR / 'metrics_sbert_mlp.txt',
)

# --- Persist artifacts ---------------------------------------------------------
mlp_model.save(MODELS_DIR / 'sbert_mlp_dense.keras')
save_thresholds(MODELS_DIR / 'sbert_mlp_thresholds.json', mlp_thresholds)
save_predictions(mlp_tag, test_probs_mlp, mlp_thresholds)
results_summary[mlp_tag] = mlp_metrics
mlp_metrics

Epoch 1/50


In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import ClassifierChain

# Train a small ensemble of classifier chains. Each seed gives a different
# random label order; their probabilities are averaged at prediction time.
chain_seeds = [42, 52, 62]
chains = []
for seed in chain_seeds:
    base = LogisticRegression(
        max_iter=500,
        C=2.0,
        solver='saga',
        penalty='l2',
        n_jobs=-1,
        class_weight='balanced',
        random_state=seed
    )
    # Pass the base estimator positionally: recent scikit-learn releases
    # deprecate the `base_estimator` keyword in favour of `estimator`, and the
    # first positional slot works on both sides of that rename.
    chain = ClassifierChain(base, order='random', random_state=seed)
    chain.fit(X_tr, y_tr)
    chains.append(chain)

def chain_predict_proba(chain_list, X):
    """Return the element-wise mean of ``predict_proba(X)`` over all chains."""
    per_chain = []
    for member in chain_list:
        per_chain.append(member.predict_proba(X))
    # Stack to (n_chains, n_samples, n_labels) and average over the chain axis.
    return np.asarray(per_chain).mean(axis=0)

# Average the chains' probabilities, tune thresholds on the validation split,
# then score the test set with those thresholds.
val_probs_chain = chain_predict_proba(chains, X_val)
chain_thresholds = tune_thresholds(val_probs_chain, y_val)
test_probs_chain = chain_predict_proba(chains, X_test_emb)

chain_tag = 'sbert_classifier_chains'
chain_metrics, chain_report = evaluate_and_log(
    chain_tag, y_test, test_probs_chain, chain_thresholds,
    MODELS_DIR / 'metrics_sbert_classifier_chains.txt',
)

# Persist the ensemble, its thresholds and the test predictions.
# `dump` is joblib's, imported in the LightGBM cell.
dump(chains, MODELS_DIR / 'sbert_classifier_chains.pkl')
save_thresholds(MODELS_DIR / 'sbert_classifier_chains_thresholds.json', chain_thresholds)
save_predictions(chain_tag, test_probs_chain, chain_thresholds)
results_summary[chain_tag] = chain_metrics
chain_metrics

In [None]:
# One row per model, one column per metric; saved as CSV and shown inline.
summary_path = MODELS_DIR / 'metrics_sbert_models_summary.csv'
summary_df = pd.DataFrame(results_summary).transpose()
summary_df.to_csv(summary_path)
summary_df