
# Text Classification PoC v2
- SentenceTransformer: `intfloat/multilingual-e5-small`（instruction tuned）で高品質な埋め込みを取得。
- 5-fold Stratified CV + hold-out で SVM/LogReg/MLP/LightGBM/Bagging を比較し、Optuna で主要モデルをチューニング。
- カテゴリ名の埋め込みとのコサイン類似度や確信度モニタリング、類似問い合わせ抽出など運用要件をPoC化。


In [2]:

# Purpose: Import dependencies, set constants/logging, and configure deterministic behavior for reproducibility.
from __future__ import annotations

import json
import os
import logging
from pathlib import Path
from typing import Callable, Dict, Iterable, List, Tuple

import numpy as np
import optuna
import pandas as pd
import polars as pl
from openai import OpenAI
from pydantic import BaseModel, Field
from lightgbm import LGBMClassifier
from sentence_transformers import SentenceTransformer
from sklearn import metrics, model_selection, preprocessing
from sklearn.ensemble import BaggingClassifier, StackingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from dotenv import load_dotenv
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI

RANDOM_SEED = 42
DATA_PATH = Path("data/data.csv")
# EMBED_MODEL_NAME = "intfloat/multilingual-e5-small"
EMBED_MODEL_NAME = "stsb-xlm-r-multilingual"
E5_INSTRUCTION = "query: "
STRUCTURED_OUTPUT_MODEL = "gpt-4o-mini"
STRUCTURED_SAMPLE_SIZE = 5

load_dotenv()
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

np.random.seed(RANDOM_SEED)
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
pl.enable_string_cache()


In [3]:

# Purpose: Provide data loading helpers built on top of Polars LazyFrame for scalable ingestion.
def load_lazy_dataset(csv_path: Path) -> pl.LazyFrame:
    """Return a Polars LazyFrame scanning the CSV without materializing rows."""
    return pl.scan_csv(csv_path)


def preview_lazyframe(lazy_frame: pl.LazyFrame, sample_size: int = 5) -> pl.DataFrame:
    """Collect a small sample to inspect schema/values while keeping the query lazy."""
    return lazy_frame.head(sample_size).collect()


def materialize_dataset(lazy_frame: pl.LazyFrame) -> pd.DataFrame:
    """Materialize the LazyFrame into a Pandas DataFrame for compatibility with sklearn."""
    return lazy_frame.collect().to_pandas()


def encode_labels(
    df: pd.DataFrame,
    label_column: str = "category",
    new_column: str = "category_id",
) -> Tuple[pd.DataFrame, preprocessing.LabelEncoder]:
    """Encode string labels into integers and append them as a new column."""
    encoded_df = df.copy()
    encoder = preprocessing.LabelEncoder()
    encoded_df[new_column] = encoder.fit_transform(encoded_df[label_column])
    return encoded_df, encoder


def summarize_categories(
    df: pd.DataFrame,
    category_name_col: str = "category",
) -> pd.DataFrame:
    """Return counts and ratios per category for quick EDA."""
    summary = (
        df[category_name_col]
        .value_counts()
        .rename("count")
        .to_frame()
        .assign(ratio=lambda frame: frame["count"] / frame["count"].sum())
        .reset_index()
        .rename(columns={"index": category_name_col})
    )
    return summary


In [4]:

# Purpose: Load the dataset lazily, preview it, and persist a Pandas copy with encoded labels for modeling.
lazy_dataset = load_lazy_dataset(DATA_PATH)
preview_lazyframe(lazy_dataset, sample_size=5)


アイデアＩＤ,アイデアタイトル,text,category,Unnamed: 4_level_0
i64,str,str,str,str
441,"""フレックスタイム制の導入による働き方改革""","""現行の「スライド勤務」に加えて、新たに「フレックスタイム制度…","""福利厚生・制度""",
468,"""フレキシブルワーク～フレックスタイム制＆週休３日の進化系～""","""フルフレックス制とコアタイム制のハイブリッド型勤務制度の導入…","""福利厚生・制度""",
1634,"""ホワイトボードをコミュニケーションツールにしませんか？""","""在席や外出を確認するＷＢを、担当者にマウスをあてると顔写真や…","""総務""",
1737,"""人となりがわかるホワイトボード""","""ホワイトボードは社員の基本情報がすぐにわかり、便利なツールで…","""総務""",
349,"""ホワイトボードの進化版！ コミュニケーションボードの開発！""","""現状のホワイトボードでは誰がどんな仕事をしているか分かりませ…","""総務""",


In [5]:

# Purpose: Materialize the dataset, attach numeric labels, and summarize category balance for reference.
records_df, label_encoder = encode_labels(materialize_dataset(lazy_dataset))
category_summary = summarize_categories(records_df)
category_summary


Unnamed: 0,category,count,ratio
0,福利厚生・制度,292,0.226884
1,総務,192,0.149184
2,施工,178,0.138306
3,営業,131,0.101787
4,設計,125,0.097125
5,サービス,110,0.08547
6,ESG,102,0.079254
7,新規事業,69,0.053613
8,商品開発,64,0.049728
9,生産,24,0.018648


In [None]:

# Purpose: Configure OpenAI structured-output helper so we can probe a few samples early.

if not OPENAI_API_KEY:
    raise EnvironmentError(
        f"{OPENAI_API_KEY} is not set. Export your OpenAI API key before running the structured-output probe."
    )

openai_client = OpenAI(api_key=OPENAI_API_KEY)


# Pydantic スキーマ（LLMの出力をここにパース）
class Classification(BaseModel):
    record_id: str = Field(..., description="ID of the record from the input list")
    category: str = Field(..., description="One of the allowed categories")
    reason: str = Field(..., description="Why this category was selected (Japanese)")


class ClassificationBatch(BaseModel):
    classifications: List[Classification]


In [None]:
# ランタイム設定（必要なら上書き可）
STRUCTURED_SAMPLE_SIZE = 50
RANDOM_SEED = 42
MODEL = "gpt-4o-mini"  # 例。使いたいモデル名に置き換え

# メイン関数
def run_tiny_structured_classification(records_df: pd.DataFrame) -> pd.DataFrame:
    """
    - records_df: index がレコードID、columns に 'text' と 'category' を想定
    - 返り値: llm_category / llm_reason が追加された DataFrame
    """
    # 必須列チェック
    for col in ("text", "category"):
        if col not in records_df.columns:
            raise ValueError(f"records_df に '{col}' 列が必要です。")

    # インデックスを文字列IDに
    df = records_df.copy()
    if df.index.name is None:
        df.index.name = "id"
    df.index = df.index.astype(str)

    # 1) 候補カテゴリ
    candidate_categories = sorted(df["category"].dropna().astype(str).unique().tolist())
    if not candidate_categories:
        raise ValueError("candidate_categories が空です。records_df['category'] に値が必要です。")

    # 2) サンプル抽出
    n = min(STRUCTURED_SAMPLE_SIZE, len(df))
    sample_df = (
        df.reset_index(names="record_id")
          .sample(n=n, random_state=RANDOM_SEED)
          .assign(record_id=lambda d: d["record_id"].astype(str))
          .sort_values("record_id", kind="stable")
          .loc[:, ["record_id", "text"]]
    )

    # 3) JSON ペイロード
    prompt_payload = json.dumps(sample_df.to_dict(orient="records"), ensure_ascii=False, indent=2)

    # 4) LangChain の structured output（Pydantic をそのまま指定）
    #    記事の方法に倣い、with_structured_output() でスキーマを与える
    llm = ChatOpenAI(model=MODEL, temperature=0)
    structured_llm = llm.with_structured_output(ClassificationBatch)  # ←ここがポイント

    system_text = (
        "You are assisting with Japanese idea classification. "
        "Select exactly one category per record and explain your choice."
    )

    user_parts = [
        "Allowed categories:\n- " + "\n- ".join(candidate_categories),
        (
            "Respond with a `classifications` array ordered by `record_id` "
            "where each item has `record_id`, `category`, and `reason`. "
            "Here are the records (JSON list):"
        ),
        prompt_payload,
    ]
    user_text = "\n\n".join(user_parts)

    # LangChain は messages=[...] を dict で渡すより、invoke に単一文字列を渡すのが簡単
    # ただしシステム/ユーザーの区別をつけたいので、messages 形式で渡す
    result: ClassificationBatch = structured_llm.invoke([
        {"role": "system", "content": system_text},
        {"role": "user", "content": user_text},
    ])
    # ↑ Pydantic にパース済みの `ClassificationBatch` が返る（記事の手法）:contentReference[oaicite:1]{index=1}

    # 5) DataFrame に反映
    out = df.assign(llm_category=pd.NA, llm_reason=pd.NA).copy()

    # ID 整合性チェック
    ids_from_model = [c.record_id for c in result.classifications]
    not_found = [rid for rid in ids_from_model if rid not in out.index]
    if not_found:
        raise KeyError(f"records_df に存在しない record_id が含まれています: {not_found[:5]} ...")

    # 反映
    for c in result.classifications:
        out.loc[c.record_id, "llm_category"] = c.category
        out.loc[c.record_id, "llm_reason"] = c.reason

    return out

# ==== 4) 使い方 ====
new_df = run_tiny_structured_classification(records_df)


2025-11-11 19:12:20,349 INFO HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Unnamed: 0_level_0,text,category,llm_category,llm_reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,現行の「スライド勤務」に加えて、新たに「フレックスタイム制度」を導入を目指します。\r\nコ...,福利厚生・制度,,
1,フルフレックス制とコアタイム制のハイブリッド型勤務制度の導入を提案します。１週間・1か月単位...,福利厚生・制度,,
2,在席や外出を確認するＷＢを、担当者にマウスをあてると顔写真や趣味などが見えれば、業務の話のつ...,総務,,
3,ホワイトボードは社員の基本情報がすぐにわかり、便利なツールですが、面識のない人は顔や人となり...,総務,,
4,現状のホワイトボードでは誰がどんな仕事をしているか分かりません！！\r\nコミュニケーション...,総務,,
...,...,...,...,...
95,Teamsで海外の人と現場の工事状況や安全パトロールを実況します。\r\nもしリアルタイムで...,施工,,
96,小人数グループでの気軽に話せるＷＥＢ研修の継続的定期開催と実際に申請担当に同行し物件の役所調...,設計,,
97,単純に可動棚を設けるだけでなく、可動棚に何を収納するかまで提案された事例やマニュアルを作るこ...,設計,,
98,業務の中で「いつもくりかえし行う同じ作業」削減する取り組みです。中でもCAD作業・作図作業に...,設計,,


In [10]:
has_reason = new_df["llm_reason"].notna() & new_df["llm_reason"].astype(str).str.strip().ne("")
filtered = new_df.loc[has_reason].copy()

filtered.head()

Unnamed: 0_level_0,アイデアＩＤ,アイデアタイトル,text,category,Unnamed: 5_level_0,category_id,llm_category,llm_reason
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
23,1236,ホワイトボードマッピング化,全従業員が必ず使用しているホワイトボードに革命を起こします。\r\n誰がどこにいて、何をして...,総務,,8,営業,業務効率向上のためのホワイトボードの提案で、営業活動に関連する内容だから。
43,1139,人の想いは永遠に（ドローン空撮・動画作品編集）,代々引き継いできた土地の記録を残したい都市農家の大地主より、敷地全体の航空写真と思い入れのあ...,営業,,3,サービス,土地の記録を残すサービス提案で、サービスに関連する内容だから。
44,66,謎解きを活用した自律型研修システムの開発,電鉄会社などで流行っている謎解きを使って、受講生の心に残る研修システムを作りませんか？\r\...,サービス,,1,福利厚生・制度,研修システムの提案で、社員育成に関する福利厚生に関連する内容だから。
51,1392,シャーメゾンＣＡＦＥ,今までの経験や知識を活かしつつ、いつもとは違う仕事もやってみたい。計画段階よりシャーメゾン物...,新規事業,,4,新規事業,CAFEの出店提案で、新たな事業の展開に関する内容だから。
65,69,SAKE－積水ハウスのオリジナルノベルティの開発,オリジナルのノベルティとして日本酒を追加してみてはいかがでしょうか。\r\n世界的に日本のS...,営業,,3,商品開発,日本酒のノベルティ提案で、商品開発に関連する内容だから。


In [12]:
filtered.to_csv("data/filtered_classifications.csv", index=True)    


In [19]:

# Purpose: Define embedding helpers tailored for instruction-tuned E5 family models.
def normalize_for_instruction(text: str, instruction: str = E5_INSTRUCTION) -> str:
    """Prefix text with the E5 instruction keyword to unlock better multilingual embeddings."""
    clean = text.strip().replace("\n", " ")
    return f"{instruction}{clean}"


def build_embedder(model_name: str = EMBED_MODEL_NAME) -> SentenceTransformer:
    """Load and return a SentenceTransformer model; default is multilingual-e5-small."""
    return SentenceTransformer(model_name)


def embed_texts(
    embedder: SentenceTransformer,
    texts: Iterable[str],
    instruction: str = E5_INSTRUCTION,
    batch_size: int = 32,
    normalize_embeddings: bool = True,
) -> np.ndarray:
    """Convert iterable of texts into normalized embeddings with the provided instruction prefix."""
    prepared = [normalize_for_instruction(text, instruction) for text in texts]
    vectors = embedder.encode(
        prepared,
        batch_size=batch_size,
        show_progress_bar=False,
        normalize_embeddings=normalize_embeddings,
    )
    return np.asarray(vectors, dtype=np.float32)


In [20]:

# Purpose: Instantiate the embedder and transform all texts into dense vectors.
embedder = build_embedder()
text_embeddings = embed_texts(embedder, records_df["text"])
text_embeddings.shape


2025-11-10 15:08:00,613 INFO Use pytorch device_name: cpu
2025-11-10 15:08:00,615 INFO Load pretrained SentenceTransformer: stsb-xlm-r-multilingual


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/709 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/505 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(1287, 768)

In [21]:

# Purpose: Build category-level embeddings and cosine-similarity features to enrich the model input space.
def build_category_embeddings(
    embedder: SentenceTransformer,
    category_texts: Iterable[str],
    instruction: str = E5_INSTRUCTION,
) -> np.ndarray:
    """Generate normalized embeddings for each category description/name."""
    prepared = [normalize_for_instruction(text, instruction) for text in category_texts]
    return embedder.encode(
        prepared,
        batch_size=len(prepared),
        show_progress_bar=False,
        normalize_embeddings=True,
    )


def concat_similarity_features(
    text_vectors: np.ndarray,
    category_vectors: np.ndarray,
) -> np.ndarray:
    """Compute cosine similarities and append them to the original embeddings."""
    similarities = text_vectors @ category_vectors.T
    return np.hstack([text_vectors, similarities])


category_embeddings = build_category_embeddings(embedder, label_encoder.classes_)
augmented_embeddings = concat_similarity_features(text_embeddings, category_embeddings)
augmented_embeddings.shape


(1287, 778)

In [24]:

# Purpose: Create modeling utilities for CV + holdout evaluation on multiple classifiers, including LightGBM & bagging.
def build_model_registry(random_state: int = RANDOM_SEED) -> Dict[str, Callable[[], object]]:
    """Return a set of lazily-initialized sklearn-compatible estimators with consistent preprocessing."""

    def make_logistic() -> LogisticRegression:
        return LogisticRegression(max_iter=4000, random_state=random_state)

    def make_bagging_logistic() -> BaggingClassifier:
        return BaggingClassifier(
            estimator=make_logistic(),
            n_estimators=15,
            max_samples=0.85,
            bootstrap=True,
            random_state=random_state,
            n_jobs=None,
        )

    return {
        "linear_svm": lambda: Pipeline(
            [("scaler", StandardScaler()), ("clf", LinearSVC(random_state=random_state))]
        ),
        "logistic_regression": lambda: Pipeline(
            [("scaler", StandardScaler()), ("clf", make_logistic())]
        ),
        "mlp_classifier": lambda: Pipeline(
            [
                ("scaler", StandardScaler()),
                (
                    "clf",
                    MLPClassifier(
                        hidden_layer_sizes=(384,),
                        activation="relu",
                        max_iter=1500,
                        random_state=random_state,
                    ),
                ),
            ]
        ),
        "lightgbm": lambda: LGBMClassifier(
            n_estimators=600,
            learning_rate=0.05,
            subsample=0.9,
            colsample_bytree=0.9,
            reg_lambda=0.1,
            random_state=random_state,
            n_jobs=-1,
            verbosity=-1,
        ),
        "bagging_log_reg": lambda: Pipeline(
            [("scaler", StandardScaler()), ("clf", make_bagging_logistic())]
        ),
    }


def cross_validate_models(
    model_builders: Dict[str, Callable[[], object]],
    features: np.ndarray,
    labels: np.ndarray,
    cv_splits: int = 5,
    seed: int = RANDOM_SEED,
) -> pd.DataFrame:
    """Run stratified K-fold cross validation and return accuracy/F1 means and stds per model."""
    splitter = model_selection.StratifiedKFold(
        n_splits=cv_splits,
        shuffle=True,
        random_state=seed,
    )
    rows: List[Dict[str, float]] = []
    for name, builder in model_builders.items():
        estimator = builder()
        scores = model_selection.cross_validate(
            estimator,
            features,
            labels,
            cv=splitter,
            scoring=["accuracy", "f1_macro"],
            n_jobs=None,
        )
        rows.append(
            {
                "name": name,
                "cv_accuracy_mean": scores["test_accuracy"].mean(),
                "cv_accuracy_std": scores["test_accuracy"].std(),
                "cv_macro_f1_mean": scores["test_f1_macro"].mean(),
                "cv_macro_f1_std": scores["test_f1_macro"].std(),
            }
        )
    return (
        pd.DataFrame(rows)
        .sort_values("cv_macro_f1_mean", ascending=False)
        .reset_index(drop=True)
    )


def holdout_report_for_model(
    model,
    features: np.ndarray,
    labels: np.ndarray,
    label_names: Iterable[str],
    test_size: float = 0.2,
    seed: int = RANDOM_SEED,
) -> Tuple[Dict[str, float], str]:
    """Train/validate a single model on a hold-out split and return metrics plus report."""
    X_train, X_test, y_train, y_test = model_selection.train_test_split(
        features,
        labels,
        test_size=test_size,
        stratify=labels,
        random_state=seed,
    )
    model.fit(X_train, y_train)
    preds = model.predict(X_test)
    metrics_dict = {
        "accuracy": metrics.accuracy_score(y_test, preds),
        "macro_f1": metrics.f1_score(y_test, preds, average="macro"),
    }
    report = metrics.classification_report(y_test, preds, target_names=list(label_names))
    return metrics_dict, report


In [23]:

# Purpose: Execute cross validation across all models and inspect their ranking.
model_registry = build_model_registry()
cv_results = cross_validate_models(model_registry, augmented_embeddings, records_df["category_id"].values)
cv_results




Unnamed: 0,name,cv_accuracy_mean,cv_accuracy_std,cv_macro_f1_mean,cv_macro_f1_std
0,mlp_classifier,0.522954,0.018226,0.44122,0.026654
1,bagging_log_reg,0.514394,0.031474,0.428015,0.042082
2,logistic_regression,0.485651,0.025082,0.404763,0.035221
3,lightgbm,0.512065,0.014932,0.403454,0.014276
4,linear_svm,0.43901,0.024648,0.358406,0.024228


In [25]:

# Purpose: Evaluate the top-2 CV models (logistic + bagging) and LightGBM on a hold-out split for sanity check.
ranked_models = cv_results["name"].tolist()
selected = [name for name in ranked_models if name in {"logistic_regression", "bagging_log_reg", "lightgbm"}]
reports: List[Dict[str, object]] = []
for name in selected:
    model = model_registry[name]()
    metrics_dict, report = holdout_report_for_model(
        model,
        augmented_embeddings,
        records_df["category_id"].values,
        label_encoder.classes_,
    )
    reports.append({"name": name, **metrics_dict, "report": report})
reports


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


[{'name': 'bagging_log_reg',
  'accuracy': 0.5077519379844961,
  'macro_f1': 0.42477571126918273,
  'report': '              precision    recall  f1-score   support\n\n         ESG       0.56      0.70      0.62        20\n        サービス       0.35      0.32      0.33        22\n        商品開発       0.50      0.38      0.43        13\n          営業       0.31      0.35      0.33        26\n        新規事業       0.25      0.29      0.27        14\n          施工       0.56      0.64      0.60        36\n          生産       0.00      0.00      0.00         5\n     福利厚生・制度       0.68      0.61      0.64        59\n          総務       0.55      0.55      0.55        38\n          設計       0.46      0.48      0.47        25\n\n    accuracy                           0.51       258\n   macro avg       0.42      0.43      0.42       258\nweighted avg       0.50      0.51      0.50       258\n'},
 {'name': 'logistic_regression',
  'accuracy': 0.46511627906976744,
  'macro_f1': 0.4059759313380746,
  'report

In [26]:

# Purpose: Use Optuna to tune the Logistic Regression pipeline and push macro-F1 higher via CV.
def tune_logistic_with_optuna(
    features: np.ndarray,
    labels: np.ndarray,
    n_trials: int = 25,
    seed: int = RANDOM_SEED,
) -> optuna.Study:
    """Optimize logistic regression hyperparameters with Optuna and return the study."""

    def objective(trial: optuna.Trial) -> float:
        C = trial.suggest_float("C", 1e-2, 10.0, log=True)
        fit_intercept = trial.suggest_categorical("fit_intercept", [True, False])
        class_weight = trial.suggest_categorical("class_weight", [None, "balanced"])
        tol = trial.suggest_float("tol", 1e-5, 1e-2, log=True)
        model = Pipeline(
            [
                ("scaler", StandardScaler()),
                (
                    "clf",
                    LogisticRegression(
                        C=C,
                        fit_intercept=fit_intercept,
                        class_weight=class_weight,
                        tol=tol,
                        max_iter=5000,
                        random_state=seed,
                    ),
                ),
            ]
        )
        splitter = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        scores = model_selection.cross_val_score(
            model,
            features,
            labels,
            cv=splitter,
            scoring="f1_macro",
            n_jobs=None,
        )
        return float(scores.mean())

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    return study


log_reg_study = tune_logistic_with_optuna(augmented_embeddings, records_df["category_id"].values)
log_reg_study.best_value, log_reg_study.best_params


[I 2025-11-10 16:12:51,630] A new study created in memory with name: no-name-c283deb0-6c3d-4692-a01a-018e4b97c22f


[I 2025-11-10 16:12:55,093] Trial 0 finished with value: 0.41567665942826454 and parameters: {'C': 0.13292918943162169, 'fit_intercept': True, 'class_weight': None, 'tol': 2.9375384576328295e-05}. Best is trial 0 with value: 0.41567665942826454.
[I 2025-11-10 16:12:55,625] Trial 1 finished with value: 0.4352056744209397 and parameters: {'C': 0.014936568554617643, 'fit_intercept': True, 'class_weight': None, 'tol': 0.008123245085588688}. Best is trial 1 with value: 0.4352056744209397.
[I 2025-11-10 16:12:56,731] Trial 2 finished with value: 0.3878548994418309 and parameters: {'C': 3.142880890840109, 'fit_intercept': True, 'class_weight': 'balanced', 'tol': 0.00037520558551242813}. Best is trial 1 with value: 0.4352056744209397.
[I 2025-11-10 16:12:58,011] Trial 3 finished with value: 0.39672971649782124 and parameters: {'C': 0.19762189340280073, 'fit_intercept': False, 'class_weight': 'balanced', 'tol': 0.00012562773503807024}. Best is trial 1 with value: 0.4352056744209397.
[I 2025-11-

(0.441221876237626,
 {'C': 0.011055580224171557,
  'fit_intercept': True,
  'class_weight': None,
  'tol': 0.008567052742936003})

In [27]:

# Purpose: Tune LightGBM hyperparameters with Optuna to explore boosted-tree capacity.
def tune_lightgbm_with_optuna(
    features: np.ndarray,
    labels: np.ndarray,
    n_trials: int = 30,
    seed: int = RANDOM_SEED,
) -> optuna.Study:
    """Optimize LightGBM hyperparameters using Optuna on macro-F1."""

    def objective(trial: optuna.Trial) -> float:
        params = {
            "num_leaves": trial.suggest_int("num_leaves", 8, 64),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.2, log=True),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 30),
            "lambda_l1": trial.suggest_float("lambda_l1", 1e-3, 10.0, log=True),
            "lambda_l2": trial.suggest_float("lambda_l2", 1e-3, 10.0, log=True),
        }
        model = LGBMClassifier(
            n_estimators=600,
            subsample=params.pop("bagging_fraction"),
            subsample_freq=params.pop("bagging_freq"),
            colsample_bytree=params.pop("feature_fraction"),
            random_state=seed,
            n_jobs=-1,
            verbosity=-1,
            **params,
        )
        splitter = model_selection.StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)
        scores = model_selection.cross_val_score(
            model,
            features,
            labels,
            cv=splitter,
            scoring="f1_macro",
            n_jobs=None,
        )
        return float(scores.mean())

    study = optuna.create_study(
        direction="maximize",
        sampler=optuna.samplers.TPESampler(seed=seed),
    )
    study.optimize(objective, n_trials=n_trials, show_progress_bar=False)
    return study


lightgbm_study = tune_lightgbm_with_optuna(augmented_embeddings, records_df["category_id"].values)
lightgbm_study.best_value, lightgbm_study.best_params


[I 2025-11-10 16:15:29,847] A new study created in memory with name: no-name-b09df28b-6ede-4441-b553-cd51f606fe2b
[I 2025-11-10 16:16:00,713] Trial 0 finished with value: 0.3965001586695598 and parameters: {'num_leaves': 29, 'max_depth': 12, 'learning_rate': 0.08960785365368121, 'feature_fraction': 0.8394633936788146, 'bagging_fraction': 0.6624074561769746, 'bagging_freq': 2, 'min_child_samples': 6, 'lambda_l1': 2.9154431891537547, 'lambda_l2': 0.2537815508265665}. Best is trial 0 with value: 0.3965001586695598.
[I 2025-11-10 16:16:52,977] Trial 1 finished with value: 0.39253566253902605 and parameters: {'num_leaves': 48, 'max_depth': 3, 'learning_rate': 0.18276027831785724, 'feature_fraction': 0.9329770563201687, 'bagging_fraction': 0.6849356442713105, 'bagging_freq': 2, 'min_child_samples': 9, 'lambda_l1': 0.016480446427978974, 'lambda_l2': 0.12561043700013558}. Best is trial 0 with value: 0.3965001586695598.
[I 2025-11-10 16:17:35,240] Trial 2 finished with value: 0.4188868862360866

(0.4220133057192686,
 {'num_leaves': 64,
  'max_depth': 9,
  'learning_rate': 0.023023881732257948,
  'feature_fraction': 0.6312583301950802,
  'bagging_fraction': 0.7932107758827048,
  'bagging_freq': 4,
  'min_child_samples': 23,
  'lambda_l1': 0.15369004781182277,
  'lambda_l2': 0.0017872862735480418})

In [14]:

# Purpose: Build stacking/voting ensembles from the Optuna-tuned base models (non-bagging) and evaluate them.
def build_tuned_logistic(best_params: Dict[str, object], seed: int = RANDOM_SEED) -> Pipeline:
    """Instantiate a StandardScaler+LogReg pipeline using Optuna-best hyperparameters."""
    params = best_params.copy()
    return Pipeline(
        [
            ("scaler", StandardScaler()),
            (
                "clf",
                LogisticRegression(
                    C=params.get("C", 1.0),
                    fit_intercept=params.get("fit_intercept", True),
                    class_weight=params.get("class_weight"),
                    tol=params.get("tol", 1e-4),
                    max_iter=5000,
                    random_state=seed,
                ),
            ),
        ]
    )


def build_tuned_lightgbm(best_params: Dict[str, object], seed: int = RANDOM_SEED) -> LGBMClassifier:
    """Instantiate a LightGBM classifier with Optuna-best hyperparameters."""
    params = best_params.copy()
    feature_fraction = params.pop("feature_fraction", 0.9)
    bagging_fraction = params.pop("bagging_fraction", 0.9)
    bagging_freq = params.pop("bagging_freq", 1)
    return LGBMClassifier(
        n_estimators=600,
        subsample=bagging_fraction,
        subsample_freq=bagging_freq,
        colsample_bytree=feature_fraction,
        random_state=seed,
        n_jobs=-1,
        verbosity=-1,
        **params,
    )


def build_stacking_ensemble(
    log_reg: Pipeline,
    lgbm: LGBMClassifier,
    seed: int = RANDOM_SEED,
) -> StackingClassifier:
    """Create a stacking classifier that blends tuned LogReg and LightGBM."""
    estimators = [
        ("logreg", log_reg),
        ("lgbm", lgbm),
    ]
    final_estimator = LogisticRegression(max_iter=4000, random_state=seed)
    return StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        stack_method="auto",
        passthrough=False,
        n_jobs=None,
    )


def build_voting_ensemble(
    log_reg: Pipeline,
    lgbm: LGBMClassifier,
) -> VotingClassifier:
    """Return a soft-voting ensemble combining tuned LogReg and LightGBM."""
    return VotingClassifier(
        estimators=[("logreg", log_reg), ("lgbm", lgbm)],
        voting="soft",
        weights=[0.6, 0.4],
        n_jobs=None,
    )


tuned_log_reg = build_tuned_logistic(log_reg_study.best_params)
tuned_lgbm = build_tuned_lightgbm(lightgbm_study.best_params)
stacking_model = build_stacking_ensemble(tuned_log_reg, tuned_lgbm)
voting_model = build_voting_ensemble(tuned_log_reg, tuned_lgbm)
stacking_metrics, stacking_report = holdout_report_for_model(
    stacking_model,
    augmented_embeddings,
    records_df["category_id"].values,
    label_encoder.classes_,
)
voting_metrics, voting_report = holdout_report_for_model(
    voting_model,
    augmented_embeddings,
    records_df["category_id"].values,
    label_encoder.classes_,
)
stacking_metrics, voting_metrics


NameError: name 'lightgbm_study' is not defined

In [None]:

# Purpose: Display the Optuna-ensemble classification reports for qualitative inspection.
print("=== Stacking ensemble report ===")
print(stacking_report)
print("=== Voting ensemble report ===")
print(voting_report)


In [None]:

# Purpose: Fit the tuned logistic model on all data to compute per-sample confidence scores and flag low-confidence predictions.
def compute_confidence_table(
    model: Pipeline,
    features: np.ndarray,
    labels: np.ndarray,
    label_names: Iterable[str],
    threshold: float = 0.8,
) -> Tuple[pd.DataFrame, Pipeline]:
    """Return a DataFrame with predictions, confidences, and a low-confidence flag."""
    fitted = model.fit(features, labels)
    probs = fitted.predict_proba(features)
    pred_ids = probs.argmax(axis=1)
    confidence = probs.max(axis=1)
    df = pd.DataFrame(
        {
            "id": records_df["id"],
            "text": records_df["text"],
            "true_label": records_df["category_label"],
            "pred_label": [label_names[idx] for idx in pred_ids],
            "pred_label_id": pred_ids,
            "confidence": confidence,
        }
    )
    df["low_confidence"] = df["confidence"] < threshold
    return df, fitted


confidence_df, tuned_log_reg_fitted = compute_confidence_table(
    tuned_log_reg,
    augmented_embeddings,
    records_df["category_id"].values,
    label_encoder.classes_,
    threshold=0.85,
)
confidence_df.sort_values("confidence").head(10)


In [None]:

# Purpose: List low-confidence cases so reviewers can manually re-label them.
low_confidence_cases = confidence_df.query("low_confidence").copy()
low_confidence_cases[["id", "text", "pred_label", "confidence"]].head(10)


In [None]:

# Purpose: Within each predicted class, surface highly similar texts to batch manual review.
def find_similar_within_prediction(
    base_embeddings: np.ndarray,
    ids: Iterable[int],
    texts: Iterable[str],
    predicted_labels: Iterable[str],
    top_k: int = 3,
    min_similarity: float = 0.9,
) -> List[Dict[str, object]]:
    """For each sample, return closest neighbors that share the predicted label."""
    id_array = np.asarray(list(ids))
    text_array = np.asarray(list(texts))
    label_array = np.asarray(list(predicted_labels))
    cos = cosine_similarity(base_embeddings)
    groups: List[Dict[str, object]] = []
    for idx in range(len(base_embeddings)):
        same_mask = label_array == label_array[idx]
        candidate_indices = np.where(same_mask)[0]
        sims = cos[idx, candidate_indices]
        neighbors = []
        for candidate_idx, sim in zip(candidate_indices, sims):
            if candidate_idx == idx or sim < min_similarity:
                continue
            neighbors.append(
                {
                    "neighbor_id": int(id_array[candidate_idx]),
                    "similarity": float(sim),
                    "neighbor_text": text_array[candidate_idx],
                }
            )
        if neighbors:
            neighbors = sorted(neighbors, key=lambda item: item["similarity"], reverse=True)[:top_k]
            groups.append(
                {
                    "id": int(id_array[idx]),
                    "pred_label": label_array[idx],
                    "text": text_array[idx],
                    "neighbors": neighbors,
                }
            )
    return groups


similar_groups = find_similar_within_prediction(
    text_embeddings,
    records_df["id"],
    records_df["text"],
    confidence_df["pred_label"],
    top_k=3,
    min_similarity=0.92,
)
similar_groups[:5]


In [None]:

# Purpose: Materialize similar-group output as a DataFrame for downstream tooling.
similar_groups_df = pd.json_normalize(similar_groups, sep=".")
similar_groups_df.head(10)
