In [None]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import joblib
import sys

from pathlib import Path
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.pipeline import Pipeline

import lightgbm as lgb

sys.path.append(os.path.abspath(".."))

from src.train import temporal_split
from src.evaluate import evaluate
from src.features import build_features
from src.config import (
    RAW_FILENAME,
    TEXT_COL,
    NUMERIC_COLS,
    CATEGORIC_COLS,
    TARGET,
    TEST_SIZE,
    RANDOM_STATE,
)

In [None]:
# Locate the project root in a way that is safe to re-run. The previous
# `Path().resolve().parent` climbed one directory higher on every execution
# of this cell, because the cell itself changes the working directory.
_cwd = Path().resolve()
# The root is the nearest directory, starting at the current one, that holds
# the `src` package imported above. If none is found, fall back to the parent
# directory, which is what the original code always used.
project_root = next(
    (candidate for candidate in (_cwd, *_cwd.parents) if (candidate / "src").is_dir()),
    _cwd.parent,
)
os.chdir(project_root)
# Working directory as a string; later cells build data/model paths from it.
current_dir = os.getcwd()

In [None]:
# Read the raw CSV, parsing the DATE_EMITTED column as datetimes.
data_path = os.path.join(current_dir, "data", "raw", RAW_FILENAME)
raw = pd.read_csv(data_path, parse_dates=["DATE_EMITTED"])

# Build the modelling frame with the project helper from src.features.
features = build_features(raw)

# Train/test split via the project helper; TEST_SIZE comes from src.config.
train_df, test_df = temporal_split(features, TEST_SIZE)

# The label and the date column are removed from the model inputs.
target_col = TARGET[0]
non_feature_cols = [target_col, "DATE_EMITTED"]

X_train, y_train = train_df.drop(columns=non_feature_cols), train_df[target_col]
X_test, y_test = test_df.drop(columns=non_feature_cols), test_df[target_col]

BASELINE: TF-IDF + LightGBM

In [None]:
# Identifier of this experiment; used below to build the saved model's filename.
name = "baseline"

In [None]:
# Text branch: TF-IDF with default settings. The column is selected by a
# scalar name (not a list) so the vectorizer receives a 1-D series of documents.
tfidf_branch = ("tfidf", TfidfVectorizer(), TEXT_COL[0])

# Categorical branch: integer codes; categories unseen during fit map to -1.
cat_branch = (
    "cat",
    OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    CATEGORIC_COLS,
)

# Numeric branch: columns are forwarded unchanged.
num_branch = ("num", "passthrough", NUMERIC_COLS)

preprocessor = ColumnTransformer(
    transformers=[tfidf_branch, cat_branch, num_branch],
    remainder="drop",  # any column not listed above is discarded
)

# Baseline classifier: 100 trees, balanced class weights, silent training.
lgbm_clf = lgb.LGBMClassifier(
    class_weight="balanced",
    n_estimators=100,
    random_state=RANDOM_STATE,
    verbose=-1,
)

# Preprocessing and model are chained so they are fitted and saved together.
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("classifier", lgbm_clf)])

In [None]:
# Fit the preprocessing steps and the LightGBM classifier on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Evaluate the fitted pipeline on the held-out split with the project helper.
# The label is taken from `name` (instead of a duplicated string literal) so it
# always matches the filename used when the model is saved.
evaluate(pipeline, X_test, y_test, name)

In [None]:
# Persist the fitted pipeline as models/<name>_pipeline.joblib.
model_path = os.path.join(current_dir, "models", f"{name}_pipeline.joblib")
# Create models/ if it is missing so the dump does not fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

ENHANCED BASELINE: TF-IDF + LightGBM

In [None]:
# Identifier of this experiment; passed to `evaluate` and used in the saved model's filename.
name = "baseline_enhanced"

In [None]:
preprocessor = ColumnTransformer(
    transformers=[
        (
            "tfidf",
            TfidfVectorizer(
                sublinear_tf=True,  # log-scaled term frequency: dampens very frequent terms
                ngram_range=(1, 2),  # unigrams and bigrams
                min_df=3,  # drop terms seen in fewer than 3 documents (hapax noise)
                max_features=5000,  # cap the vocabulary size
                strip_accents="unicode",
            ),
            # Use the configured text column, as the baseline pipeline does,
            # rather than a hard-coded column name.
            TEXT_COL[0],
        ),
        (
            "cat",
            # Integer codes; categories unseen during fit map to -1.
            OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
            CATEGORIC_COLS,
        ),
        ("num", "passthrough", NUMERIC_COLS),
    ],
    remainder="drop",
)

# Larger, slower-learning model than the baseline.
# `callbacks` is deliberately NOT passed here: it is an argument of `fit`, and
# the constructor would forward it as an unknown model parameter, so the
# callback would never run. To log evaluation results, pass `callbacks`
# together with an `eval_set` when fitting.
lgbm_clf = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=63,
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=-1,
)

# Preprocessing and model are chained so they are fitted and saved together.
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("classifier", lgbm_clf),
    ]
)

In [None]:
# Fit the enhanced preprocessing steps and classifier on the training split.
pipeline.fit(X_train, y_train)

In [None]:
# Evaluate the fitted pipeline on the held-out split with the project helper;
# `name` is passed as the last argument (presumably a run label — confirm in src/evaluate.py).
evaluate(pipeline, X_test, y_test, name)

In [None]:
# Persist the fitted pipeline as models/<name>_pipeline.joblib.
model_path = os.path.join(current_dir, "models", f"{name}_pipeline.joblib")
# Create models/ if it is missing so the dump does not fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)

EMBEDDING + LightGBM

In [None]:
# Identifier of this experiment; passed to `evaluate` and used in the saved model's filename.
name = "embeddings_lgbm"

In [None]:
# Precomputed embedding matrices are stored as .npy files under data/processed.
processed_dir = os.path.join(current_dir, "data", "processed")
emb_path_train = os.path.join(processed_dir, "X_train_emb.npy")
emb_path_test = os.path.join(processed_dir, "X_test_emb.npy")

# NOTE(review): rows are assumed to be in the same order as X_train / X_test — confirm
# against the code that produced these files.
X_train_emb, X_test_emb = np.load(emb_path_train), np.load(emb_path_test)

In [None]:
# Encode the categorical columns as integer codes. The encoder is fitted on the
# training split only; categories unseen during fit map to -1.
enc = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
X_train_cat = enc.fit_transform(X_train[CATEGORIC_COLS])
X_test_cat = enc.transform(X_test[CATEGORIC_COLS])

# Final design matrices: [embeddings | encoded categoricals | numeric columns].
X_train_final = np.hstack([X_train_emb, X_train_cat, X_train[NUMERIC_COLS].to_numpy()])
X_test_final = np.hstack([X_test_emb, X_test_cat, X_test[NUMERIC_COLS].to_numpy()])

# `callbacks` is deliberately NOT passed to the constructor: it is an argument
# of `fit`, and the constructor would forward it as an unknown model parameter,
# so the callback would never run. To log evaluation results, pass `callbacks`
# together with an `eval_set` to `fit`.
lgbm_clf = lgb.LGBMClassifier(
    n_estimators=500,
    learning_rate=0.05,
    num_leaves=63,
    class_weight="balanced",
    random_state=RANDOM_STATE,
    n_jobs=-1,
    verbose=-1,
)
lgbm_clf.fit(X_train_final, y_train)

In [None]:
# Evaluate the fitted classifier on the stacked test matrix with the project helper;
# `name` is passed as the last argument (presumably a run label — confirm in src/evaluate.py).
evaluate(lgbm_clf, X_test_final, y_test, name)

In [None]:
# Wrap the already-fitted classifier in a single-step Pipeline so the saved
# artifact has the same type as the other experiments' artifacts.
# NOTE(review): the fitted OrdinalEncoder `enc` and the embedding step are not
# part of this artifact, so whoever loads it must rebuild the stacked
# [embeddings | categoricals | numerics] matrix before calling predict.
pipeline = Pipeline(
    [
        ("classifier", lgbm_clf),
    ]
)

model_path = os.path.join(current_dir, "models", f"{name}_pipeline.joblib")
# Create models/ if it is missing so the dump does not fail on a fresh checkout.
os.makedirs(os.path.dirname(model_path), exist_ok=True)
joblib.dump(pipeline, model_path)