In [3]:
import argparse
import os

import numpy as np
import pandas as pd
import matplotlib

import matplotlib.pyplot as plt

from mapie.classification import MapieClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report

from src.conformal_prediction.utils import chunked_mapie_predict
# For loading your trained TransformerClassifier
from src.training.model import TransformerClassifier
from src.utils import load_config, get_logger

In [4]:
# Run settings: which environment's config/artifacts to use and which model
# entry of the training config the cells below read.
env, model_type = "prod", "BERT"

# Training configuration for the selected environment.
training_config = load_config(
    folder="../../config",
    file_name="training_config",
    env=env,
)

In [9]:
# `alpha` is forwarded to chunked_mapie_predict when building prediction sets.
alpha = 0.15

# Inputs (paths are relative to the notebook and depend on `env`).
input_model = f"../../output_models/{env}/trained_model"
input_dataset = f"../../output_data/{env}/supervised_dataset.parquet"
input_outliers = f"../../output_data/{env}/supervised_dataset_phase2.parquet"

# Output folder for the parquet file and figures written further down.
output_reports = f"../../output_reports/{env}/outlier_detection"

In [10]:
# Load the supervised dataset that the features/target are selected from.
df = pd.read_parquet(input_dataset)

In [11]:
# The feature and target column selections come from the model's section of
# the training config.
_model_cfg = training_config.training[model_type]
X = df[_model_cfg.features]
y = df[_model_cfg.target]

# Number of distinct target values; passed to the classifier as num_labels.
num_labels = y.nunique()

In [43]:
# Split sizes and the shared seed all come from the training config.
_split_cfg = training_config.training[model_type]
_seed = training_config.training.random_state

# Step 1: hold out the test portion from the full dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=_split_cfg.test_size, random_state=_seed, stratify=y
)

# Step 2: carve a validation set out of the remaining training data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=_split_cfg.val_size, random_state=_seed, stratify=y_train
)

# Step 3: reserve part of the test portion as the conformal-prediction
# calibration set (X_cp / y_cp); the rest stays as the test set.
X_test, X_cp, y_test, y_cp = train_test_split(
    X_test, y_test, test_size=_split_cfg.cp_size, random_state=_seed, stratify=y_test
)

In [44]:
# Read the phase-2 dataset and select the same feature/target columns as the
# main dataset.
outliers = pd.read_parquet(input_outliers)
_outlier_cfg = training_config.training[model_type]
X_outliers = outliers[_outlier_cfg.features]
y_outliers = outliers[_outlier_cfg.target]

# Append those rows to the test set with a fresh 0..n-1 index on both objects.
# NOTE: this overwrites X_test / y_test, so re-running the cell without
# re-running the split cell appends the rows a second time.
X_test = pd.concat((X_test, X_outliers), ignore_index=True)
y_test = pd.concat((y_test, y_outliers), ignore_index=True)

In [45]:
# Keep a 5% sample of the test rows. Row positions are drawn once and applied
# to both X_test and y_test, so each feature row keeps its own target even if
# the configured random_state is None (two independent .sample() calls would
# then pick different rows for X and y).
# NOTE: this overwrites X_test / y_test; re-running the cell samples again.
assert len(X_test) == len(y_test), "X_test and y_test must have the same length"
_sample_positions = (
    pd.Series(np.arange(len(X_test)))
    .sample(frac=0.05, random_state=training_config.training.random_state)
    .to_numpy()
)
X_test = X_test.iloc[_sample_positions].reset_index(drop=True)
y_test = y_test.iloc[_sample_positions].reset_index(drop=True)

In [15]:
# Build the project's TransformerClassifier from the saved model directory,
# with one output label per distinct target value in the dataset.
clf = TransformerClassifier(local_model_path=input_model, num_labels=num_labels)

In [20]:
# Wrap the loaded classifier in MAPIE's conformal classifier.
# cv="prefit": the estimator is used as-is rather than refitted by MAPIE.
_mapie_seed = training_config.training.random_state
mapie_clf = MapieClassifier(
    estimator=clf,
    cv="prefit",
    method="score",
    random_state=_mapie_seed,
)

In [22]:
# Calibrate the conformal wrapper on the held-out calibration split
# (X_cp / y_cp). The return value is bound to `_` only to suppress the
# cell's repr output.
_ = mapie_clf.fit(X_cp, y_cp)

In [23]:
# Point predictions and conformal prediction sets for the sampled test rows.
# chunked_mapie_predict is a project helper; presumably it calls
# mapie_clf.predict in batches -- confirm against src.conformal_prediction.utils.
point_preds, conf_sets = chunked_mapie_predict(mapie_clf, X_test, alpha=alpha)

In [24]:
# Sanity check: shape of the point-prediction array.
point_preds.shape

(76,)

In [27]:
# Sanity check: shape of the prediction-set array. The set-size histogram
# cell indexes it as [row, :, 0].
conf_sets.shape

(76, 2, 1)

In [46]:
# Put features and target side by side in one frame. The column-wise concat
# aligns on the index, which the sampling cell reset on both objects.
outlier_test = pd.concat([X_test, y_test], axis=1)

In [48]:
# Flag a row as an outlier when its conformal prediction set contains more
# than one label (the model cannot commit to a single class at this alpha).
outlier_test["outlier"] = False
if conf_sets is not None:
    # conf_sets is indexed as [row, class, alpha]. The set size is the number
    # of classes kept for the first alpha, so sum over the CLASS axis.
    # (Slicing [i, 0, :] instead walks the alpha axis for class 0, which can
    # never exceed 1 with a single alpha and so never flags anything.)
    _row_set_sizes = np.asarray(conf_sets)[: outlier_test.shape[0], :, 0].sum(axis=1)
    outlier_test["outlier"] = _row_set_sizes > 1

In [50]:
# Inspect the test rows together with their outlier flag.
outlier_test

Unnamed: 0,processed_text_to_analyse,label,outlier
0,pantalla termostatos instalacion queda difumin...,1,False
1,1 error azce6thinkrb e1 regularmente 1 termost...,0,False
2,2 termostatos lite radio blanco responden ce6 ...,0,False
3,envia reparar garantia azce6bluefacecb f00vziq...,1,False
4,motor triple funciona motor compuerta corto bl...,0,False
...,...,...,...
71,porner marcha instalacion flexa 3.0 termostato...,1,False
72,pantalla azul termostato cara azul pantalla azul,1,False
73,pantalla duplicada pensar pensar mostrar super...,1,False
74,azce6bluefaceccb ns fookylz cara azul cableada...,1,False


In [52]:
# Make sure the report folder exists, then write only the flagged rows to
# parquet (without the index).
os.makedirs(output_reports, exist_ok=True)
outliers_df = outlier_test.loc[outlier_test["outlier"]]
outliers_file = os.path.join(output_reports, "detected_outliers.parquet")
outliers_df.to_parquet(outliers_file, index=False)

In [53]:
# Histogram of conformal prediction-set sizes over the test rows, saved as a
# PNG in the report folder.
if conf_sets is not None:
    # Set size per row: number of classes kept for the first alpha
    # (conf_sets is indexed as [row, class, alpha]).
    _n_rows = outlier_test.shape[0]
    set_sizes = (
        np.asarray(conf_sets)[:_n_rows, :, 0].sum(axis=1).astype(int).tolist()
    )

    # Integer bin edges starting at 0 so empty prediction sets are counted
    # too; default=0 keeps the cell from failing when there are no rows.
    _largest_set = max(set_sizes, default=0)
    _size_edges = list(range(0, _largest_set + 2))

    # Do not depend on an earlier cell having created the folder.
    os.makedirs(output_reports, exist_ok=True)

    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(set_sizes, bins=_size_edges, color="skyblue", edgecolor="black")
    # Use the configured alpha so the title cannot drift from the run.
    ax.set_title(f"Distribution of Conformal Set Sizes (alpha={alpha})")
    ax.set_xlabel("Set size")
    ax.set_ylabel("Frequency")
    ax.set_xticks(_size_edges)
    dist_plot_path = os.path.join(
        output_reports, "conformal_set_size_distribution.png"
    )
    fig.savefig(dist_plot_path)
    plt.close(fig)

In [56]:
# How many rows were flagged as outliers (rows, columns).
outliers_df.shape

(0, 3)

In [57]:
# Bar chart of flagged rows per true label; skipped when nothing was flagged.
if len(outliers_df) > 0:
    _target_col = training_config.training[model_type].target
    _counts = outliers_df.groupby(_target_col).size()
    outliers_by_class = _counts.sort_values(ascending=False)

    plt.figure(figsize=(10, 6))
    outliers_by_class.plot(kind="bar", color="tomato")
    plt.title("Number of Outliers by True Label")
    plt.xlabel("Label")
    plt.ylabel("Count of Outliers")

    outliers_bar_path = os.path.join(output_reports, "outliers_by_class.png")
    plt.savefig(outliers_bar_path)
    plt.close()