In [1]:
# predict_all_tasks.ipynb

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# === Chargement des modèles fine-tunés ===
model_A = AutoModelForSequenceClassification.from_pretrained("./best_model_task_A")
tokenizer_A = AutoTokenizer.from_pretrained("roberta-large")

model_B = AutoModelForSequenceClassification.from_pretrained("./best_model_task_B")
tokenizer_B = AutoTokenizer.from_pretrained("roberta-large")

model_C = AutoModelForSequenceClassification.from_pretrained("./best_model_task_C")
tokenizer_C = AutoTokenizer.from_pretrained("roberta-large")

E0000 00:00:1744191187.254520    2388 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744191187.258695    2388 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744191187.270433    2388 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744191187.270447    2388 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744191187.270448    2388 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744191187.270450    2388 computation_placer.cc:177] computation placer already registered. Please check linka

In [3]:
# === Texte à prédire ===
df = pd.read_csv("../datasets/trial-data/offenseval-trial.txt", sep="\t", header=None)
df.columns = ["text", "label_A_gold", "label_B_gold", "label_C_gold"]
tweets = df["text"].tolist()

In [4]:
# === Fonction prédiction générique ===
def predict(texts, model, tokenizer):
    inputs = tokenizer(texts, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.argmax(logits, dim=1).numpy()

In [5]:
# === Étape 1 : prédiction Task A ===
y_pred_A = predict(tweets, model_A, tokenizer_A)

In [6]:
# === Étape 2 : prédiction Task B sur tweets OFF ===
off_mask = (y_pred_A == 1)
tweets_B = [t for i, t in enumerate(tweets) if off_mask[i]]
y_pred_B_partial = predict(tweets_B, model_B, tokenizer_B)

IndexError: list index out of range

In [None]:
# === Étape 3 : prédiction Task C sur tweets TIN ===
tin_mask = (y_pred_B_partial == 1)
tweets_C = [t for i, t in enumerate(tweets_B) if tin_mask[i]]
y_pred_C_partial = predict(tweets_C, model_C, tokenizer_C)

In [None]:
# === Reconstruction du DataFrame avec les prédictions ===
pred_A = ["OFF" if x == 1 else "NOT" for x in y_pred_A]
pred_B, pred_C = ["NULL"] * len(tweets), ["NULL"] * len(tweets)

b_idx = 0
for i, is_off in enumerate(off_mask):
    if is_off:
        pred_B[i] = "TIN" if y_pred_B_partial[b_idx] == 1 else "UNT"
        b_idx += 1

c_idx = 0
for i, is_off in enumerate(off_mask):
    if is_off and pred_B[i] == "TIN":
        pred_C[i] = ["IND", "GRP", "OTH"][y_pred_C_partial[c_idx]]
        c_idx += 1

In [None]:
# === Résultat final ===
df["pred_A"] = pred_A
df["pred_B"] = pred_B
df["pred_C"] = pred_C

print(df[["text", "pred_A", "pred_B", "pred_C"]].head(10))