In [17]:
# predict_all_tasks.ipynb

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
import numpy as np

In [18]:
# === Forcer l'exécution sur CPU ===
device = torch.device("cpu")

# === Task A ===
model_A = AutoModelForSequenceClassification.from_pretrained("./final_models/best_model_A_roberta-base")
tokenizer_A = AutoTokenizer.from_pretrained("roberta-base")
model_A.to(device)

# === Task B ===
model_B = AutoModelForSequenceClassification.from_pretrained("./final_models/best_model_B_hateBERT")
tokenizer_B = AutoTokenizer.from_pretrained("GroNLP/hateBERT")
model_B.to(device)

# === Task C ===
model_C = AutoModelForSequenceClassification.from_pretrained("./final_models/best_model_C_hateBERT")
tokenizer_C = AutoTokenizer.from_pretrained("GroNLP/hateBERT")
model_C.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [19]:
# === Texte à prédire ===
df = pd.read_csv("../datasets/trial-data/offenseval-trial.txt", sep="\t", header=None)
df.columns = ["text", "label_A_gold", "label_B_gold", "label_C_gold"]
tweets = df["text"].tolist()

In [20]:
def predict(texts, model, tokenizer, max_length=128):
    model.eval()
    inputs = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=max_length
    )
    # Envoyer inputs sur le même device que le modèle
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    
    with torch.no_grad():
        logits = model(**inputs).logits
    
    return torch.argmax(logits, dim=1).cpu().numpy()

In [21]:
# === Étape 1 : prédiction Task A ===
y_pred_A = predict(tweets, model_A, tokenizer_A)

In [22]:
# === Étape 2 : prédiction Task B sur tweets OFF ===
off_mask = (y_pred_A == 1)
tweets_B = [t for i, t in enumerate(tweets) if off_mask[i]]
y_pred_B_partial = predict(tweets_B, model_B, tokenizer_B)

In [23]:
# === Étape 3 : prédiction Task C sur tweets TIN ===
tin_mask = (y_pred_B_partial == 1)
tweets_C = [t for i, t in enumerate(tweets_B) if tin_mask[i]]
y_pred_C_partial = predict(tweets_C, model_C, tokenizer_C)

In [24]:
# === Reconstruction du DataFrame avec les prédictions ===
pred_A = ["OFF" if x == 1 else "NOT" for x in y_pred_A]
pred_B, pred_C = ["NULL"] * len(tweets), ["NULL"] * len(tweets)

b_idx = 0
for i, is_off in enumerate(off_mask):
    if is_off:
        pred_B[i] = "TIN" if y_pred_B_partial[b_idx] == 1 else "UNT"
        b_idx += 1

c_idx = 0
for i, is_off in enumerate(off_mask):
    if is_off and pred_B[i] == "TIN":
        pred_C[i] = ["IND", "GRP", "OTH"][y_pred_C_partial[c_idx]]
        c_idx += 1

In [25]:
# === Résultat final ===
df["pred_A"] = pred_A
df["pred_B"] = pred_B
df["pred_C"] = pred_C

print(df[["text", "pred_A", "pred_B", "pred_C"]].head(10))

                                                text pred_A pred_B pred_C
0  @BreitbartNews OK Shannon, YOU tell the vetera...    NOT   NULL   NULL
1  @LeftyGlenn @jaredeker @BookUniverse @hashtagz...    NOT   NULL   NULL
2  Hot Mom Sucks Off Step Son In Shower 8 min htt...    OFF    TIN    IND
3  bro these are some cute butt plugs I’m trying ...    OFF    TIN    IND
4  Arizona Supreme Court strikes down state legis...    NOT   NULL   NULL
5  Arguing gun control is wrong of me whoever has...    NOT   NULL   NULL
6  Doctors’ interest in medical marijuana far out...    NOT   NULL   NULL
7  A must-read and a must-share for all your frie...    NOT   NULL   NULL
8  @Jo2timess Now that’s the dumbest shit I have ...    OFF    UNT   NULL
9  Agreed! When all of this drama was unfolding a...    OFF    TIN    IND


In [26]:
from sklearn.metrics import accuracy_score, f1_score, classification_report

# Convertir labels gold en formats compatibles
gold_A = [1 if label == "OFF" else 0 for label in df["label_A_gold"]]
gold_B = [1 if label == "TIN" else 0 if label == "UNT" else -1 for label in df["label_B_gold"]]
gold_C = ["IND", "GRP", "OTH"]
gold_C = [gold_C.index(label) if label in gold_C else -1 for label in df["label_C_gold"]]

# Eval Task A
print("\n=== Task A ===")
print("Accuracy:", accuracy_score(gold_A, y_pred_A))
print("F1-score:", f1_score(gold_A, y_pred_A, average="macro"))
print(classification_report(gold_A, y_pred_A, target_names=["NOT", "OFF"]))

# Eval Task B
gold_B_eval = [g for i, g in enumerate(gold_B) if off_mask[i] and g != -1]
pred_B_eval = [1 if b == "TIN" else 0 for i, b in enumerate(pred_B) if off_mask[i] and gold_B[i] != -1]

print("\n=== Task B ===")
print("Accuracy:", accuracy_score(gold_B_eval, pred_B_eval))
print("F1-score:", f1_score(gold_B_eval, pred_B_eval, average="macro"))
print(classification_report(gold_B_eval, pred_B_eval, target_names=["UNT", "TIN"]))

# Eval Task C
gold_C_eval = [g for i, g in enumerate(gold_C) if off_mask[i] and pred_B[i] == "TIN" and g != -1]
pred_C_eval = [ ["IND", "GRP", "OTH"].index(c) for i, c in enumerate(pred_C) if off_mask[i] and pred_B[i] == "TIN" and gold_C[i] != -1 ]

print("\n=== Task C ===")
print("Accuracy:", accuracy_score(gold_C_eval, pred_C_eval))
print("F1-score:", f1_score(gold_C_eval, pred_C_eval, average="macro"))
print(classification_report(gold_C_eval, pred_C_eval, target_names=["IND", "GRP", "OTH"]))


=== Task A ===
Accuracy: 0.859375
F1-score: 0.816267942583732
              precision    recall  f1-score   support

         NOT       0.93      0.88      0.91       243
         OFF       0.68      0.78      0.73        77

    accuracy                           0.86       320
   macro avg       0.80      0.83      0.82       320
weighted avg       0.87      0.86      0.86       320


=== Task B ===
Accuracy: 0.6140350877192983
F1-score: 0.5991048593350383
              precision    recall  f1-score   support

         UNT       0.92      0.36      0.52        33
         TIN       0.52      0.96      0.68        24

    accuracy                           0.61        57
   macro avg       0.72      0.66      0.60        57
weighted avg       0.75      0.61      0.59        57


=== Task C ===
Accuracy: 0.7083333333333334
F1-score: 0.391812865497076
              precision    recall  f1-score   support

         IND       0.84      0.84      0.84        19
         GRP       0.50    

In [28]:
# First : Same Roberta-base for all
#-> good for A
=== Task A ===
Accuracy: 0.859375
F1-score: 0.8190932046885011
              precision    recall  f1-score   support

         NOT       0.93      0.88      0.90       243
         OFF       0.67      0.81      0.73        77

    accuracy                           0.86       320
   macro avg       0.80      0.84      0.82       320
weighted avg       0.87      0.86      0.86       320


=== Task B ===
Accuracy: 0.4067796610169492
F1-score: 0.2891566265060241
              precision    recall  f1-score   support

         UNT       0.00      0.00      0.00        35
         TIN       0.41      1.00      0.58        24

    accuracy                           0.41        59
   macro avg       0.20      0.50      0.29        59
weighted avg       0.17      0.41      0.24        59


=== Task C ===
Accuracy: 0.8076923076923077
F1-score: 0.4682539682539682
              precision    recall  f1-score   support

         IND       0.90      0.90      0.90        21
         GRP       0.40      0.67      0.50         3
         OTH       0.00      0.00      0.00         2

    accuracy                           0.81        26
   macro avg       0.43      0.52      0.47        26
weighted avg       0.78      0.81      0.79        26


# Second : Bertweet on A, HateBert on B and C
=== Task A ===
Accuracy: 0.85625
F1-score: 0.8114367698298832
              precision    recall  f1-score   support

         NOT       0.92      0.88      0.90       243
         OFF       0.68      0.77      0.72        77

    accuracy                           0.86       320
   macro avg       0.80      0.83      0.81       320
weighted avg       0.86      0.86      0.86       320


=== Task B ===
Accuracy: 0.5357142857142857
F1-score: 0.5133689839572193
              precision    recall  f1-score   support

         UNT       1.00      0.26      0.41        35
         TIN       0.45      1.00      0.62        21

    accuracy                           0.54        56
   macro avg       0.72      0.63      0.51        56
weighted avg       0.79      0.54      0.49        56


=== Task C ===
Accuracy: 0.8571428571428571
F1-score: 0.47297297297297297
              precision    recall  f1-score   support

         IND       0.85      1.00      0.92        17
         GRP       1.00      0.33      0.50         3
         OTH       0.00      0.00      0.00         1

    accuracy                           0.86        21
   macro avg       0.62      0.44      0.47        21
weighted avg       0.83      0.86      0.82        21


# Third : Bertweet on A, HateBert_finetune on B and C
=== Task A ===
Accuracy: 0.85625
F1-score: 0.8114367698298832
              precision    recall  f1-score   support

         NOT       0.92      0.88      0.90       243
         OFF       0.68      0.77      0.72        77

    accuracy                           0.86       320
   macro avg       0.80      0.83      0.81       320
weighted avg       0.86      0.86      0.86       320


=== Task B ===
Accuracy: 0.625
F1-score: 0.6190476190476191
              precision    recall  f1-score   support

         UNT       1.00      0.40      0.57        35
         TIN       0.50      1.00      0.67        21

    accuracy                           0.62        56
   macro avg       0.75      0.70      0.62        56
weighted avg       0.81      0.62      0.61        56


=== Task C ===
Accuracy: 0.8095238095238095
F1-score: 0.4343434343434343
              precision    recall  f1-score   support

         IND       1.00      0.94      0.97        17
         GRP       0.33      0.33      0.33         3
         OTH       0.00      0.00      0.00         1

    accuracy                           0.81        21
   macro avg       0.44      0.42      0.43        21
weighted avg       0.86      0.81      0.83        21

# Fourth : Bertweet on A, HateBert_finetune with weighted loss on B and C
# -> good for B
=== Task A ===
Accuracy: 0.85625
F1-score: 0.8114367698298832
              precision    recall  f1-score   support

         NOT       0.92      0.88      0.90       243
         OFF       0.68      0.77      0.72        77

    accuracy                           0.86       320
   macro avg       0.80      0.83      0.81       320
weighted avg       0.86      0.86      0.86       320


=== Task B ===
Accuracy: 0.7678571428571429
F1-score: 0.7677830940988836
              precision    recall  f1-score   support

         UNT       1.00      0.63      0.77        35
         TIN       0.62      1.00      0.76        21

    accuracy                           0.77        56
   macro avg       0.81      0.81      0.77        56
weighted avg       0.86      0.77      0.77        56


=== Task C ===
Accuracy: 0.6666666666666666
F1-score: 0.2828282828282828
              precision    recall  f1-score   support

         IND       0.88      0.82      0.85        17
         GRP       0.00      0.00      0.00         3
         OTH       0.00      0.00      0.00         1

    accuracy                           0.67        21
   macro avg       0.29      0.27      0.28        21
weighted avg       0.71      0.67      0.69        21

# Fifth : C on hateBert without some advances : 
=== Task C ===
Accuracy: 0.6666666666666666
F1-score: 0.4091954022988506
              precision    recall  f1-score   support

         IND       1.00      0.71      0.83        17
         GRP       0.29      0.67      0.40         3
         OTH       0.00      0.00      0.00         1

    accuracy                           0.67        21
   macro avg       0.43      0.46      0.41        21
weighted avg       0.85      0.67      0.73        21

#Sixth : C with bert base uncased : 
=== Task C ===
Accuracy: 0.6190476190476191
F1-score: 0.35000000000000003
              precision    recall  f1-score   support

         IND       0.92      0.71      0.80        17
         GRP       0.20      0.33      0.25         3
         OTH       0.00      0.00      0.00         1

    accuracy                           0.62        21
   macro avg       0.37      0.35      0.35        21
weighted avg       0.78      0.62      0.68        21

IndentationError: unindent does not match any outer indentation level (<string>, line 8)