In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("./modele_team1")

from preprocessing import *
from features import *
from embeddings import *
from models import *
from ensemble import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

E0000 00:00:1744188848.051332     988 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744188848.055625     988 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744188848.068238     988 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744188848.068253     988 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744188848.068255     988 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744188848.068257     988 computation_placer.cc:177] computation placer already registered. Please check linka

In [2]:
df = pd.read_csv("../datasets/training-v1/offenseval-training-v1.tsv", sep='\t')
df.columns = ["id", "text", "label_A", "label_B", "label_C"]

texts = df["text"].values
labels = df["label_A"].map({"NOT": 0, "OFF": 1}).values


In [3]:
# 3. Préparation des données
norm_texts = [normalize_text(t) for t in texts]


In [4]:
X_feat = extract_features(norm_texts)


In [5]:
# === 4. Embeddings fastText (limités à 50k mots pour la RAM) ===
print("[INFO] Chargement des embeddings fastText...")
embeddings = load_fasttext("../embeddings/crawl-300d-1M.vec", max_words=50000)
X_embed = np.vstack([sentence_to_embedding(t, embeddings) for t in norm_texts])


[INFO] Chargement des embeddings fastText...


In [6]:
# === 5. Fusion features + embeddings ===
X = np.hstack([X_embed, X_feat])


In [7]:
#  === 6. Split train/test ===
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


In [12]:
# === 7. Modèle Random Forest ===
rf = RFModel()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("\n=== Rapport Random Forest ===")
print(classification_report(y_test, y_pred_rf, digits=3))



=== Rapport Random Forest ===
              precision    recall  f1-score   support

           0      0.724     0.967     0.828      2639
           1      0.804     0.272     0.406      1333

    accuracy                          0.733      3972
   macro avg      0.764     0.619     0.617      3972
weighted avg      0.751     0.733     0.686      3972



In [13]:
# === 8. Modèle SVM ===
svm = SVMModel()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("\n=== Rapport SVM ===")
print(classification_report(y_test, y_pred_svm, digits=3))



=== Rapport SVM ===
              precision    recall  f1-score   support

           0      0.717     0.970     0.824      2639
           1      0.801     0.242     0.372      1333

    accuracy                          0.726      3972
   macro avg      0.759     0.606     0.598      3972
weighted avg      0.745     0.726     0.673      3972



In [14]:
# === 9. Ensemble Voting (RF + SVM) ===
ensemble = SoftVotingEnsemble([
    ('rf', rf.model),
    ('svm', svm.model)
])
ensemble.fit(X_train, y_train)
y_pred_ens = ensemble.predict(X_test)
print("\n=== Rapport Ensemble (RF + SVM) ===")
print(classification_report(y_test, y_pred_ens, digits=3))



=== Rapport Ensemble (RF + SVM) ===
              precision    recall  f1-score   support

           0      0.734     0.957     0.831      2639
           1      0.786     0.314     0.449      1333

    accuracy                          0.741      3972
   macro avg      0.760     0.636     0.640      3972
weighted avg      0.752     0.741     0.703      3972



In [8]:
# Adapter `num_labels` à la task :
# Task A → 2 classes (NOT/OFF)
# Task B → 2 classes (UNT/TIN)
# Task C → 3 classes (IND/GRP/OTH)

roberta = TransformerModel(model_name="roberta-large", num_labels=3)  # adapte le 3 si A/B

texts_sample = texts[:100]
labels_sample = labels[:100]
y_pred_roberta = roberta.predict(texts_sample)
print("\n=== Rapport RoBERTa ===")
print(classification_report(labels_sample, y_pred_roberta, digits=3))

#y_pred_roberta = roberta.predict(texts)

#print("\n=== Rapport RoBERTa ===")
#print(classification_report(labels, y_pred_roberta, digits=3))

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Rapport RoBERTa ===
              precision    recall  f1-score   support

           0      0.000     0.000     0.000        68
           1      0.316     0.938     0.472        32
           2      0.000     0.000     0.000         0

    accuracy                          0.300       100
   macro avg      0.105     0.312     0.157       100
weighted avg      0.101     0.300     0.151       100



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
