In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.append("./modele_team1")

from preprocessing import *
from features import *
from embeddings import *
from models import *
from ensemble import *
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

E0000 00:00:1744123751.750932    5199 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1744123751.755205    5199 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1744123751.767189    5199 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744123751.767199    5199 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744123751.767201    5199 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1744123751.767202    5199 computation_placer.cc:177] computation placer already registered. Please check linka

In [2]:
df = pd.read_csv("../datasets/training-v1/offenseval-training-v1.tsv", sep='\t')
df.columns = ["id", "text", "label_A", "label_B", "label_C"]

# Subtask C : seulement quand label_A == 'OFF'
df_c = df[(df["label_A"] == "OFF") & (df["label_B"] == "TIN")]
texts = df_c["text"].values
labels = df_c["label_C"].map({"IND": 0, "GRP": 1, "OTH": 2}).values


In [3]:
# 3. Préparation des données
norm_texts = [normalize_text(t) for t in texts]


In [4]:
X_feat = extract_features(norm_texts)


In [5]:
# === 4. Embeddings fastText (limités à 50k mots pour la RAM) ===
print("[INFO] Chargement des embeddings fastText...")
embeddings = load_fasttext("../embeddings/crawl-300d-1M.vec", max_words=50000)
X_embed = np.vstack([sentence_to_embedding(t, embeddings) for t in norm_texts])


[INFO] Chargement des embeddings fastText...


In [6]:
# === 5. Fusion features + embeddings ===
X = np.hstack([X_embed, X_feat])


In [7]:
#  === 6. Split train/test ===
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.3, random_state=42)


In [8]:
# === 7. Modèle Random Forest ===
rf = RFModel()
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)
print("\n=== Rapport Random Forest ===")
print(classification_report(y_test, y_pred_rf, digits=3))



=== Rapport Random Forest ===
              precision    recall  f1-score   support

           0      0.744     0.915     0.821       743
           1      0.588     0.477     0.527       302
           2      0.250     0.008     0.016       118

    accuracy                          0.709      1163
   macro avg      0.527     0.467     0.455      1163
weighted avg      0.653     0.709     0.663      1163



In [9]:
# === 8. Modèle SVM ===
svm = SVMModel()
svm.fit(X_train, y_train)
y_pred_svm = svm.predict(X_test)
print("\n=== Rapport SVM ===")
print(classification_report(y_test, y_pred_svm, digits=3))



=== Rapport SVM ===
              precision    recall  f1-score   support

           0      0.765     0.903     0.828       743
           1      0.573     0.543     0.558       302
           2      0.000     0.000     0.000       118

    accuracy                          0.718      1163
   macro avg      0.446     0.482     0.462      1163
weighted avg      0.638     0.718     0.674      1163



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [10]:
# === 9. Ensemble Voting (RF + SVM) ===
ensemble = SoftVotingEnsemble([
    ('rf', rf.model),
    ('svm', svm.model)
])
ensemble.fit(X_train, y_train)
y_pred_ens = ensemble.predict(X_test)
print("\n=== Rapport Ensemble (RF + SVM) ===")
print(classification_report(y_test, y_pred_ens, digits=3))



=== Rapport Ensemble (RF + SVM) ===
              precision    recall  f1-score   support

           0      0.771     0.904     0.832       743
           1      0.579     0.556     0.568       302
           2      0.000     0.000     0.000       118

    accuracy                          0.722      1163
   macro avg      0.450     0.487     0.467      1163
weighted avg      0.643     0.722     0.679      1163



In [None]:
# Subtask C : uniquement OFF et TIN
df_c = df[(df["label_A"] == "OFF") & (df["label_B"] == "TIN")]
texts = df_c["text"].tolist()
labels = df_c["label_C"].map({"IND": 0, "GRP": 1, "OTH": 2}).tolist()

bert = BERTModel(model_name="distilbert-base-uncased", num_labels=3)
y_pred_bert = bert.predict(texts)

print("\n=== Rapport BERT (Subtask C) ===")
print(classification_report(labels, y_pred_bert, digits=3))