In [None]:
# Done in Google Colab

from sklearn.metrics import confusion_matrix
import pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TrainingArguments, Trainer
from datasets import Dataset


# --- Build the evaluation set ------------------------------------------------
# NOTE(review): `path` is not defined in this cell; it must already exist in
# the kernel (e.g. a mounted-drive prefix) before this cell is run.
test_df = pd.read_csv(path + 'datasets/test.csv')
labels_df = pd.read_csv(path + 'datasets/test_labels.csv')

# Non-mutating method chain (no `inplace=True`, no assignment on a filtered
# frame), so re-running the cell always starts from the freshly read CSVs.
df = (
    pd.merge(test_df, labels_df, on='id')
    # Keep only the text and the binary "toxic" target.
    .loc[:, ["comment_text", "toxic"]]
    # One row per distinct comment (first occurrence wins).
    # NOTE(review): this runs before the -1 filter, so a labelled row could be
    # dropped in favour of an earlier row labelled -1 -- confirm this is intended.
    .drop_duplicates(["comment_text"])
    # Discard rows whose "toxic" label is -1.
    .loc[lambda frame: frame["toxic"] != -1]
    # Normalise the text: trim surrounding whitespace, then lowercase.
    .assign(comment_text=lambda frame: frame["comment_text"].str.strip().str.lower())
)

# Remove comments that are empty after stripping (missing text is dropped too,
# because NaN > 0 is False), then rebuild a clean 0..n-1 index.
df = df[df["comment_text"].str.len() > 0].reset_index(drop=True)

# preserve_index=False keeps the pandas index out of the Dataset; without it the
# filtered index is exported as a spurious "__index_level_0__" column.
val_ds = Dataset.from_pandas(df, preserve_index=False)
val_ds = val_ds.rename_column("toxic", "labels")
print(val_ds)

# Tokenizer loaded from the "bert_toxic" directory under `path`.
tokenizer = AutoTokenizer.from_pretrained(path + "/bert_toxic/")


def tokenize_func(batch):
    """Tokenize the ``comment_text`` entries of a batch.

    Each sequence is truncated and padded to exactly 256 tokens, so every
    encoded example has the same length.
    """
    texts = batch["comment_text"]
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(texts, **encode_options)


# Tokenize in batches, then expose only the model inputs and the labels as
# torch tensors (other columns stay in the dataset but are not returned).
val_ds = val_ds.map(function=tokenize_func, batched=True)
val_ds.set_format(
    "torch",
    columns=["input_ids", "attention_mask", "labels"],
)


# Load the classifier from the same "bert_toxic" directory as the tokenizer.
model = AutoModelForSequenceClassification.from_pretrained(path + "/bert_toxic/")

# Trainer is used here purely as a batched inference harness: predict() hands
# back the raw model outputs, so the decision threshold can be chosen
# afterwards instead of being fixed to an argmax.
training_args = TrainingArguments(
    output_dir=path+"/bert_toxic/",
    per_device_eval_batch_size=32,
    do_train=False,   # evaluation only
    report_to=[],     # no experiment-tracking integrations
)

# `processing_class` replaces the deprecated `tokenizer` keyword of Trainer
# (the old keyword emits a FutureWarning and is slated for removal).
trainer = Trainer(
    model=model,
    args=training_args,
    processing_class=tokenizer,
)


# Run inference over the whole evaluation set.
predictions = trainer.predict(val_ds)
logits = predictions.predictions     # model outputs, one row per example
labels_true = predictions.label_ids  # ground-truth labels, aligned with `logits`


Dataset({
    features: ['comment_text', 'labels', '__index_level_0__'],
    num_rows: 63978
})


Map:   0%|          | 0/63978 [00:00<?, ? examples/s]

  trainer = Trainer(


In [None]:
import torch
import torch.nn.functional as F

# Probability of the "toxic" class: softmax over the class axis, column 1.
probs = torch.tensor(logits).softmax(dim=1)[:, 1].numpy()

# Ground-truth class balance (toxic vs. non-toxic).
nb_tox_real = int((labels_true == 1).sum())
nb_ntox_real = int((labels_true == 0).sum())
print(nb_tox_real, nb_ntox_real)

# Predicted class balance at the chosen decision threshold.
threshold = 0.5
nb_tox_pred = int((probs >= threshold).sum())
nb_ntox_pred = int((probs < threshold).sum())
print(nb_tox_pred, nb_ntox_pred)

6090 57888
10315 53663


In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Score the thresholded predictions against the ground-truth labels.
# Relies on `logits`, `labels_true` and `threshold` from the earlier cells.
labels = labels_true
probs = torch.tensor(logits).softmax(dim=1)[:, 1].numpy()
y_pred = (probs >= threshold).astype(int)  # 1 = predicted toxic

# Each metric is called as metric(y_true, y_pred), in the order listed.
acc, f1, precision, recall = (
    metric(labels, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

# Author's run tag for the numbers below: "training 2 epoch 3".
results = dict(accuracy=acc, f1=f1, precision=precision, recall=recall)
print(results)

{'accuracy': 0.9153302697802369, 'f1': 0.6697957939652545, 'precision': 0.5326223945710131, 'recall': 0.9021346469622332}
