In [None]:
# Done in Google Colab

import pandas as pd
from datasets import Dataset, ClassLabel
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, EvalPrediction
import matplotlib.pyplot as plt
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils import resample
import torch
import torch.nn.functional as F
from torch.nn import CrossEntropyLoss
from collections import Counter

df = pd.read_csv(path+'train.csv')

# Keep only the comment text and its "toxic" label. The explicit copy makes
# the column assignment below act on an independent frame, not on a slice.
df = df[["comment_text", "toxic"]].copy()

# Normalise first (trim surrounding whitespace, lowercase) so that comments
# differing only in case or padding are recognised as duplicates below.
df["comment_text"] = df["comment_text"].str.strip().str.lower()

# Drop empty comments, then de-duplicate on the normalised text so the same
# comment cannot end up on both sides of the later train/validation split.
df = df[df["comment_text"].str.len() > 0]
# Resetting to a fresh 0..n-1 index keeps the old, gappy row index from being
# carried into the Dataset as an extra column.
df = df.drop_duplicates(["comment_text"]).reset_index(drop=True)

ds = Dataset.from_pandas(df)
ds = ds.rename_column("toxic", "labels")
print(ds)

model_name = "google-bert/bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_func(batch):
    """Tokenize a batch of comments, truncating each to at most 256 tokens.

    No padding is applied here. The file builds a ``DataCollatorWithPadding``
    for the trainer, which pads each batch to the length of its longest
    member; padding every example to 256 tokens up front would make that
    collator a no-op and spend compute on padding tokens.

    Args:
        batch: Mapping with a ``"comment_text"`` entry holding the texts of
            the batch.

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        batch["comment_text"],
        truncation=True,
        max_length=256,
    )

# Tokenize the whole dataset, feeding tokenize_func batches of rows.
ds = ds.map(tokenize_func, batched=True)

# Hold out 10% of the rows for validation. The fixed seed makes the split
# reproducible, so metrics from different runs refer to the same examples.
ds_split = ds.train_test_split(test_size=0.1, seed=42)
train_ds = ds_split['train']
val_ds = ds_split['test']

# Expose only the model inputs and the labels, formatted as torch tensors.
for _split_ds in (train_ds, val_ds):
    _split_ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

In [None]:
""""
Class weights for the loss (to compensate for class imbalance)
"""
# Tally how many training examples carry each label value.
train_labels = np.array(train_ds['labels'])
counter = Counter(train_labels)
num_non_toxic, num_toxic = counter[0], counter[1]

# Label 0 keeps unit weight; label 1 is weighted by the ratio of label-0 to
# label-1 counts, so the two classes contribute equally in aggregate.
weight_0, weight_1 = 1.0, num_non_toxic / num_toxic
class_weights = torch.tensor([weight_0, weight_1], dtype=torch.float)

class CustomTrainer(Trainer):
    """Trainer whose loss is cross-entropy weighted per class.

    Each class's contribution to the loss is scaled by its entry in the
    module-level ``class_weights`` tensor.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the class-weighted cross-entropy loss for one batch.

        This overrides ``Trainer.compute_loss``, which is the hook the
        Trainer calls. The method was previously named ``compute_loss_func``
        and had no ``self`` parameter; the Trainer never invokes a method of
        that name, so the weighted loss was silently not applied.

        Args:
            model: The model being trained; called with the batch inputs.
            inputs: Batch mapping that must contain ``"labels"``. The labels
                entry is removed from the mapping before the model is called.
            return_outputs: When true, also return the model outputs.
            **kwargs: Any additional keyword arguments the Trainer passes
                (for example ``num_items_in_batch`` in recent releases);
                accepted and ignored.

        Returns:
            The scalar loss, or a ``(loss, outputs)`` tuple when
            ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Move the weights to wherever the logits live (CPU or accelerator).
        loss_fct = CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits, labels)

        return (loss, outputs) if return_outputs else loss



# Load the checkpoint named by model_name, configured for two output labels.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Training configuration. Evaluation and checkpointing share the same
# per-epoch schedule, and the "f1" metric selects the best checkpoint.
training_args = TrainingArguments(
    output_dir="./results",
    report_to=[],  # empty list of reporting integrations
    # --- optimisation schedule ---
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # --- logging ---
    logging_strategy="steps",
    logging_steps=100,
    # --- evaluation and checkpointing ---
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Minimum class-1 probability at which an example is predicted as class 1.
threshold = 0.5


def compute_metrics(pred: EvalPrediction):
    """Score predictions with accuracy, F1, precision and recall.

    The logits are turned into probabilities with a softmax over the class
    axis; an example is predicted as class 1 when its class-1 probability
    reaches the module-level ``threshold``.

    Args:
        pred: Evaluation output providing ``predictions`` (logits) and
            ``label_ids`` (reference labels).

    Returns:
        A dict with the keys ``"accuracy"``, ``"f1"``, ``"precision"`` and
        ``"recall"``.
    """
    class1_prob = F.softmax(torch.tensor(pred.predictions), dim=1)[:, 1].numpy()
    y_hat = (class1_prob >= threshold).astype(int)
    y_true = pred.label_ids

    scorers = {
        "accuracy": accuracy_score,
        "f1": f1_score,
        "precision": precision_score,
        "recall": recall_score,
    }
    return {name: scorer(y_true, y_hat) for name, scorer in scorers.items()}

# Collator that pads the examples of a batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Wire the model, datasets, collator and metric function into the trainer.
# NOTE(review): recent transformers releases deprecate `tokenizer=` in favour
# of `processing_class=` — confirm against the installed version.
trainer = CustomTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
)

# Run fine-tuning with the configuration above.
trainer.train()