In [None]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
import random
from datasets import Dataset
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score, multilabel_confusion_matrix

In [37]:
# Cache for the NLTK stop-word set so the corpus file is read once, not once per row.
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return NLTK's English stop words as a frozenset, loading them on first use only."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def clean_text(text: str, stop_words=None) -> str:
    """Normalise a raw comment before tokenisation.

    Steps, in order: strip HTML tags, strip URLs, delete ASCII punctuation,
    lower-case, split on any whitespace and drop stop words.

    Args:
        text: Raw comment text. Non-string values (e.g. NaN coming from
            pandas) are treated as empty and yield ''.
        stop_words: Optional collection of lower-case words to drop. When
            omitted, NLTK's English stop-word list is used (loaded once and
            cached).

    Returns:
        The surviving words joined by single spaces.
    """
    if not isinstance(text, str):
        return ''

    text = re.sub(r'<[^>]+>', '', text)  # remove html tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # remove urls
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove punctuation
    text = text.lower()  # convert to lower case

    if stop_words is None:
        stop_words = _english_stop_words()

    # split() with no argument handles '\n', '\r', tabs and repeated spaces in
    # one go and never yields empty tokens, so the result has no stray spaces.
    return ' '.join(word for word in text.split() if word not in stop_words)

In [None]:
# Path of the training CSV, relative to the notebook's working directory.
data_path = "train.csv"

# Read the whole CSV into a DataFrame and preview its first rows.
df_data = pd.read_csv(data_path)
df_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [39]:
# Overwrite the raw comments with their cleaned form (clean_text is applied row by row).
# NOTE(review): this mutates df_data in place, so the raw text is no longer available afterwards.
df_data['comment_text'] = df_data['comment_text'].apply(clean_text)

In [40]:
# Wrap the cleaned DataFrame in a `Dataset`. `Dataset` is imported twice at the top of the
# notebook (torch.utils.data, then datasets); the later `datasets` import is the one in effect.
dataset = Dataset.from_pandas(df_data)
# Display the first record as a sanity check.
dataset[0]

{'id': '0000997932d777bf',
 'comment_text': 'explanation edits made username hardcore metallica fan reverted werent vandalisms closure gas voted new york dolls fac please dont remove template talk page since im retired now892053827',
 'toxic': 0,
 'severe_toxic': 0,
 'obscene': 0,
 'threat': 0,
 'insult': 0,
 'identity_hate': 0}

In [41]:
# Every column from position 2 onwards is treated as a label column.
label_columns = list(df_data.columns[2:])

# Index -> label name, and the inverse mapping built from it.
id2label = dict(enumerate(label_columns))
label2id = {name: position for position, name in id2label.items()}

label2id, id2label

({'toxic': 0,
  'severe_toxic': 1,
  'obscene': 2,
  'threat': 3,
  'insult': 4,
  'identity_hate': 5},
 {0: 'toxic',
  1: 'severe_toxic',
  2: 'obscene',
  3: 'threat',
  4: 'insult',
  5: 'identity_hate'})

In [42]:
# Tokenizer loaded for the "bert-base-uncased" checkpoint name.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
# Label column names; their order fixes the column order of the label matrix built in preprocess_data.
label_keys = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def preprocess_data(examples):
    """Tokenize a batch of comments and attach a float label matrix.

    Args:
        examples: A batch mapping that holds 'comment_text' plus one entry
            per name in `label_keys`.

    Returns:
        The tokenizer output with an extra "labels" entry: one list per
        comment, holding one float per label in `label_keys` order.
    """
    comments = examples['comment_text']
    # Pad/truncate every comment to exactly 128 tokens.
    encoding = tokenizer(comments, padding="max_length", truncation=True, max_length=128)

    # np.zeros defaults to float64, so the labels end up as floats.
    labels_matrix = np.zeros((len(comments), len(label_keys)))
    for column, key in enumerate(label_keys):
        labels_matrix[:, column] = examples[key]

    encoding["labels"] = labels_matrix.tolist()
    return encoding

In [None]:
# Only the model inputs and the label matrix should survive the mapping step.
columns_to_keep = ['input_ids', 'attention_mask', 'labels']
columns_to_drop = [name for name in dataset.column_names if name not in columns_to_keep]

# Tokenize in batches and drop every original column that is not a model input.
encoded_dataset = dataset.map(preprocess_data, batched=True, remove_columns=columns_to_drop)

In [44]:
# Hold out 20% of the rows for evaluation. The seed is fixed so that the split
# (and therefore the reported metrics) is reproducible across kernel restarts.
encoded_dataset = encoded_dataset.train_test_split(test_size=0.2, seed=42)

In [None]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model configured for multi-label output,
# with one output per entry in label_keys.
pretrained_name = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_name,
    problem_type="multi_label_classification",
    num_labels=len(label_keys),
    id2label=id2label,
    label2id=label2id,
)

In [None]:
# Use the GPU when one is available; fall back to CPU instead of crashing on a machine without CUDA.
model.to(torch.device("cuda" if torch.cuda.is_available() else "cpu"))

In [None]:
# Per-device batch size used for both training and evaluation.
batch_size = 16
# Key (as returned by compute_metrics) used to pick the best checkpoint.
metric_name = "f1"

from transformers import TrainingArguments, Trainer

# Evaluate and checkpoint once per epoch so that the best epoch (by metric_name)
# can be reloaded when training finishes.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
args = TrainingArguments(
    output_dir="final",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=2,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    #push_to_hub=True,
)

In [48]:
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
from transformers import EvalPrediction
import torch
    
# source: https://jesusleal.io/2021/04/21/Longformer-multilabel-classification/
def multi_label_metrics(predictions, labels, threshold=0.5):
    """Compute micro-averaged F1, micro-averaged ROC AUC and accuracy for multi-label logits.

    Args:
        predictions: Raw model logits of shape (batch_size, num_labels).
        labels: Binary indicator matrix with the same shape as `predictions`.
        threshold: Probability at or above which a label counts as predicted.

    Returns:
        dict with the keys 'f1', 'roc_auc' and 'accuracy'.
    """
    # Sigmoid per label: each label is an independent yes/no decision.
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.Tensor(predictions)).numpy()
    # Hard 0/1 decisions, used only by the threshold-based metrics (F1, accuracy).
    y_pred = (probs >= threshold).astype(float)

    y_true = labels
    f1_micro_average = f1_score(y_true=y_true, y_pred=y_pred, average='micro')
    # ROC AUC is a ranking metric, so it must be scored on the probabilities;
    # feeding it thresholded 0/1 values throws away the ranking it measures.
    roc_auc = roc_auc_score(y_true, probs, average='micro')
    accuracy = accuracy_score(y_true, y_pred)

    return {'f1': f1_micro_average,
            'roc_auc': roc_auc,
            'accuracy': accuracy}

def compute_metrics(p: EvalPrediction):
    """Turn an EvalPrediction into the metric dict produced by multi_label_metrics.

    When `p.predictions` is a tuple, only its first element is used as the
    predictions; otherwise `p.predictions` is used as-is.
    """
    logits = p.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    return multi_label_metrics(predictions=logits, labels=p.label_ids)

In [None]:
import wandb
from kaggle_secrets import UserSecretsClient

# Read the Weights & Biases key from Kaggle's secret store rather than hard-coding it.
secrets_client = UserSecretsClient()
wandb_api_key = secrets_client.get_secret("WANDB_API_KEY")

if not wandb_api_key:
    print("⚠️ WANDB API Key not found! Set it in Kaggle secrets.")
else:
    wandb.login(key=wandb_api_key)
    print("✅ Logged into Weights & Biases!")

In [None]:
# Wire the model, training arguments, both dataset splits and the metric callback together.
train_split = encoded_dataset["train"]
eval_split = encoded_dataset["test"]

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_split,
    eval_dataset=eval_split,
    compute_metrics=compute_metrics,
)

In [None]:
# Run training with the trainer configured in the previous cell.
trainer.train()

In [None]:
# Save the trainer's current model under 'TRAINED_MODEL'.
# NOTE(review): args sets load_best_model_at_end=True, so this is presumably the best checkpoint by f1 — confirm.
trainer.save_model('TRAINED_MODEL')

In [None]:
# Evaluate on the eval_dataset given to the Trainer and display the resulting metrics.
trainer.evaluate()