In [2]:
!pip install transformers datasets scikit-learn



In [3]:
import pandas as pd

# Input CSVs are read from the current working directory, in this order:
# training data, test data, submission template.
train_df, test_df, sub_df = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "test.csv", "sample_submission.csv")
)

# Names of the six target columns; used to select targets from train_df and
# to fill the prediction columns of sub_df.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

In [4]:
from transformers import BertTokenizer

# Shared tokenizer for the 'bert-base-uncased' checkpoint; reused by every
# dataset built in this notebook.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def toxenize_batch(texts):
  """Tokenize `texts` with the shared tokenizer and return PyTorch tensors.

  Inputs are padded (padding=True) and truncated to at most 128 tokens.

  NOTE(review): the name looks like a typo of "tokenize_batch"; it is left
  unchanged so that any existing caller keeps working.
  """
  encode_kwargs = {
      "padding": True,
      "truncation": True,
      "max_length": 128,
      "return_tensors": "pt",
  }
  return tokenizer(texts, **encode_kwargs)

In [5]:
import torch
from torch.utils.data import Dataset

class ToxicData(Dataset):
  """Torch dataset over pre-tokenized comments with optional label rows.

  All texts are tokenized once in the constructor; each item is a dict of
  tensors, plus a float "labels" tensor when labels were supplied.
  """

  def __init__(self, texts, labels=None):
    """Tokenize `texts` up front and remember `labels` (None for unlabelled data)."""
    self.encodings = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=128,
    )
    self.labels = labels

  def __getitem__(self, idx):
    """Return the tensors for sample `idx`, with "labels" added if available."""
    sample = {}
    for field, values in self.encodings.items():
      sample[field] = torch.tensor(values[idx])

    if self.labels is None:
      return sample

    # Labels are cast to float for the multi-label setup used in this notebook.
    sample["labels"] = torch.tensor(self.labels[idx], dtype=torch.float)
    return sample

  def __len__(self):
    """Number of samples, taken from the tokenized input ids."""
    return len(self.encodings["input_ids"])


In [6]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation. The fixed random_state
# makes the split identical on every Restart & Run All, so validation losses
# from different runs are measured on the same rows.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df['comment_text'].tolist(),
    train_df[labels].values,
    test_size=0.1,
    random_state=42,
)

# ToxicData tokenizes its texts eagerly in the constructor.
train_dataset = ToxicData(train_texts, train_labels)
val_dataset = ToxicData(val_texts, val_labels)

In [7]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

# BERT encoder with a freshly initialised 6-way classification head.
# problem_type='multi_label_classification' selects a per-label sigmoid/BCE
# loss, so each label is predicted independently.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=6,
    problem_type='multi_label_classification'
)

# Evaluate and checkpoint once per epoch. Validation loss may start rising
# while training loss keeps falling, so reload the checkpoint with the lowest
# eval loss when training ends instead of keeping the last-epoch weights.
# (load_best_model_at_end requires eval_strategy and save_strategy to match.)
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    eval_strategy="epoch",
    logging_dir='./logs',
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    report_to="none"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

trainer.train()

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.0421,0.038404
2,0.0332,0.038498
3,0.0219,0.041791


TrainOutput(global_step=26928, training_loss=0.035165377868758875, metrics={'train_runtime': 4911.0253, 'train_samples_per_second': 87.729, 'train_steps_per_second': 5.483, 'total_flos': 2.834064379532851e+16, 'train_loss': 0.035165377868758875, 'epoch': 3.0})

In [8]:
# Unlabelled dataset: ToxicData omits the "labels" key when no labels are given.
test_texts = test_df['comment_text'].tolist()
test_dataset = ToxicData(test_texts)

preds = trainer.predict(test_dataset)

# Multi-label setup: apply an element-wise sigmoid to the raw predictions so
# each label gets its own independent probability.
test_logits = torch.tensor(preds.predictions)
probs = torch.sigmoid(test_logits).numpy()

In [9]:
# Overwrite the label columns of the submission template with the predicted
# probabilities (one column of probs per entry in labels).
# NOTE(review): assumes the rows of sub_df are in the same order as test_df;
# nothing here checks that alignment.
sub_df[labels] = probs
# Write without the DataFrame index so only the template's columns are saved.
sub_df.to_csv("submission.csv", index=False)

In [10]:
from google.colab import files
# Hand the generated submission file to google.colab's download helper.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell would fail elsewhere; confirm before reusing.
files.download("submission.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>