In [2]:
# https://huggingface.co/docs/transformers/en/model_doc/bert#resources
# https://colab.research.google.com/github/abhimishra91/transformers-tutorials/blob/master/transformers_multi_label_classification.ipynb

In [3]:
# Report the GPU model, driver/CUDA versions and current GPU memory usage of this host.
!nvidia-smi

Sat Nov  9 12:38:23 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.127.05             Driver Version: 550.127.05     CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA L4                      On  |   00000000:31:00.0 Off |                    0 |
| N/A   46C    P8             13W /   72W |       1MiB /  23034MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
# Report the CPU architecture, core count and cache sizes of this host.
!lscpu

Architecture:        x86_64
CPU op-mode(s):      32-bit, 64-bit
Byte Order:          Little Endian
CPU(s):              8
On-line CPU(s) list: 0-7
Thread(s) per core:  2
Core(s) per socket:  4
Socket(s):           1
NUMA node(s):        1
Vendor ID:           AuthenticAMD
CPU family:          25
Model:               1
Model name:          AMD EPYC 7R13 Processor
Stepping:            1
CPU MHz:             3068.099
BogoMIPS:            5300.00
Hypervisor vendor:   KVM
Virtualization type: full
L1d cache:           32K
L1i cache:           32K
L2 cache:            512K
L3 cache:            16384K
NUMA node0 CPU(s):   0-7
Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy

In [5]:
# Installing the transformers library and additional libraries if looking process

!pip install -q transformers

# Code for TPU packages install
# !curl -q https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
# !python pytorch-xla-env-setup.py --apt-packages libomp5 libopenblas-dev

In [43]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
from sklearn.metrics import classification_report

In [7]:
# Select the compute device: use the GPU when CUDA is available, otherwise fall back to the CPU.

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
device

'cuda'

In [8]:
# download the Jigsaw Toxic dataset from https://www.kaggle.com/competitions/jigsaw-toxic-comment-classification-challenge/data

In [9]:
# Show the working directory and its contents to check that the dataset archives are present.
!pwd
!ls

/home/ec2-user/SageMaker/BERT
fine_tuning_bert.ipynb				   test.csv
fine-tunned-bert-model-dict.pth			   test.csv.zip
fine-tunned-bert-model.pth			   test_labels.csv
jigsaw-toxic-comment-classification-challenge.zip  test_labels.csv.zip
sample_submission.csv				   train.csv
sample_submission.csv.zip			   train.csv.zip


In [7]:
# Extract the competition archive first, then each CSV archive found in the working directory
# (-q: quiet, -o: overwrite existing files without prompting, so the cell can be re-run).
!unzip -q -o jigsaw-toxic-comment-classification-challenge.zip
!unzip -q -o train.csv.zip
!unzip -q -o test.csv.zip
!unzip -q -o sample_submission.csv.zip
!unzip -q -o test_labels.csv.zip

In [10]:
# Load the labelled training data and preview a few rows flagged as identity hate.
df = pd.read_csv("./train.csv")
df.loc[df['identity_hate'].eq(1)].head(3)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,1,0,1,0,1,1
105,00472b8e2d38d1ea,A pair of jew-hating weiner nazi schmucks.,1,0,1,0,1,1
176,006b94add72ed61c,I think that your a Fagget get a oife and burn...,1,0,1,1,1,1


In [11]:
# Preview rows that carry both the identity-hate and the severe-toxic labels.
df.loc[df['identity_hate'].eq(1) & df['severe_toxic'].eq(1)].head(10)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
887,026bd33490542b2e,"you gay motherfucker i know where you live,i a...",1,1,1,0,1,1
1017,02c6e41e4b317ac3,WOULDN'T BE THE FIRST TIME BITCH. FUCK YOU I'L...,1,1,1,1,1,1
1116,03088aacc4600323,"Fuck you faggot, I don't care if i get blocked...",1,1,1,0,1,1
1312,039296aa294ee10b,"SHUT UP, YOU FAT POOP, OR I WILL KICK YOUR ASS!!!",1,1,1,1,1,1
2359,065675c12e22e3e1,You fascit Nazi! You socialist bastard! You fi...,1,1,1,0,1,1
2374,0661f992b32b3351,FUCK OF NAZI PROPAGANDA\nodpierdalcie sie jeba...,1,1,1,0,0,1
2937,07f1e0a1c10fcf28,Dumb Niggaz \n\nYall Niggas are dumb.Sean King...,1,1,1,0,1,1
3008,081a5f7b052138cf,Hey Roy. Go fukk yourself you gay bich.\n\nYa...,1,1,1,0,1,1
3469,0959d4fdaf74a050,THIS DUDE THAT BLOCKED ME IS A FUCKING COCK SU...,1,1,1,0,1,1
6063,10304d5db5fb5821,n i g g e r f a g g o t,1,1,0,0,0,1


In [12]:
# Keep only comments that are neither obscene nor severely toxic, and retain just the
# text column together with the binary `toxic` label.
new_df = df.loc[
    df['obscene'].eq(0) & df['severe_toxic'].eq(0),
    ['comment_text', 'toxic'],
].copy()
new_df

Unnamed: 0,comment_text,toxic
0,Explanation\nWhy the edits made under my usern...,0
1,D'aww! He matches this background colour I'm s...,0
2,"Hey man, I'm really not trying to edit war. It...",0
3,"""\nMore\nI can't make any real suggestions on ...",0
4,"You, sir, are my hero. Any chance you remember...",0
...,...,...
159566,""":::::And for the second time of asking, when ...",0
159567,You should be ashamed of yourself \n\nThat is ...,0
159568,"Spitzer \n\nUmm, theres no actual article for ...",0
159569,And it looks like it was actually you who put ...,0


In [13]:
# Defining some key variables that will be used later on in the training
MAX_LEN = 200  # token length passed to bert_encode for every comment
TRAIN_BATCH_SIZE = 8  # batch size of the training DataLoader
VALID_BATCH_SIZE = 4  # batch size of the held-out (test) DataLoader
EPOCHS = 1  # number of passes over the training set
LEARNING_RATE = 1e-05  # learning rate handed to the Adam optimizer

In [14]:
# Load the tokenizer and the pretrained encoder from the same 'bert-large-uncased' checkpoint.
# NOTE: despite its name, `bert_base_model` holds the *large* checkpoint; the name is kept
# because later cells refer to it.
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
bert_base_model = BertModel.from_pretrained("bert-large-uncased")
# Display the encoder configuration (hidden size, number of layers, ...).
bert_base_model.config

BertConfig {
  "_attn_implementation_autoset": true,
  "_name_or_path": "bert-large-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.46.2",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

### Preparing the Dataset and Dataloader

In [None]:
def bert_encode(text, max_len):
    """Tokenize a single text into fixed-length BERT input tensors.

    Runs of whitespace (including newlines) are collapsed to single spaces
    before the text is handed to the module-level ``tokenizer``.

    Args:
        text: The string to encode.
        max_len: Exact length of the returned sequences; shorter inputs are
            padded and longer inputs are truncated to this many tokens.

    Returns:
        A dict with the keys ``'ids'``, ``'mask'`` and ``'token_type_ids'``,
        each a 1-D ``torch.long`` tensor built from the tokenizer output.
    """
    text = " ".join(text.split())
    inputs = tokenizer.encode_plus(
        text,
        None,
        add_special_tokens=True,
        max_length=max_len,
        # `pad_to_max_length` is deprecated; request the padding and the
        # truncation explicitly so every example has exactly `max_len` tokens
        # and no deprecation/implicit-truncation warnings are emitted.
        padding='max_length',
        truncation=True,
        return_token_type_ids=True,
    )
    return {
        'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
        'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
        'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
    }

class CustomDataset(Dataset):
    """Map-style dataset yielding encoded comments with their ``toxic`` label.

    Args:
        dataframe: Frame with a ``comment_text`` and a ``toxic`` column.
        tokenizer: Stored on the instance as ``self.tokenizer``. Encoding is
            delegated to ``bert_encode``, which uses the module-level tokenizer.
        max_len: Sequence length forwarded to ``bert_encode``.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.comment_text = dataframe.comment_text
        self.targets = self.data.toxic
        self.max_len = max_len

    def __len__(self):
        """Return the number of comments in the dataset."""
        return len(self.comment_text)

    def __getitem__(self, index):
        """Return the encoded comment and its target at *position* ``index``.

        Rows are addressed positionally with ``.iloc`` so the dataset also
        works for frames whose index is not 0..n-1 (e.g. a filtered frame
        that was not re-indexed); label-based lookup would raise ``KeyError``
        or silently pick a different row in that case.
        """
        inputs = bert_encode(str(self.comment_text.iloc[index]), self.max_len)
        return {
            'ids': inputs['ids'],
            'mask': inputs['mask'],
            'token_type_ids': inputs['token_type_ids'],
            # Shape (1,) float target, matching the single-logit model output.
            'targets': torch.tensor([self.targets.iloc[index]], dtype=torch.float)
        }

In [16]:
# Creating the dataset and dataloader for the neural network

train_size = 0.8  # fraction of rows sampled into the training split
# Sample the training rows with a fixed seed; the test split is every row that was not sampled.
train_dataset = new_df.sample(frac=train_size, random_state=42)
test_dataset = new_df.drop(train_dataset.index).reset_index(drop=True)
# Re-index the training split only now: the drop above needs the original row labels.
train_dataset = train_dataset.reset_index(drop=True)

print("FULL Dataset: {}".format(new_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

# Wrap both splits so that rows are tokenized when they are fetched.
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (151044, 2)
TRAIN Dataset: (120835, 2)
TEST Dataset: (30209, 2)


In [17]:
# Training batches are reshuffled on every pass over the data.
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

# Evaluation gains nothing from shuffling; a fixed order keeps the collected
# predictions reproducible and aligned with the rows of the test split.
test_params = {'batch_size': VALID_BATCH_SIZE,
               'shuffle': False,
               'num_workers': 0
               }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [40]:
# Pull one batch from the training loader to inspect its structure.
next(iter(training_loader))

{'ids': tensor([[  101,  9548,  2017,  ...,     0,     0,     0],
         [  101,  1000,  2997,  ...,     0,     0,     0],
         [  101,  6149,  2021,  ...,     0,     0,     0],
         ...,
         [  101,  3746,  1024,  ...,     0,     0,     0],
         [  101, 16948,  1024,  ...,     0,     0,     0],
         [  101,  2045,  1005,  ...,     0,     0,     0]]),
 'mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'targets': tensor([[0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.],
         [0.]])}

### Neural Network for Fine Tuning

In [19]:
# Creating the customized model by adding a dropout and a dense layer on top of the pretrained BERT encoder to produce the final output.

class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a single-logit linear head.

    Args:
        base_bert_model: Encoder module. When called with
            ``return_dict=False`` it must return a tuple whose second element
            is the tensor fed to the head.

    The attribute names ``l1``/``l2``/``l3`` are kept unchanged because they
    are the keys of the saved state dict.
    """

    def __init__(self, base_bert_model):
        super().__init__()
        self.l1 = base_bert_model
        self.l2 = torch.nn.Dropout(0.3)
        # Size the head from the encoder's own configuration instead of
        # hard-coding 1024, so checkpoints with another hidden size work too;
        # fall back to 1024 when the encoder exposes no `config.hidden_size`.
        hidden_size = getattr(getattr(base_bert_model, 'config', None), 'hidden_size', 1024)
        self.l3 = torch.nn.Linear(hidden_size, 1)

    def forward(self, ids, mask, token_type_ids):
        """Return one raw logit per example (no sigmoid is applied here)."""
        # The first element of the encoder output is discarded; only the
        # second one is passed through dropout and the linear head.
        _, output_1 = self.l1(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        output_2 = self.l2(output_1)
        output = self.l3(output_2)
        return output


# Wrap the pretrained encoder with the classification head, move it to the
# selected device and display the resulting module.
model = BERTClass(bert_base_model).to(device)
model

BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12, elem

In [20]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 float targets.

    The sigmoid is applied inside the loss, so ``outputs`` must be the
    un-activated model outputs.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [21]:
# Adam over every parameter of the model (encoder and head are fine-tuned together).
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)

### Fine Tuning the Model

In [22]:
def train(epoch):
    """Run one pass over ``training_loader``, updating ``model`` in place.

    Uses the module-level ``model``, ``optimizer``, ``loss_fn``, ``device``
    and ``training_loader``.

    Args:
        epoch: Epoch number; only used in the progress message.
    """
    model.train()
    for step, data in enumerate(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        # Report the loss of the current batch every 5000 steps.
        if step % 5000 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        # Clear stale gradients once, right before back-propagation.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [21]:
# Fine-tune for the configured number of epochs; train() prints the batch loss every 5000 steps.
for epoch in range(EPOCHS):
    train(epoch)

Epoch: 0, Loss:  0.7941539883613586
Epoch: 0, Loss:  0.34532198309898376
Epoch: 0, Loss:  0.003761229570955038
Epoch: 0, Loss:  0.004295204766094685


### Validating the Model

In [23]:
def validation(epoch):
    """Score every batch of ``testing_loader`` with the module-level ``model``.

    The model is switched to evaluation mode and run without gradient
    tracking. ``epoch`` is accepted for symmetry with ``train`` but not used.

    Returns:
        ``(fin_outputs, fin_targets)``: two lists with one single-element
        list per example, holding the sigmoid of the model output and the
        target respectively.
    """
    model.eval()
    fin_outputs, fin_targets = [], []
    with torch.no_grad():
        for batch in testing_loader:
            ids = batch['ids'].to(device, dtype=torch.long)
            mask = batch['mask'].to(device, dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device, dtype=torch.long)
            targets = batch['targets'].to(device, dtype=torch.float)

            logits = model(ids, mask, token_type_ids)
            probabilities = torch.sigmoid(logits)

            fin_targets += targets.cpu().detach().numpy().tolist()
            fin_outputs += probabilities.cpu().detach().numpy().tolist()
    return fin_outputs, fin_targets

In [23]:
# Evaluate on the held-out split and report accuracy and F1 scores.
for epoch in range(EPOCHS):
    outputs, targets = validation(epoch)
    # validation() returns sigmoid probabilities; 0.5 is the decision threshold.
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")

Accuracy Score = 0.9669303849846073
F1 Score (Micro) = 0.9669303849846073
F1 Score (Macro) = 0.8187776567678853


In [24]:
# Persist the whole model object. NOTE(review): this pickles the module itself, so loading
# the file presumably requires the BERTClass definition to be available — confirm before relying on it.
torch.save(model, "fine-tunned-bert-model.pth")

In [25]:
# Persist only the model parameters (state dict); this is the file reloaded in the next cell.
torch.save(model.state_dict(), "fine-tunned-bert-model-dict.pth")

In [26]:
# Reload the saved parameters into the in-memory model (weights_only=True is passed to torch.load).
model.load_state_dict(torch.load("fine-tunned-bert-model-dict.pth", weights_only=True))

<All keys matched successfully>

In [35]:
# Tokenize a sample sentence directly with the tokenizer (no padding requested) to inspect the raw encoding.
text = "Test me. My name is thiago. I said thiago thiago thiago thiago thiago thiago"
encoded_input = tokenizer(text, return_tensors='pt')
encoded_input

{'input_ids': tensor([[  101,  3231,  2033,  1012,  2026,  2171,  2003, 16215,  2401,  3995,
          1012,  1045,  2056, 16215,  2401,  3995, 16215,  2401,  3995, 16215,
          2401,  3995, 16215,  2401,  3995, 16215,  2401,  3995, 16215,  2401,
          3995,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1]])}

In [36]:
# Encode the same sentence with the helper used by the dataset (padded to MAX_LEN tokens).
encoded_input = bert_encode(text, MAX_LEN)
encoded_input

{'ids': tensor([  101,  3231,  2033,  1012,  2026,  2171,  2003, 16215,  2401,  3995,
          1012,  1045,  2056, 16215,  2401,  3995, 16215,  2401,  3995, 16215,
          2401,  3995, 16215,  2401,  3995, 16215,  2401,  3995, 16215,  2401,
          3995,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,  

In [51]:
# Move the single encoded example to the model's device.
ids = encoded_input['ids'].to(device, dtype = torch.long)
mask = encoded_input['mask'].to(device, dtype = torch.long)
token_type_ids = encoded_input['token_type_ids'].to(device, dtype = torch.long)
print(ids.shape, mask.shape, token_type_ids.shape)

# unsqueeze(0) adds the batch dimension; the result is the raw output of the
# linear head (no sigmoid applied).
outputs = model(ids.unsqueeze(0), mask.unsqueeze(0), token_type_ids.unsqueeze(0))
outputs

torch.Size([200]) torch.Size([200]) torch.Size([200])


tensor([[-3.4070]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [52]:
def is_toxic(text):
    """Classify a single text with the fine-tuned module-level ``model``.

    Args:
        text: The string to classify.

    Returns:
        ``(toxic, score)`` where ``score`` is the raw model logit and
        ``toxic`` is true when ``sigmoid(score) >= 0.5``. This is the same
        decision rule the validation step applies; thresholding the raw logit
        at 0.5 would misclassify logits between 0 and 0.5 as non-toxic.
    """
    encoded_input = bert_encode(text, MAX_LEN)
    ids = encoded_input['ids'].to(device, dtype=torch.long)
    mask = encoded_input['mask'].to(device, dtype=torch.long)
    token_type_ids = encoded_input['token_type_ids'].to(device, dtype=torch.long)

    # Predict in evaluation mode (dropout disabled) and without building an
    # autograd graph, then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(ids.unsqueeze(0), mask.unsqueeze(0), token_type_ids.unsqueeze(0))
    finally:
        model.train(was_training)

    probability = torch.sigmoid(outputs)
    return (probability >= 0.5).cpu().detach().numpy()[0][0], outputs.cpu().detach().numpy()[0][0]


def prefixed_is_toxic_check():
    """Run ``is_toxic`` on a fixed list of sample sentences.

    Returns:
        A tuple with one formatted line per sample, in the order listed
        below, showing the score, the toxic flag and the sentence itself.
    """
    samples = [
        "You are an idiot",
        "You are a devil",
        "You stink",
        "What are you doing here? You are not welcome in this country",
        "hello my friend! I love you, you know that, right?",
        "I watched the soccer game yesterday and it was an aweful experience",
        "Hello Dad; how are you doing?",
        "Hi Mom!",
    ]

    def print_is_toxic(text):
        # result[0] is the toxic flag, result[1] the score.
        result = is_toxic(text)
        return f'[Score: {result[1]:0.3f}] [Toxic: {result[0]}] {text}'

    return tuple(print_is_toxic(sample) for sample in samples)

# Run the smoke check; the returned tuple of report lines is displayed as the cell output.
prefixed_is_toxic_check()

('[Score: 3.459] [Toxic: True] You are an idiot',
 '[Score: 1.558] [Toxic: True] You are a devil',
 '[Score: 2.884] [Toxic: True] You stink',
 '[Score: -2.689] [Toxic: False] What are you doing here? You are not welcome in this country',
 '[Score: -8.527] [Toxic: False] hello my friend! I love you, you know that, right?',
 '[Score: -8.482] [Toxic: False] I watched the soccer game yesterday and it was an aweful experience',
 '[Score: -7.419] [Toxic: False] Hello Dad; how are you doing?',
 '[Score: -2.216] [Toxic: False] Hi Mom!')