In [None]:
# Mount Google Drive at /content/gdrive; the checkpoint paths used for training
# later in this notebook live under /content/gdrive/MyDrive.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


# Importing libraries

In [None]:
!pip install transformers

import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import random
import shutil
import sys
from sklearn.model_selection import train_test_split

Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m79.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m82.4 MB/s[0m eta [36m0:00:0

# Setting seed for reproducibility

In [None]:
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy and PyTorch (CPU and all CUDA devices) with
    the same value, then switches cuDNN to deterministic mode and disables its
    benchmark auto-tuner.

    Args:
        seed: integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

In [None]:
# Seed all RNGs once, before the data is split and the model is created.
set_seed(43)

# Downloading dataset

In [None]:
from requests import get as rget

# NOTE(review): the URL carries basic-auth userinfo (`xxx:xxxxxxxx@`). It looks
# like a redacted placeholder; real credentials must never be committed. If the
# repository is public, drop the userinfo; otherwise read it from the
# environment instead of hard-coding it.
DATASET_URL = "https://xxx:xxxxxxxx@raw.githubusercontent.com/yogasgm/indonesian-online-toxicity-detection/main/dataset/online-toxicity-type-processed.csv"

# A timeout keeps the cell from hanging forever on a stalled connection.
res = rget(DATASET_URL, timeout=60)
# Fail loudly on 4xx/5xx: without this check an HTML error page would be
# written to file.csv and then silently mis-parsed as the dataset.
res.raise_for_status()

with open('file.csv', 'wb') as f:
    f.write(res.content)

# Full dataset; it is split into train/validation/test further down.
train_df = pd.read_csv('file.csv')

In [None]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11847 entries, 0 to 11846
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   comment_text       11845 non-null  object
 1   identity_attack    11847 non-null  int64 
 2   insult             11847 non-null  int64 
 3   profanity          11847 non-null  int64 
 4   threat             11847 non-null  int64 
 5   sexually_explicit  11847 non-null  int64 
dtypes: int64(5), object(1)
memory usage: 555.5+ KB


In [None]:
train_df.columns

Index(['comment_text', 'identity_attack', 'insult', 'profanity', 'threat',
       'sexually_explicit'],
      dtype='object')

# Selecting required columns

In [None]:
# Keep only the comment text and the five label columns, in a fixed order.
train_df = train_df[['comment_text', 'identity_attack', 'insult', 'profanity', 'threat', 'sexually_explicit',]]

In [None]:
# Names of the label columns; CustomDataset reads these columns as the
# per-sample target vector, and the split statistics are computed over them.
target_list = ['identity_attack', 'insult', 'profanity', 'threat', 'sexually_explicit',]

# Preparing the tokenizer

In [None]:
# Token length passed to CustomDataset, which pads/truncates every comment to it.
MAX_LEN = 128

In [None]:
from transformers import BertTokenizer, BertModel

In [None]:
# Load the tokenizer (downloaded on first use, see the progress bars below).
# It uses the same checkpoint name as the encoder loaded in BERTClass.
tokenizer = BertTokenizer.from_pretrained('indolem/indobertweet-base-uncased')

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/235k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.10k [00:00<?, ?B/s]

In [None]:
class CustomDataset(torch.utils.data.Dataset):
    """Tokenized multi-label comment dataset backed by a DataFrame.

    Args:
        df: DataFrame with a 'comment_text' column and one column per label.
        tokenizer: object exposing `encode_plus` (here a BertTokenizer).
        max_len: every sample is padded/truncated to this many tokens.
        target_cols: names of the label columns. Defaults to the notebook-level
            `target_list` when omitted, which is the original behavior.

    Each item is a dict with 1-D tensors 'input_ids', 'attention_mask' and
    'token_type_ids' plus a float tensor 'targets' holding the label values.
    """

    def __init__(self, df, tokenizer, max_len, target_cols=None):
        if target_cols is None:
            target_cols = target_list
        self.tokenizer = tokenizer
        self.df = df
        self.title = df['comment_text']
        self.targets = self.df[list(target_cols)].values
        self.max_len = max_len

    def __len__(self):
        return len(self.title)

    def __getitem__(self, index):
        # Positional lookup, so texts stay aligned with `self.targets` (a plain
        # array) even when the DataFrame index is not 0..n-1.
        raw_text = self.title.iloc[index]
        # A missing comment would otherwise be tokenized as the literal word "nan".
        title = "" if pd.isna(raw_text) else str(raw_text)
        # Collapse runs of whitespace into single spaces.
        title = " ".join(title.split())

        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer returns (1, max_len) tensors; flatten drops the
            # leading dimension so the DataLoader can stack samples.
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.tensor(self.targets[index], dtype=torch.float)
        }

# Splitting the Data

In [None]:
# 80/10/10 train/validation/test split: hold out 20% of the rows first, then
# cut that hold-out in half.
train_df, temp_df = train_test_split(train_df, test_size=0.2, random_state=43)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=43)

# Give every split a fresh 0..n-1 index and drop the shuffled original labels.
train_df, val_df, test_df = (
    split.reset_index(drop=True) for split in (train_df, val_df, test_df)
)

In [None]:
# Per-label column sums for each split (the number of positive samples when
# the label columns hold 0/1 values).
train_counts, val_counts, test_counts = (
    split[target_list].sum(axis=0) for split in (train_df, val_df, test_df)
)

print("Label distribution in the training set:\n", train_counts)
print("\nLabel distribution in the validation set:\n", val_counts)
print("\nLabel distribution in the test set:\n", test_counts)

Label distribution in the training set:
 identity_attack      2057
insult               2799
profanity            2823
threat               1987
sexually_explicit    2023
dtype: int64

Label distribution in the validation set:
 identity_attack      267
insult               379
profanity            379
threat               255
sexually_explicit    252
dtype: int64

Label distribution in the test set:
 identity_attack      257
insult               357
profanity            337
threat               261
sexually_explicit    242
dtype: int64


In [None]:
# Per-label column sums expressed as a percentage of each split's row count.
train_counts_percentage, val_counts_percentage, test_counts_percentage = (
    split[target_list].sum(axis=0).div(len(split)).mul(100)
    for split in (train_df, val_df, test_df)
)

print("Label distribution in the training set:\n", train_counts_percentage)
print("\nLabel distribution in the validation set:\n", val_counts_percentage)
print("\nLabel distribution in the test set:\n", test_counts_percentage)

Label distribution in the training set:
 identity_attack      21.705181
insult               29.534663
profanity            29.787908
threat               20.966551
sexually_explicit    21.346418
dtype: float64

Label distribution in the validation set:
 identity_attack      22.531646
insult               31.983122
profanity            31.983122
threat               21.518987
sexually_explicit    21.265823
dtype: float64

Label distribution in the test set:
 identity_attack      21.687764
insult               30.126582
profanity            28.438819
threat               22.025316
sexually_explicit    20.421941
dtype: float64


In [None]:
train_df.shape

(9477, 6)

In [None]:
val_df.shape

(1185, 6)

In [None]:
val_df

Unnamed: 0,comment_text,identity_attack,insult,profanity,threat,sexually_explicit
0,kalo hukum mati rajam atau kebiri pemerkosa hu...,0,0,0,1,0
1,lu sihpada doyan jd lgbtemang bisa homo punya ...,1,1,1,0,1
2,bahas islam merusak nilai penguatan karakter m...,1,1,0,0,0
3,debut egy maulana vikri di liga cetak gol dan...,0,0,0,0,0
4,ngapain kirim gambar kontol doang yg enak itu ...,0,0,1,0,1
...,...,...,...,...,...,...
1180,vidio bokep indo viral abg skandal ngentot cew...,1,1,1,0,1
1181,lawan politik mulai tebar fitnah ada upaya jeg...,0,0,0,0,0
1182,praktik sunat perempuan menimbulkan dilema ant...,0,0,0,0,0
1183,mau juga sieh cuman sayang aku hanya orang pap...,1,1,0,0,0


In [None]:
test_df

Unnamed: 0,comment_text,identity_attack,insult,profanity,threat,sexually_explicit
0,krisdayanti kasih kopi ke ameena netizen terbelah,0,0,0,0,0
1,berdasarkan rekonstruksi hasya berada di aspal...,0,0,0,0,0
2,pertanggungjawaban untuk pemerkosaan itu bukan...,0,1,1,1,0
3,gerombolan kadrun menganggap hanya kelompoknya...,1,1,1,0,0
4,anjayyyy,0,0,0,0,0
...,...,...,...,...,...,...
1180,ngewe lah klo sange,0,0,1,0,1
1181,aku bom surat berharga syariah negara,0,0,0,0,0
1182,paling demen bgt gue kalo tau ada orang yang s...,0,1,0,0,0
1183,profil dan pendiri yang tutup operasi di ri pe...,0,0,0,0,0


In [None]:
# Wrap each split in a CustomDataset; all three share the same tokenizer and
# are padded/truncated to MAX_LEN tokens.
train_dataset = CustomDataset(train_df, tokenizer, MAX_LEN)
valid_dataset = CustomDataset(val_df, tokenizer, MAX_LEN)
test_dataset = CustomDataset(test_df, tokenizer, MAX_LEN)

In [None]:
len(train_dataset)

9477

# Setting hyperparameters

In [None]:
# Batch sizes are consumed by the DataLoaders, EPOCHS by the train_model call
# (upper bound; early stopping may end training sooner) and LEARNING_RATE by
# the AdamW optimizer.
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 2e-5

In [None]:
# Preparing the DataLoaders
# The training loader reshuffles the samples at the start of every epoch so the
# model does not see identical batches in identical order each time. A
# dedicated, seeded generator keeps that shuffle order reproducible.
train_data_loader = torch.utils.data.DataLoader(train_dataset,
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(43),
    num_workers=0
)

# Validation is evaluation-only, so it keeps a fixed sample order.
val_data_loader = torch.utils.data.DataLoader(valid_dataset,
    batch_size=VALID_BATCH_SIZE,
    shuffle=False,
    num_workers=0
)

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
device

device(type='cuda')

# Additional functions for loading and saving checkpoints

In [None]:
def load_ckp(checkpoint_fpath, model, optimizer):
    """Restore model and optimizer state from a checkpoint file.

    Args:
        checkpoint_fpath: path of the checkpoint file to load.
        model: model whose parameters are overwritten in place.
        optimizer: optimizer whose state is overwritten in place.

    Returns:
        Tuple `(model, optimizer, epoch, valid_loss_min)`, where the last two
        are the values stored under the checkpoint's 'epoch' and
        'valid_loss_min' keys.
    """
    # Deserialize onto the CPU so a checkpoint written on a GPU runtime can
    # also be restored on a CPU-only one. load_state_dict then copies the
    # values into the existing tensors on whatever device they already live on.
    checkpoint = torch.load(checkpoint_fpath, map_location='cpu')
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    return model, optimizer, checkpoint['epoch'], checkpoint['valid_loss_min']

def save_ckp(state, is_best, checkpoint_path, best_model_path):
    """Write a checkpoint and, when it is the best so far, duplicate it.

    Args:
        state: checkpoint object to serialize with `torch.save`.
        is_best: whether this checkpoint should also become the best model.
        checkpoint_path: file the checkpoint is always written to.
        best_model_path: file that receives a copy when `is_best` is true.
    """
    torch.save(state, checkpoint_path)
    if not is_best:
        return
    # Copy the file that was just written rather than serializing twice.
    shutil.copyfile(checkpoint_path, best_model_path)

# Training the Model

Defining and Initializing the BERT Classification Model

In [None]:
class BERTClass(torch.nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Args:
        n_classes: number of output logits, one per label (default 5).
        model_name: checkpoint name handed to `BertModel.from_pretrained`.
        dropout: dropout probability applied to the pooled encoder output.

    `forward` returns raw logits (no sigmoid is applied here), one per class
    for every input row.
    """

    def __init__(self, n_classes=5,
                 model_name='indolem/indobertweet-base-uncased', dropout=0.3):
        super().__init__()
        self.bert_model = BertModel.from_pretrained(model_name, return_dict=True)
        self.dropout = torch.nn.Dropout(dropout)
        # Size the head from the encoder's own config instead of hard-coding
        # 768, so a checkpoint with a different hidden size also works.
        self.linear = torch.nn.Linear(self.bert_model.config.hidden_size, n_classes)

    def forward(self, input_ids, attn_mask, token_type_ids):
        output = self.bert_model(
            input_ids,
            attention_mask=attn_mask,
            token_type_ids=token_type_ids
        )
        # Classify from the encoder's pooled output.
        pooled = self.dropout(output.pooler_output)
        return self.linear(pooled)

# Instantiate the classifier and move its parameters to the selected device.
# As the cell's last expression, model.to(device) also makes the notebook
# display the module structure.
model = BERTClass()
model.to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/445M [00:00<?, ?B/s]

BERTClass(
  (bert_model): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31923, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

Setting Up the Loss Function and Optimizer

In [None]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy computed directly on raw logits.

    Args:
        outputs: model logits (no sigmoid applied).
        targets: float tensor of the same shape holding the label values.

    Returns:
        Scalar loss tensor.
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

# AdamW over every model parameter (encoder and classification head alike),
# all sharing the single LEARNING_RATE.
optimizer = torch.optim.AdamW(params =  model.parameters(), lr=LEARNING_RATE)

Initialization of Validation Target and Output Lists

In [None]:
# Module-level buffers: train_model fills these with the targets and the
# sigmoid probabilities of the validation batches it processes.
val_targets=[]
val_outputs=[]

Training and Validation Loop with Early Stopping

In [None]:
def _forward_batch(model, data):
    """Move one loader batch to `device` and return `(logits, targets)`."""
    ids = data['input_ids'].to(device, dtype=torch.long)
    mask = data['attention_mask'].to(device, dtype=torch.long)
    token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
    targets = data['targets'].to(device, dtype=torch.float)
    return model(ids, mask, token_type_ids), targets


def train_model(n_epochs, training_loader, validation_loader, model,
                optimizer, checkpoint_path, best_model_path, patience):
    """Train `model` with per-epoch validation, checkpointing and early stopping.

    Relies on the notebook-level names `device`, `loss_fn`, `save_ckp`,
    `val_targets` and `val_outputs`.

    Args:
        n_epochs: maximum number of epochs to run.
        training_loader: iterable of training batches (dicts with 'input_ids',
            'attention_mask', 'token_type_ids' and 'targets').
        validation_loader: iterable of validation batches of the same form.
        model: module called as `model(ids, mask, token_type_ids)`.
        optimizer: optimizer stepping the model's parameters.
        checkpoint_path: file the checkpoint is written to.
        best_model_path: file receiving a copy of the best checkpoint.
        patience: number of consecutive epochs without validation-loss
            improvement after which training stops.

    Returns:
        The model as it is after the last executed epoch. The best weights are
        the ones saved at `best_model_path`.

    Side effects:
        `val_targets` and `val_outputs` are emptied at the start of every
        validation pass and refilled, so after the call they hold the targets
        and sigmoid probabilities of the most recent epoch only.
    """
    # float('inf') instead of np.Inf, which was removed in NumPy 2.0.
    valid_loss_min = float('inf')
    no_improve = 0

    for epoch in range(1, n_epochs + 1):
        train_loss = 0.0
        valid_loss = 0.0

        model.train()
        print('############# Epoch {}: Training Start   #############'.format(epoch))
        for batch_idx, data in enumerate(training_loader):
            optimizer.zero_grad()
            outputs, targets = _forward_batch(model, data)
            loss = loss_fn(outputs, targets)
            loss.backward()
            optimizer.step()
            # Incremental mean: after this line train_loss is the average
            # loss over batches 0..batch_idx.
            train_loss += (loss.item() - train_loss) / (batch_idx + 1)

        print('############# Epoch {}: Training End     #############'.format(epoch))

        print('############# Epoch {}: Validation Start   #############'.format(epoch))
        model.eval()

        # Start from empty buffers so the lists never mix predictions made by
        # different epochs' models. Cleared in place to keep the notebook-level
        # names bound to the same list objects.
        val_targets.clear()
        val_outputs.clear()

        with torch.no_grad():
            for batch_idx, data in enumerate(validation_loader):
                outputs, targets = _forward_batch(model, data)
                loss = loss_fn(outputs, targets)
                valid_loss += (loss.item() - valid_loss) / (batch_idx + 1)
                val_targets.extend(targets.cpu().numpy().tolist())
                val_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())

        print('############# Epoch {}: Validation End     #############'.format(epoch))
        # train_loss / valid_loss are already per-batch averages; dividing by
        # the number of batches again would under-report them.
        print('Epoch: {} \tAvgerage Training Loss: {:.6f} \tAverage Validation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
        ))

        checkpoint = {
            # Stored as the epoch a resumed run would continue from.
            'epoch': epoch + 1,
            'valid_loss_min': valid_loss,
            'state_dict': model.state_dict(),
            'optimizer': optimizer.state_dict()
        }

        if valid_loss <= valid_loss_min:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
                valid_loss_min,
                valid_loss
            ))
            save_ckp(checkpoint, True, checkpoint_path, best_model_path)
            valid_loss_min = valid_loss
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                print("Early stopping due to no improvement in validation loss")
                break

    return model

In [None]:
# Checkpoint locations on the mounted Google Drive: ckpt_path receives the
# checkpoint written by save_ckp, best_model_path the copy of the best one.

ckpt_path = "/content/gdrive/MyDrive/curr_ckpt_32_2e-5_IndoBT_Toxictype"
best_model_path = "/content/gdrive/MyDrive/best_model_32_2e-5_IndoBT_Toxictype.pt"

# Start training

In [None]:
# Train for at most EPOCHS epochs; training stops early after 2 consecutive
# epochs without an improvement in validation loss.
trained_model = train_model(EPOCHS, train_data_loader, val_data_loader, model, optimizer, ckpt_path, best_model_path, patience=2)

############# Epoch 1: Training Start   #############
############# Epoch 1: Training End     #############
############# Epoch 1: Validation Start   #############
############# Epoch 1: Validation End     #############
Epoch: 1 	Avgerage Training Loss: 0.000827 	Average Validation Loss: 0.003605
Validation loss decreased (inf --> 0.003605).  Saving model ...
############# Epoch 2: Training Start   #############
############# Epoch 2: Training End     #############
############# Epoch 2: Validation Start   #############
############# Epoch 2: Validation End     #############
Epoch: 2 	Avgerage Training Loss: 0.000383 	Average Validation Loss: 0.003149
Validation loss decreased (0.003605 --> 0.003149).  Saving model ...
############# Epoch 3: Training Start   #############
############# Epoch 3: Training End     #############
############# Epoch 3: Validation Start   #############
############# Epoch 3: Validation End     #############
Epoch: 3 	Avgerage Training Loss: 0.000271 	Average

In [None]:
# Restore the best checkpoint into the existing model and optimizer objects;
# start_epoch receives the checkpoint's stored 'epoch' value.
model, optimizer, start_epoch, valid_loss_min = load_ckp(best_model_path, model, optimizer)

print(f'The validation loss of the best saved model is: {valid_loss_min}')

The validation loss of the best saved model is: 0.003149190920190963
