In [1]:
!pip install transformers
!pip install sentencepiece



In [2]:
import pandas as pd
import numpy as np
import transformers
import torch
import torch.nn as nn
from sklearn import model_selection
from sklearn import metrics
from tqdm import tqdm , trange
from scipy import stats

import warnings
warnings.filterwarnings("ignore")

In [3]:
# Prefer the GPU whenever PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# Module-level tokenizer loaded once from the "bert-base-multilingual-uncased"
# checkpoint with lower-casing requested explicitly; train() hands it to the datasets.
TOKENIZER = transformers.BertTokenizer.from_pretrained("bert-base-multilingual-uncased", do_lower_case=True)

In [5]:
# Record the installed transformers version next to the results for reproducibility.
print(transformers.__version__)

4.8.1


In [6]:
class TOXICDATASET:
    """Indexable dataset that converts raw comments into padded BERT inputs.

    Indexing returns a dict with ``input_ids``, ``attention_masks`` and
    ``token_type_ids`` (``torch.long``, zero-padded up to ``max_len``) plus
    ``targets`` (``torch.float``) taken from ``target`` at the same index.
    """

    def __init__(self,comment_text,target,tokenizer,max_len):
        self.comment_text = comment_text   # indexable collection of comments
        self.target = target               # indexable collection of label rows
        self.tokenizer = tokenizer         # object exposing ``encode_plus``
        self.max_len = max_len             # truncation / padding length

    def __len__(self):
        return len(self.comment_text)

    def __getitem__(self,item):
        # Coerce to str and collapse every whitespace run into a single space.
        text = " ".join(str(self.comment_text[item]).split())

        encoded = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
        )

        token_ids = encoded["input_ids"]
        attention = encoded["attention_mask"]
        segments = encoded["token_type_ids"]

        # The same run of zeros right-pads all three sequences to max_len.
        filler = [0] * (self.max_len - len(token_ids))
        padded = {
            "input_ids": token_ids + filler,
            "attention_masks": attention + filler,
            "token_type_ids": segments + filler,
        }

        sample = {
            key: torch.tensor(values, dtype=torch.long)
            for key, values in padded.items()
        }
        sample["targets"] = torch.tensor(self.target[item], dtype=torch.float)
        return sample

In [7]:
class TOXICMODEL(nn.Module):
    """BERT encoder followed by dropout and a linear multi-label head.

    Args:
        conf: Checkpoint name or path passed to
            ``transformers.BertModel.from_pretrained``.
        num_labels: Number of logits produced per example. Defaults to 6,
            the size of the original hard-coded head.
        dropout: Dropout probability applied before the head. Defaults to
            0.3, the original hard-coded value.
    """

    def __init__(self,conf,num_labels = 6,dropout = 0.3):
        super().__init__()
        self.conf = conf
        self.bert = transformers.BertModel.from_pretrained(self.conf)
        self.dropout = torch.nn.Dropout(p = dropout)
        # Size the head from the loaded encoder rather than assuming 768, so
        # checkpoints with a different hidden width work as well.
        self.classifier = torch.nn.Linear(self.bert.config.hidden_size,num_labels)

    def forward(self,input_ids,attention_mask,token_type_ids):
        """Return the head's raw outputs (no activation is applied here)."""
        # Keyword arguments keep the call independent of the encoder's
        # positional parameter order. With return_dict=False the encoder
        # returns a tuple; its second element is what gets classified.
        _, output = self.bert(input_ids = input_ids,
                              attention_mask = attention_mask,
                              token_type_ids = token_type_ids,
                              return_dict = False)
        output = self.dropout(output)
        return self.classifier(output)

In [8]:
def Loss_Func(output,targets):
    """Mean binary cross-entropy computed directly on raw logits.

    ``output`` holds un-activated scores; ``targets`` holds the matching
    float labels. The result is a scalar tensor averaged over all elements.
    """
    return nn.functional.binary_cross_entropy_with_logits(output,targets)


def Train_Func(dataLoader,model,optimizer,device,scheduler = None,log_every = 10):
    """Run one training pass over ``dataLoader``.

    Args:
        dataLoader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_masks``, ``token_type_ids`` and
            ``targets``.
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Optional LR scheduler, stepped once per batch after the
            optimizer.
        log_every: Print the batch loss every ``log_every`` batches
            (default 10). A falsy value disables logging.
    """
    model.train()

    for index,batch in enumerate(dataLoader):
        ids = batch["input_ids"].to(device,dtype = torch.long)
        masks = batch["attention_masks"].to(device,dtype = torch.long)
        token = batch["token_type_ids"].to(device,dtype = torch.long)
        target = batch["targets"].to(device,dtype = torch.float)

        optimizer.zero_grad()
        output = model(input_ids = ids,
                       attention_mask = masks,
                       token_type_ids = token)
        loss = Loss_Func(output,target)
        loss.backward()
        optimizer.step()

        if scheduler is not None:
            scheduler.step()

        # Modulo, not division: `index / 10 == 0` only ever matched the very
        # first batch, so progress was reported once per epoch.
        if log_every and index % log_every == 0:
            print(f"Index : {index} >>>=============================>>> Loss : {loss}")



def Eval_Func(dataLoader,model,device):
    """Evaluate ``model`` on every batch of ``dataLoader``.

    Args:
        dataLoader: Iterable of batches; each batch is a dict with the keys
            ``input_ids``, ``attention_masks``, ``token_type_ids`` and
            ``targets``.
        model: Module called with ``input_ids``, ``attention_mask`` and
            ``token_type_ids`` keyword arguments.
        device: Device the batch tensors are moved to.

    Returns:
        A tuple ``(loss, outputs, targets)``: the sample-weighted mean loss
        over the whole loader as a scalar tensor, the sigmoid of the model
        outputs stacked row-wise, and the targets stacked row-wise.

    Raises:
        ValueError: If ``dataLoader`` yields no samples.
    """
    model.eval()
    final_targets = []
    final_outputs = []
    total_loss = 0.0
    n_samples = 0

    # Evaluation needs no autograd graph.
    with torch.no_grad():
        for batch in dataLoader:
            ids = batch["input_ids"].to(device,dtype = torch.long)
            masks = batch["attention_masks"].to(device,dtype = torch.long)
            token = batch["token_type_ids"].to(device,dtype = torch.long)
            target = batch["targets"].to(device,dtype = torch.float)

            output = model(input_ids = ids,
                           attention_mask = masks,
                           token_type_ids = token)
            loss = Loss_Func(output,target)

            # Weight each batch by its size so a short final batch does not
            # skew the overall mean.
            batch_size = target.size(0)
            total_loss = total_loss + loss.detach() * batch_size
            n_samples += batch_size

            final_targets.extend(target.cpu().detach().numpy().tolist())
            final_outputs.extend(torch.sigmoid(output).cpu().detach().numpy().tolist())

    if n_samples == 0:
        raise ValueError("Eval_Func received a dataLoader with no samples")

    # The return sits after the loop: returning from inside it evaluated only
    # the first batch.
    return total_loss / n_samples, np.vstack(final_outputs),np.vstack(final_targets)


In [9]:

def train(data_path = "/content/drive/MyDrive/Neuron/toxic/Train_Final.csv",
          epochs = 3,
          train_batch_size = 16,
          valid_batch_size = 8,
          max_len = 128,
          model_path = "model.bin"):
    """Fine-tune TOXICMODEL on a CSV of comments and keep the best checkpoint.

    The data is split 80/20 into train and validation sets. After each epoch
    the model is evaluated, and its weights are written to ``model_path``
    only when the validation loss improves on the best seen so far.

    Args:
        data_path: CSV file to read. Must contain a ``comment_text`` column.
        epochs: Number of passes over the training split.
        train_batch_size: Batch size of the training loader.
        valid_batch_size: Batch size of the validation loader.
        max_len: Token length every example is truncated / padded to.
        model_path: Destination of the saved ``state_dict``.
    """
    df = pd.read_csv(data_path)

    # Every column from the third onward is used as a label.
    # NOTE(review): this assumes the first two columns are an id and
    # `comment_text`, and that the label count matches the model head
    # (6 outputs) — confirm against the CSV.
    target_cols = df.columns[2:]

    Train_Data,Valid_Data,Train_Target,Valid_Target = model_selection.train_test_split(
        df.comment_text.values,
        df[target_cols].values,
        test_size = .2,
        random_state = 2021,
        shuffle = True)

    Train_dataset = TOXICDATASET(comment_text = Train_Data,target = Train_Target,
                                 tokenizer = TOKENIZER,max_len = max_len)
    Train_DataLoader = torch.utils.data.DataLoader(
        Train_dataset,
        batch_size = train_batch_size,
        sampler = torch.utils.data.RandomSampler(Train_dataset))

    Valid_dataset = TOXICDATASET(comment_text = Valid_Data,target = Valid_Target,
                                 tokenizer = TOKENIZER,max_len = max_len)
    Valid_DataLoader = torch.utils.data.DataLoader(
        Valid_dataset,
        batch_size = valid_batch_size,
        sampler = torch.utils.data.SequentialSampler(Valid_dataset))

    config = "bert-base-multilingual-uncased"
    model = TOXICMODEL(conf = config)
    model.to(device)

    # Biases and LayerNorm parameters are excluded from weight decay.
    # The group key must be `weight_decay`: the optimizer does not read
    # `weight_decay_rate`, so with that key no decay was applied at all.
    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': 0.01},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]

    # torch.optim.AdamW replaces transformers.AdamW, which newer transformers
    # releases no longer provide. It always applies bias correction (the old
    # correct_bias=True); eps=1e-6 keeps the epsilon the previous optimizer used.
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,lr = 2e-5,eps = 1e-6)

    # One scheduler step is taken per training batch, so the schedule length
    # is the number of training batches times the number of epochs. Counting
    # rows of the full frame included the validation split and overshot.
    total_steps = len(Train_DataLoader) * epochs
    scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                             num_warmup_steps = 0,
                                                             num_training_steps = total_steps)

    best_loss = np.inf
    for epoch in trange(epochs, desc = "EPOCHS"):

        Train_Func(dataLoader = Train_DataLoader,optimizer = optimizer,
                   device = device,model = model,scheduler = scheduler)
        Valid_loss, output, target = Eval_Func(dataLoader = Valid_DataLoader,
                                               model = model,device = device)
        Valid_loss = float(Valid_loss)

        if Valid_loss < best_loss:
            torch.save(model.state_dict(),model_path)
            # Record the new best. The assignment used to be reversed, which
            # left best_loss at infinity and re-saved the model every epoch.
            best_loss = Valid_loss

   

  


In [10]:
# Run the training routine defined in the previous cell.
train()

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
EPOCHS:   0%|          | 0/3 [00:00<?, ?it/s]



EPOCHS:  33%|███▎      | 1/3 [00:32<01:05, 32.87s/it]



EPOCHS:  67%|██████▋   | 2/3 [01:06<00:33, 33.06s/it]



EPOCHS: 100%|██████████| 3/3 [01:40<00:00, 33.45s/it]
