In [1]:
import transformers
import pandas as pd 
import torch 
import numpy as np 
from sklearn import metrics 
from torch.utils.data import Dataset,DataLoader
from torch.utils.data import RandomSampler,SequentialSampler
from transformers import BertTokenizer,BertModel,BertConfig
from torch import cuda 
device = 'cuda' if cuda.is_available() else 'cpu'

In [2]:
print(device)

cpu


In [3]:
df = pd.read_csv('jigsaw_toxic_comment_classification.csv')

In [4]:
df.shape

(159571, 8)

In [5]:
print(df.columns.tolist())

['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [6]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [7]:
df['list'] = df[df.columns[2:]].values.tolist()
new_df = df[['comment_text', 'list']].copy()
new_df.head()

Unnamed: 0,comment_text,list
0,Explanation\nWhy the edits made under my usern...,"[0, 0, 0, 0, 0, 0]"
1,D'aww! He matches this background colour I'm s...,"[0, 0, 0, 0, 0, 0]"
2,"Hey man, I'm really not trying to edit war. It...","[0, 0, 0, 0, 0, 0]"
3,"""\nMore\nI can't make any real suggestions on ...","[0, 0, 0, 0, 0, 0]"
4,"You, sir, are my hero. Any chance you remember...","[0, 0, 0, 0, 0, 0]"


In [8]:
new_df.loc[0]['list']

[0, 0, 0, 0, 0, 0]

Dataloader that will feed the data in batches to the neural network for suitable training and processing. Dataset and Dataloader are constructs of the PyTorch library for defining and controlling the data pre-processing and its passage to neural network.

In [9]:
MAX_LEN = 50
TRAIN_BATCH_SIZE = 64 
VALID_BATCH_SIZE = 4 
EPOCHS = 1 
LEARNING_RATE = 1e-05
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [23]:
class CustomDataset(Dataset):
    def __init__(self,dataframe,tokenizer,max_len):
        self.tokenizer = tokenizer 
        self.data = dataframe 
        self.comment_text = dataframe.comment_text 
        self.targets = self.data.list 
        self.max_len = max_len 
    
    def __len__(self):
        return len(self.comment_text)
    
    def __getitem__(self,index):
        comment_text = str(self.comment_text[index])
        comment_text = " ".join(comment_text.split())
        
        inputs = self.tokenizer.encode_plus(
            comment_text,
            None,
            add_special_tokens = True,
            max_length = self.max_len,
            pad_to_max_length = True,
            return_token_type_ids = True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs['token_type_ids']
        
        return {
            'ids':torch.tensor(ids,dtype=torch.long),
            'mask':torch.tensor(mask,dtype=torch.long),
            'token_type_ids':torch.tensor(token_type_ids,dtype=torch.long),
            'target':torch.tensor(self.targets[index],dtype=torch.float)
        }

In [24]:
text = str(df['comment_text'][0]).split(' ')

In [25]:
print(text)

['Explanation\nWhy', 'the', 'edits', 'made', 'under', 'my', 'username', 'Hardcore', 'Metallica', 'Fan', 'were', 'reverted?', 'They', "weren't", 'vandalisms,', 'just', 'closure', 'on', 'some', 'GAs', 'after', 'I', 'voted', 'at', 'New', 'York', 'Dolls', 'FAC.', 'And', 'please', "don't", 'remove', 'the', 'template', 'from', 'the', 'talk', 'page', 'since', "I'm", 'retired', 'now.89.205.38.27']


In [26]:
print(tokenizer)

BertTokenizer(name_or_path='bert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)


In [27]:
tokenizer.encode_plus(text)

{'input_ids': [101, 100, 1996, 100, 2081, 2104, 2026, 100, 100, 100, 100, 2020, 100, 100, 100, 100, 2074, 8503, 2006, 2070, 100, 2044, 100, 5444, 2012, 100, 100, 100, 100, 100, 3531, 100, 6366, 1996, 23561, 2013, 1996, 2831, 3931, 2144, 100, 3394, 100, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [28]:
len(tokenizer.encode_plus(text)['input_ids'])

44

In [29]:
print(tokenizer.encode(text))

[101, 100, 1996, 100, 2081, 2104, 2026, 100, 100, 100, 100, 2020, 100, 100, 100, 100, 2074, 8503, 2006, 2070, 100, 2044, 100, 5444, 2012, 100, 100, 100, 100, 100, 3531, 100, 6366, 1996, 23561, 2013, 1996, 2831, 3931, 2144, 100, 3394, 100, 102]


* token_type_ids: This list indicates to which segment each token belongs. 
* input_ids: This is a list of integers representing the tokenized version of the input text.
* attention_mask: This list of binary values (0 or 1) indicates which tokens should be attended to (1) and which should be ignored (0). 

In [30]:
train_size = 0.03
test_size = 0.01
train_dataset = new_df.sample(frac = train_size,random_state = 200)
test_dataset = new_df.drop(train_dataset.index).reset_index(drop = True)
train_dataset = train_dataset.reset_index(drop = True)
test_dataset = new_df.sample(frac = 0.01,random_state = 200)

In [31]:
print('Full dataset :{}'.format(new_df.shape))
print('Train dataset :{}'.format(train_dataset.shape))
print('Test dataset :{}'.format(test_dataset.shape))

training_set = CustomDataset(train_dataset,tokenizer,MAX_LEN)
testing_set = CustomDataset(test_dataset,tokenizer,MAX_LEN)

Full dataset :(159571, 2)
Train dataset :(4787, 2)
Test dataset :(1596, 2)


In [32]:
training_set.__len__()

4787

In [33]:
sample_index = 39
for i in training_set[sample_index].keys():
    print(i,training_set[sample_index][i].shape)

ids torch.Size([50])
mask torch.Size([50])
token_type_ids torch.Size([50])
target torch.Size([6])


In [34]:
train_params = {
    'batch_size':TRAIN_BATCH_SIZE,
    'shuffle':True,
    'num_workers':0
}
test_params = {
    'batch_size':VALID_BATCH_SIZE,
    'shuffle':True,
    'num_workers':0
}
training_loader = DataLoader(training_set,**train_params)
testing_loader = DataLoader(testing_set,**test_params)

In [35]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass,self).__init__()
        self.bert_layer = transformers.BertModel.from_pretrained('bert-base-uncased')
        self.dropout_layer = torch.nn.Dropout(0.3)
        self.linear_layer = torch.nn.Linear(768,6)
        
    def forward(self,ids,mask,token_type_ids):
        _,output = self.bert_layer(
            ids,
            attention_mask = mask,
            token_type_ids = token_type_ids,
            return_dict = False
        )
        output_b = self.dropout_layer(output)
        output = self.linear_layer(output)
        return output

In [36]:
model = BERTClass()
model.to(device)

BERTClass(
  (bert_layer): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [37]:
def loss_fn(outputs,targets):
    return torch.nn.BCEWithLogitsLoss()(outputs, targets)

In [38]:
optimizer = torch.optim.Adam(
    params = model.parameters(),
    lr = LEARNING_RATE
)

In [43]:
def train(epoch):
    model.train()
    for _,data in enumerate(training_loader,0):
        ids = data['ids'].to(device,dtype=torch.long)
        mask = data['mask'].to(device,dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device,dtype=torch.long)
        targets = data['target'].to(device)
        
        outputs = model(ids,mask,token_type_ids)
        optimizer.zero_grad()
        loss = loss_fn(outputs,targets)
        
        print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
for epoch in range(EPOCHS):
    train(epoch)

In [47]:
def validation():
    model.eval()
    fin_targets=[]
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().detach().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
    return fin_outputs, fin_targets

In [63]:
model.eval()
fin_targets = []
fin_outputs = []
with torch.no_grad():
    for _,data in enumerate(training_loader,0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
        targets = data['target'].to(device, dtype = torch.float)
        outputs = model(ids,mask,token_type_ids)
        fin_targets.extend(targets.cpu().detach().numpy().tolist())
        fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist()) 

In [None]:
for i in fin_outputs:
    print(i)

In [None]:
for outputs,targets in zip(fin_outputs,fin_targets):
    outputs = np.array(outputs) >= 0.5
    accuracy = metrics.accuracy_score(targets, outputs)
    f1_score_micro = metrics.f1_score(targets, outputs, average='micro')
    f1_score_macro = metrics.f1_score(targets, outputs, average='macro')
    print(f"Accuracy Score = {accuracy}")
    print(f"F1 Score (Micro) = {f1_score_micro}")
    print(f"F1 Score (Macro) = {f1_score_macro}")