In [1]:
# Insert code here.
import pandas as pd
import numpy as np
import random
import re
import time
import datetime
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize
from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm, neighbors
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, BertConfig, AutoModel
import torch
from torch.utils.data import TensorDataset, random_split, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification
from transformers import get_linear_schedule_with_warmup
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
from tqdm import tqdm

# from sentence_transformers import SentenceTransformer
# sent_encoder = SentenceTransformer('bert-base-nli-mean-tokens')

In [2]:
# Pick the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda:0" if use_gpu else "cpu")

if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    # Report how many GPUs are visible and the name of the one at index 0.
    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# Ask PyTorch to release any cached CUDA memory before the model is loaded.
torch.cuda.empty_cache()

There are 4 GPU(s) available.
We will use the GPU: GeForce RTX 2080 Ti


In [3]:
# Candidate pretrained checkpoints. `model_num` selects the one used for the
# tokenizer here and for the encoder that BERTClass loads later.
models = [
    'bert-base-multilingual-cased',
    'xlm-roberta-base',
    'sagorsarker/bangla-bert-base',
    'ai4bharat/indic-bert',
]
model_num = 3  # index into `models` -> 'ai4bharat/indic-bert'
tokenizer = AutoTokenizer.from_pretrained(models[model_num])


In [4]:
# Load the tab-separated training and validation files.
# NOTE(review): the name `train` is re-bound to a function by the later
# `def train(epoch)` cell, so cells that use this DataFrame cannot be re-run
# after that definition without reloading it first.
train = pd.read_csv('data/train.tsv', sep='\t')
# NOTE(review): `test` (read from valid.tsv) is not referenced by any other visible cell.
test = pd.read_csv('data/valid.tsv', sep='\t')

In [5]:
# Preview 20 randomly drawn rows of the raw training frame.
train.sample(20)

Unnamed: 0,Unique ID,Post,Labels Set
3657,3773,मां के बाद पिता को खोया: 10 दिन के अंदर हुआ गौ...,non-hostile
3932,4048,"#PranabMukherjee के निधन पर चीन, नेपाल और बांग...",non-hostile
228,229,"अमरनाथ यात्रा को लेकर सुरक्षा के कड़े इंतजाम, ...",non-hostile
2745,2861,"BJP की ओर से जवाब आया. ""डिसलाइक्स का 98 फी...",non-hostile
4840,5042,दुनियाभर में कोरोना का हाल https://t.co/fPMjt1...,non-hostile
667,668,मिलिए जामिया की महिला protester से इनके मा बा...,offensive
3350,3466,सुशांत केस में एक और ड्रग पेडलर गिरफ्तार #Sush...,non-hostile
1664,1732,28वें ओवर में राशिद ने झटके दो विकेट. पहले कैर...,non-hostile
3120,3236,एक बहुत ही गरीब परिवार में माँ और उसका बेटा रह...,defamation
1348,1416,"परफ़ेक्ट बॉडी ने मुझे लोगों का चहेता बनाया, पर...",non-hostile


In [6]:
def encode_labels(label):
    """Turn a comma-separated label string into a 4-slot multi-hot list.

    The slots are, in order: ``fake``, ``hate``, ``offensive``, ``defamation``.

    Args:
        label: String such as ``"non-hostile"`` or ``"hate,offensive"``.
            Whitespace around each comma-separated entry is ignored, so
            ``"hate, offensive"`` is treated the same as ``"hate,offensive"``.

    Returns:
        A new list of four ints (0 or 1). If the first entry is
        ``"non-hostile"`` the result is all zeros regardless of what follows.
    """
    # Strip each entry so stray spaces after commas do not hide a label.
    tags = [tag.strip() for tag in label.split(',')]
    if tags[0] == 'non-hostile':
        return [0, 0, 0, 0]
    return [int(name in tags) for name in ('fake', 'hate', 'offensive', 'defamation')]

In [7]:
# Add a column holding the multi-hot encoding produced by encode_labels for each row.
train['encodelabels'] = train['Labels Set'].apply(encode_labels)

In [8]:
# Spot-check that the new `encodelabels` column agrees with `Labels Set`.
train.sample(20)

Unnamed: 0,Unique ID,Post,Labels Set,encodelabels
4287,4403,चीनी मीडिया की धमकी: ग्लोबल टाइम्स ने लिखा- भा...,non-hostile,"[0, 0, 0, 0]"
2134,2210,"हमने नए एप मौसम, दामिनी और रेन अलार्म लॉन्च कि...",non-hostile,"[0, 0, 0, 0]"
5513,5715,एनएसए अजीत डोभाल ने यह स्वीकार कर लिया है कि च...,fake,"[1, 0, 0, 0]"
4305,4421,योगी सरकार बड़े-बड़े दावे ज़रूर कर रही है लेकि...,non-hostile,"[0, 0, 0, 0]"
3013,3129,@RaviGup53080986 @narendramodi मोदी जी के पागल...,offensive,"[0, 0, 1, 0]"
4930,5132,पूर्व राष्ट्रपति #PranabMukherjee की याद में र...,non-hostile,"[0, 0, 0, 0]"
3053,3169,कश्मीर में सेना को बड़ी कामयाबी: बारामूला के र...,non-hostile,"[0, 0, 0, 0]"
5197,5399,Realme V3 हो सकता है कंपनी का सबसे सस्ता 5जी स...,non-hostile,"[0, 0, 0, 0]"
713,714,#BREAKING | सुशांत के बिजनेस पार्टनर को समन जा...,non-hostile,"[0, 0, 0, 0]"
945,946,स्वस्थ समाज की कुंजी खान-पान के प्रति जागरूकता...,non-hostile,"[0, 0, 0, 0]"


In [9]:
def hamming_score(y_true, y_pred, normalize=True, sample_weight=None):
    """Mean per-sample Jaccard overlap between true and predicted label sets.

    For each row, the set of non-zero positions in ``y_true`` is compared with
    the set of non-zero positions in ``y_pred``; the row's score is
    |intersection| / |union|, or 1 when both sets are empty.

    Args:
        y_true: 2-D array-like indexed by row (must expose ``.shape[0]``).
        y_pred: Array-like with the same layout as ``y_true``.
        normalize: Accepted for signature compatibility; not used.
        sample_weight: Accepted for signature compatibility; not used.

    Returns:
        The mean of the per-row scores, as computed by ``np.mean``.
    """
    per_sample = []
    for row in range(y_true.shape[0]):
        truth = set(np.where(y_true[row])[0])
        guess = set(np.where(y_pred[row])[0])
        if not truth and not guess:
            # Nothing to predict and nothing predicted counts as a perfect match.
            per_sample.append(1)
            continue
        per_sample.append(len(truth & guess) / float(len(truth | guess)))
    return np.mean(per_sample)

In [10]:
# Training configuration

# Hyperparameters referenced by the dataset, data loaders, optimizer and epoch loop below.
MAX_LEN = 128  # passed to MultiLabelDataset as max_len (tokenizer max_length)
TRAIN_BATCH_SIZE = 32  # batch size of the training DataLoader
VALID_BATCH_SIZE = 32  # batch size of the held-out DataLoader
EPOCHS = 4  # number of times train(epoch) is called
LEARNING_RATE = 1e-05  # learning rate given to the Adam optimizer


In [11]:
class MultiLabelDataset(Dataset):
    """Map-style dataset that tokenizes posts and pairs them with multi-hot targets.

    Args:
        dataframe: Frame exposing a ``Post`` column (text) and an
            ``encodelabels`` column (one list of label flags per row).
        tokenizer: Object providing ``encode_plus`` (e.g. a Hugging Face tokenizer).
        max_len: Length every encoded sequence is truncated / padded to.
    """

    def __init__(self, dataframe, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.text = dataframe.Post
        self.targets = self.data.encodelabels
        self.max_len = max_len

    def __len__(self):
        """Return the number of posts."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tensors for the row at position ``index``.

        Returns:
            Dict with ``ids``, ``mask`` and ``token_type_ids`` (long tensors)
            and ``targets`` (float tensor).
        """
        # Positional access: DataLoader samplers yield 0..len-1, which matches
        # row labels only when the frame's index has been reset. `.iloc` makes
        # the dataset correct for any index.
        text = str(self.text.iloc[index])
        # Collapse runs of whitespace (including newlines) into single spaces.
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            truncation=True,
            add_special_tokens=True,
            max_length=self.max_len,
            # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`.
            padding='max_length',
            return_attention_mask=True,
            return_token_type_ids=True,
        )

        return {
            'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
            'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
            'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
            'targets': torch.tensor(self.targets.iloc[index], dtype=torch.float),
        }

In [12]:
# Split the labelled frame into train / held-out parts and wrap each in a dataset.

train_size = 0.8
# Fixed random_state keeps the sampled rows the same across runs.
sampled_rows = train.sample(frac=train_size, random_state=200)
# Everything that was not sampled becomes the held-out set; both get a fresh 0..n-1 index.
test_data = train.drop(sampled_rows.index).reset_index(drop=True)
train_data = sampled_rows.reset_index(drop=True)


for template, frame in (
    ("FULL Dataset: {}", train),
    ("TRAIN Dataset: {}", train_data),
    ("TEST Dataset: {}", test_data),
):
    print(template.format(frame.shape))

training_set = MultiLabelDataset(train_data, tokenizer, MAX_LEN)
testing_set = MultiLabelDataset(test_data, tokenizer, MAX_LEN)

FULL Dataset: (5527, 4)
TRAIN Dataset: (4422, 4)
TEST Dataset: (1105, 4)


In [13]:
# DataLoader settings; both loaders reshuffle every pass and load in the main process.
train_params = dict(batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

# NOTE(review): shuffling the held-out loader only changes row order; predictions and
# targets are collected together downstream, so aggregate metrics are unaffected.
test_params = dict(batch_size=VALID_BATCH_SIZE, shuffle=True, num_workers=0)

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [14]:
class BERTClass(torch.nn.Module):
    """Pretrained encoder followed by a small head that emits 4 label logits.

    The encoder is loaded from ``models[model_num]``. The head takes the
    hidden state at sequence position 0, applies Linear -> tanh -> Dropout,
    then a final Linear layer producing one raw logit per label.
    """

    def __init__(self):
        super().__init__()
        self.l1 = AutoModel.from_pretrained(models[model_num])
        # Size the head from the loaded encoder instead of hard-coding 768, so
        # switching to a checkpoint with another width does not break the head.
        hidden_size = self.l1.config.hidden_size
        self.pre_classifier = torch.nn.Linear(hidden_size, hidden_size)
        self.dropout = torch.nn.Dropout(0.1)
        self.classifier = torch.nn.Linear(hidden_size, 4)

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Compute label logits for a batch.

        Args:
            input_ids: Token ids passed to the encoder.
            attention_mask: Attention mask passed to the encoder.
            token_type_ids: Accepted for call compatibility; it is not
                forwarded to the encoder.

        Returns:
            Raw (pre-sigmoid) logits from the final linear layer.
        """
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        # Use the representation at the first sequence position as the summary vector.
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.tanh(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

# Build the classifier and move its parameters to the selected device.
model = BERTClass()
# As the cell's last expression, this also displays the module structure.
model.to(device)

BERTClass(
  (l1): AlbertModel(
    (embeddings): AlbertEmbeddings(
      (word_embeddings): Embedding(200000, 128, padding_idx=0)
      (position_embeddings): Embedding(512, 128)
      (token_type_embeddings): Embedding(2, 128)
      (LayerNorm): LayerNorm((128,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0, inplace=False)
    )
    (encoder): AlbertTransformer(
      (embedding_hidden_mapping_in): Linear(in_features=128, out_features=768, bias=True)
      (albert_layer_groups): ModuleList(
        (0): AlbertLayerGroup(
          (albert_layers): ModuleList(
            (0): AlbertLayer(
              (full_layer_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (attention): AlbertAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
             

In [15]:
def loss_fn(outputs, targets):
    """Mean binary cross-entropy between raw logits and 0/1 float targets.

    Args:
        outputs: Raw (pre-sigmoid) logits.
        targets: Float tensor with the same shape as ``outputs``.

    Returns:
        Scalar loss tensor (mean over all elements).
    """
    return torch.nn.functional.binary_cross_entropy_with_logits(outputs, targets)

In [16]:
# Plain Adam over every model parameter (encoder and head) at the configured learning rate.
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [17]:
def _collect_predictions(loader):
    """Run the model over ``loader`` without gradients; return (probabilities, targets) as lists."""
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for data in tqdm(loader):
            ids = data['ids'].to(device, dtype=torch.long)
            mask = data['mask'].to(device, dtype=torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
            targets = data['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.extend(targets.cpu().numpy().tolist())
            fin_outputs.extend(torch.sigmoid(outputs).cpu().numpy().tolist())
    return fin_outputs, fin_targets


def _print_reports(fin_outputs, fin_targets, threshold=0.5):
    """Print the coarse (any label vs none) and per-label classification reports."""
    predicted = (np.array(fin_outputs) >= threshold).astype(int)
    expected = np.array(fin_targets).astype(int)

    # classification_report takes (y_true, y_pred); passing them the other way
    # round swaps the reported precision and recall.
    print("Coarse:")
    print(classification_report(expected.any(axis=1).astype(int),
                                predicted.any(axis=1).astype(int)))
    for label_idx in range(predicted.shape[1]):
        print("Fine", label_idx)
        print(classification_report(expected[:, label_idx], predicted[:, label_idx]))


def train(epoch):
    """Train for one pass over ``training_loader``, then report on ``testing_loader``.

    Uses the notebook-level ``model``, ``optimizer``, ``loss_fn``, ``device`` and
    both data loaders. Prints the loss of the last training batch, followed by a
    coarse report (at least one label vs. none) and one report per label, with
    probabilities thresholded at 0.5.

    NOTE(review): this definition re-binds the name ``train``, which earlier
    cells use for the training DataFrame.

    Args:
        epoch: Epoch number, used only in the printed loss line.
    """
    model.train()
    loss = None
    for data in tqdm(training_loader):
        ids = data['ids'].to(device, dtype=torch.long)
        mask = data['mask'].to(device, dtype=torch.long)
        token_type_ids = data['token_type_ids'].to(device, dtype=torch.long)
        targets = data['targets'].to(device, dtype=torch.float)

        # Clear gradients once per step, before the backward pass.
        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()

    # `loss` stays None only when the training loader produced no batches.
    if loss is not None:
        print(f'Epoch: {epoch}, Loss:  {loss.item()}')

    fin_outputs, fin_targets = _collect_predictions(testing_loader)
    _print_reports(fin_outputs, fin_targets)

In [18]:
# Run EPOCHS training passes; each call also prints held-out classification reports.
for epoch in range(EPOCHS):
    train(epoch)

139it [00:33,  4.10it/s]
2it [00:00, 10.72it/s]

Epoch: 0, Loss:  0.539330780506134


4it [00:00, 10.20it/s]


KeyboardInterrupt: 

In [None]:
def validation(testing_loader):
    """Score every batch of ``testing_loader`` with the notebook-level ``model``.

    The model is switched to eval mode and run without gradient tracking.

    Args:
        testing_loader: Iterable of batches shaped like MultiLabelDataset items
            (keys ``ids``, ``mask``, ``token_type_ids``, ``targets``).

    Returns:
        Tuple ``(probabilities, targets)``: two lists with one entry per
        sample, where probabilities are the sigmoid of the model logits.
    """
    model.eval()
    all_probs, all_targets = [], []
    with torch.no_grad():
        for _step, batch in tqdm(enumerate(testing_loader, 0)):
            long_inputs = {
                key: batch[key].to(device, dtype=torch.long)
                for key in ('ids', 'mask', 'token_type_ids')
            }
            gold = batch['targets'].to(device, dtype=torch.float)
            logits = model(
                long_inputs['ids'],
                long_inputs['mask'],
                long_inputs['token_type_ids'],
            )
            all_targets += gold.cpu().detach().numpy().tolist()
            all_probs += torch.sigmoid(logits).cpu().detach().numpy().tolist()
    return all_probs, all_targets

In [None]:
# Collect sigmoid probabilities and targets for the held-out loader.
outputs, targets = validation(testing_loader)

# Binarize: a label counts as predicted when its probability is at least 0.5.
final_outputs = np.array(outputs) >=0.5

In [None]:
# Hamming loss comes from scikit-learn; hamming score is the per-sample
# intersection-over-union helper defined earlier in this notebook.
val_hamming_loss = metrics.hamming_loss(targets, final_outputs)
val_hamming_score = hamming_score(np.array(targets), np.array(final_outputs))

print(f"Hamming Score = {val_hamming_score}")
print(f"Hamming Loss = {val_hamming_loss}")

In [None]:
# Coarse (at least one label vs. none) and per-label reports for the held-out predictions.
total = len(targets)

pred_matrix = np.asarray(final_outputs).astype(int)
true_matrix = np.asarray(targets).astype(int)

# Coarse flag per sample: 1 when any label is set, else 0.
final = pred_matrix.any(axis=1).astype(int).tolist()
final_t = true_matrix.any(axis=1).astype(int).tolist()

# One list of 0/1 values per label (column-wise view of the matrices).
final_fine = pred_matrix.T.tolist()
final_fine_t = true_matrix.T.tolist()

# classification_report takes (y_true, y_pred); passing predictions first
# would swap the reported precision and recall.
print("Coarse:")
print(classification_report(final_t, final))
for i in range(len(final_fine)):
    print("Fine", i)
    print(classification_report(final_fine_t[i], final_fine[i]))

In [None]:
# NOTE(review): `acc` is not defined in any visible cell, so this expression
# raises NameError on a fresh kernel; `total` is set in the report cell.
acc/total

In [None]:
# NOTE(review): scratch cell; this discards the per-label predictions stored in `final_fine`.
final_fine = [[],[],[],[]]

In [None]:
# Display the current contents of `final_fine`.
final_fine