In [17]:
import torch
device = ('cuda' if torch.cuda.is_available() else 'cpu')
device='cpu'

In [32]:
import random
import numpy as np
import time
from sklearn.metrics import classification_report,accuracy_score,f1_score

total_t0 = time.time()

seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [18]:
import csv

import pandas as pd
from glob import glob

file_total_original = "data/hate_speech_hindi_english.tsv"
df_total = pd.read_csv(file_total_original,sep="\t",encoding='utf-8',quoting=csv.QUOTE_NONE,usecols=[0,1])
df_total = df_total.dropna()

total_sentences = list(df_total['text'].values)
total_labels = list(df_total['label'].values)

fin = open("sentences.txt","w+",encoding="utf-8")

for line in total_sentences:
    fin.write(line+"\n")

length = []
for line in total_sentences:
    line_list = line.split()
    length.append(len(line_list))

from collections import Counter

#print(Counter(length).most_common())

max_length = 2*Counter(length).most_common()[-1][0]
print(max_length)

134


In [19]:
from sklearn.preprocessing import LabelEncoder
new_total_labels=[]
for item in total_labels:
    if 'n' in item:
        new_total_labels.append('no')
    else:
        new_total_labels.append('yes')

le = LabelEncoder()
le.fit(new_total_labels)
encoded_labels = le.transform(new_total_labels)

In [20]:
df = pd.DataFrame(zip(total_sentences,encoded_labels),columns=['text','label'])

train = df.sample(frac = 0.75)
test_valid = df.drop(train.index)

test = test_valid.sample(frac=0.75)
valid = test_valid.drop(test.index)

print(len(train),len(test),len(valid))

3434 858 286


In [21]:
def encoder_generator(tokenizer,sentences,labels):
    
    sent_index = []
    input_ids = []
    attention_masks =[]

    for index,sent in enumerate(sentences):
        
        sent_index.append(index)
        
        encoded_dict = tokenizer.encode_plus(sent,
                                             add_special_tokens=True,
                                             max_length=max_length,
                                             pad_to_max_length=True,
                                             truncation = True,
                                             return_attention_mask=True,
                                             return_tensors='pt')
        #print(encoded_dict['input_ids'].shape)
        input_ids.append(encoded_dict['input_ids'])

        #print(encoded_dict['attention_mask'].shape)
        attention_masks.append(encoded_dict['attention_mask'])

    input_ids = torch.cat(input_ids,dim=0)
    print(input_ids.shape)
    attention_masks = torch.cat(attention_masks,dim=0)
    print(attention_masks.shape)
    labels = torch.tensor(labels)
    print(labels.shape)
    sent_index = torch.tensor(sent_index)

    return sent_index,input_ids,attention_masks,labels


In [22]:
from transformers import RobertaTokenizerFast
tokenizer = RobertaTokenizerFast.from_pretrained("./CM_bert", max_len=max_length)

train_sent_index,train_input_ids,train_attention_masks,train_encoded_label_tensors = encoder_generator(tokenizer,train['text'].values,train['label'].values)
test_sent_index,test_input_ids,test_attention_masks,test_encoded_label_tensors = encoder_generator(tokenizer,test['text'].values,test['label'].values)
valid_sent_index,valid_input_ids,valid_attention_masks,valid_encoded_label_tensors = encoder_generator(tokenizer,valid['text'].values,valid['label'].values)

print('Original: ', total_sentences[0])
print('Token IDs:', train_input_ids[0])
#print(encoded_label_tensors)
#print(encoded_test_label_tensors)



torch.Size([3434, 134])
torch.Size([3434, 134])
torch.Size([3434])
torch.Size([858, 134])
torch.Size([858, 134])
torch.Size([858])
torch.Size([286, 134])
torch.Size([286, 134])
torch.Size([286])
Original:  Knowing ki Vikas kitna samjhata hai Priyanka aur Itch Guard Luv ko, usne bola tha Ben wali baat me ab Sallu ne bhi agree kiya!
Token IDs: tensor([    0,  1252,   513,   289,   329,   291,   453,   448,   513,   292,
        10894,   292,   715,   587,   276,  1217,  1816,  5753,   293,   324,
          565,   374,  1058,   276,  6560,   289,  2200,   521,   276,     2,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,    

In [23]:
from transformers import BertTokenizer
bert_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

pt_train_sent_index,pt_train_input_ids,pt_train_attention_masks,pt_train_encoded_label_tensors = encoder_generator(bert_tokenizer,train['text'].values,train['label'].values)
pt_test_sent_index,pt_test_input_ids,pt_test_attention_masks,pt_test_encoded_label_tensors = encoder_generator(bert_tokenizer,test['text'].values,test['label'].values)
pt_valid_sent_index,pt_valid_input_ids,pt_valid_attention_masks,pt_valid_encoded_label_tensors = encoder_generator(bert_tokenizer,valid['text'].values,valid['label'].values)

print('Original: ', total_sentences[0])
print('Token IDs:', pt_train_input_ids[0])

torch.Size([3434, 134])
torch.Size([3434, 134])
torch.Size([3434])
torch.Size([858, 134])
torch.Size([858, 134])
torch.Size([858])
torch.Size([286, 134])
torch.Size([286, 134])
torch.Size([286])
Original:  Knowing ki Vikas kitna samjhata hai Priyanka aur Itch Guard Luv ko, usne bola tha Ben wali baat me ab Sallu ne bhi agree kiya!
Token IDs: tensor([  101, 12541, 10139, 10237, 10730, 11520, 82523, 10879, 10679, 12796,
        10139, 10237, 10879, 28335, 59404, 10879, 11519, 82612, 10113, 13080,
        10730, 11107, 10124, 10440, 29694, 10237, 25085, 10104, 10120, 10679,
        11023, 41193, 10113, 13080, 97715, 10730, 29694, 10921, 26506, 10113,
        13080,   102,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,    

In [24]:
from torch.utils.data import TensorDataset,random_split

train_dataset = TensorDataset(train_input_ids,train_attention_masks,train_encoded_label_tensors)
test_dataset = TensorDataset(test_input_ids,test_attention_masks,test_encoded_label_tensors)
valid_dataset = TensorDataset(valid_input_ids,valid_attention_masks,valid_encoded_label_tensors)

pt_train_dataset = TensorDataset(pt_train_input_ids,pt_train_attention_masks,pt_train_encoded_label_tensors)
pt_test_dataset = TensorDataset(pt_test_input_ids,pt_test_attention_masks,pt_test_encoded_label_tensors)
pt_valid_dataset = TensorDataset(pt_valid_input_ids,pt_valid_attention_masks,pt_valid_encoded_label_tensors)

In [25]:
from torch.utils.data import DataLoader,SequentialSampler

bs=32

train_data_loader = DataLoader(train_dataset,
                              sampler=SequentialSampler(train_dataset),
                              batch_size=bs)
valid_data_loader = DataLoader(valid_dataset,
                              sampler=SequentialSampler(valid_dataset),
                              batch_size=bs)
test_data_loader = DataLoader(test_dataset,
                            sampler=SequentialSampler(test_dataset),
                            batch_size=bs)

pt_train_data_loader = DataLoader(pt_train_dataset,
                              sampler=SequentialSampler(pt_train_dataset),
                              batch_size=bs)
pt_valid_data_loader = DataLoader(pt_valid_dataset,
                              sampler=SequentialSampler(pt_valid_dataset),
                              batch_size=bs)
pt_test_data_loader = DataLoader(pt_test_dataset,
                            sampler=SequentialSampler(pt_test_dataset),
                            batch_size=bs)

In [26]:
from transformers import RobertaForSequenceClassification, AdamW

cm_model = RobertaForSequenceClassification.from_pretrained('./CM_bert',
                                                     num_labels=len(le.classes_),
                                                     output_attentions=False,
                                                     output_hidden_states=False)
#cm_model.cuda()


Some weights of the model checkpoint at ./CM_bert were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight', 'lm_head.decoder.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at ./CM_bert and are newly initialized: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.weight

In [27]:
from transformers import BertForSequenceClassification

pt_bert_model = BertForSequenceClassification.from_pretrained('bert-base-multilingual-uncased',
                                                     num_labels=len(le.classes_),
                                                     output_attentions=False,
                                                     output_hidden_states=False)
#pt_bert_model.cuda()


Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/672M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-multilingual-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model 

In [31]:
model_parameters = list(cm_model.parameters()) + list(pt_bert_model.parameters())

optimizer = AdamW(model_parameters,lr=2e-5,eps=1e-8)

from transformers import get_linear_schedule_with_warmup

epochs=10
total_steps = len(train_data_loader) * epochs

scheduler = get_linear_schedule_with_warmup(optimizer,
                                           num_warmup_steps=0,
                                           num_training_steps=total_steps)

In [33]:
def categorical_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """
    max_preds = preds.argmax(dim = 1, keepdim = True) # get the index of the max probability
    correct = max_preds.squeeze(1).eq(y)
    return correct.sum() / torch.FloatTensor([y.shape[0]]).to(device)

def predictions_labels(preds,labels):
    pred = np.argmax(preds,axis=1).flatten()
    label = labels.flatten()
    return pred,label

In [34]:
from torch import nn

fc_linear = nn.Linear(2*len(le.classes_),len(le.classes_))

In [50]:
from tqdm import tqdm

alpha = 0.5

def train(cm_model,cm_data_loader,pt_model,pt_data_loader):
    total_train_loss = 0
    total_train_acc = 0
    
    cm_model.train() # set model in train model for batchnorm and dropout layers in bert model
    pt_model.train()
    
    #print(cm_model)
    #print(pt_model)
    
    for cm_batch,pt_batch in zip(cm_data_loader,pt_data_loader):
        b_cm_input_ids = cm_batch[0].to(device)
        b_cm_input_mask = cm_batch[1].to(device)
        b_cm_labels = cm_batch[2].to(device)

        b_pt_input_ids = pt_batch[0].to(device)
        print(b_pt_input_ids.shape)
        b_pt_input_mask = pt_batch[1].to(device)
        b_pt_labels = pt_batch[2].to(device)
        print(b_pt_labels.shape)
        
        cm_model.zero_grad()
        pt_model.zero_grad()
            
        cm_output = cm_model(b_cm_input_ids,
                            attention_mask=b_cm_input_mask,
                            labels=b_cm_labels.long())
        
        cm_loss = cm_output[0]
        cm_logits = cm_output[1]

        print(cm_loss.item())


        pt_output = pt_model(b_pt_input_ids,
                            attention_mask=b_pt_input_mask,
                            labels=b_pt_labels.long())
        
        pt_loss = pt_output[0]
        pt_logits = pt_output[1]

        print(pt_loss.item())
            
        loss=(1-alpha)*cm_loss.item()+alpha*pt_loss.item()

        total_train_loss+=loss

        logits = fc_linear(cm_logits,pt_logits)

        total_train_acc+=categorical_accuracy(logits,b_pt_labels).item()
            
        loss.backward()
            
        torch.nn.utils.clip_grad_norm_(cm_model.parameters(),1.0)
        torch.nn.utils.clip_grad_norm_(pt_model.parameters(),1.0)
            
        optimizer.step()
            
        scheduler.step() #go ahead and update the learning rate
            
    avg_train_loss = total_train_loss/len(data_loader)
    avg_train_acc = total_train_acc/len(data_loader)
    
    return avg_train_loss,avg_train_acc

In [51]:
train(cm_model,train_data_loader,pt_bert_model,pt_train_data_loader)

torch.Size([32, 134])
torch.Size([32])
0.7474662065505981


IndexError: index out of range in self