In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from transformers import BertTokenizer, BertForMaskedLM
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch
from torch.utils.data import TensorDataset, Dataset, DataLoader, SequentialSampler
import os
from torch.nn.utils.rnn import pad_sequence
import jsonlines as js
# Restrict this process to the GPU with index 2. The variable is set before
# torch.cuda is first queried on the next statement.
os.environ["CUDA_VISIBLE_DEVICES"] = "2"
# Use CUDA when a device is visible, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')


In [2]:

def preparedata(infile):
    """Read a tab-separated cloze file into parallel sentence and label lists.

    The first line of the file is treated as a header and skipped. For every
    remaining non-blank line, the first tab-separated field is taken as the
    sentence and the last field as the label.

    Args:
        infile: Path of the tab-separated file to read.

    Returns:
        A ``(sentences, labels)`` tuple of two equal-length lists of strings.
        Every ``[MASK]`` in a sentence is replaced by ``<mask>``; every label
        is stripped and prefixed with a single space.
    """
    sentences = []
    labels = []

    # Read from the `infile` argument (the original read a notebook-level
    # global instead) and close the handle even if parsing fails.
    with open(infile) as f:
        next(f, None)  # skip the header row; tolerate a completely empty file
        for line in f:
            if not line.strip():
                # A blank line would yield an empty sentence without a mask.
                continue
            fields = line.split('\t')
            # "<mask>" is presumably the mask token of the tokenizer used
            # downstream -- confirm if the tokenizer changes.
            sentences.append(fields[0].strip().replace("[MASK]", "<mask>"))
            # The leading space presumably makes the label tokenize as a
            # word-initial piece under a byte-level BPE vocabulary -- confirm.
            labels.append(" " + fields[-1].strip())

    return sentences, labels


def prep_inputs(sents, tokenizer):
    """Tokenize sentences as one padded batch and locate each mask token.

    Args:
        sents: Sequence of sentences, each expected to contain exactly one
            occurrence of the tokenizer's mask token.
        tokenizer: Object exposing ``batch_encode_plus`` and ``mask_token_id``.

    Returns:
        A ``(batch_input_ids, mask_token_indices)`` tuple: the encoding
        returned by ``batch_encode_plus`` and a 1-D integer tensor holding,
        for each sentence, the position of its mask token.

    Raises:
        ValueError: If any encoded sentence does not contain exactly one mask
            token (the original failed later with an opaque conversion error).
    """
    batch_input_ids = tokenizer.batch_encode_plus(sents, add_special_tokens=True, padding=True, return_attention_mask=True, return_tensors='pt')

    # Boolean grid marking where the mask token sits in every encoded row.
    is_mask = batch_input_ids['input_ids'] == tokenizer.mask_token_id

    masks_per_row = is_mask.sum(dim=1)
    bad_rows = torch.nonzero(masks_per_row != 1, as_tuple=True)[0]
    if bad_rows.numel() > 0:
        raise ValueError(
            "Expected exactly one mask token per sentence; "
            f"offending sentence indices (first 10): {bad_rows.tolist()[:10]}"
        )

    # With exactly one True per row, argmax yields that position directly.
    mask_token_indices = is_mask.long().argmax(dim=1)

    return batch_input_ids, mask_token_indices


def get_predictions(model, dataloader):
    """Score a masked language model on single-mask cloze examples.

    Relies on the notebook-level globals ``device`` (where tensors are moved)
    and ``tokenizer`` (used to turn label ids back into token strings).

    Args:
        model: Callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments and returning an object whose ``logits`` attribute is
            indexed as ``[sample, position, vocabulary]``.
        dataloader: Iterable of batches. Each batch is a sequence of four
            tensors: input ids, attention mask, the mask position of every
            sample, and the expected label token id of every sample.

    Returns:
        A tuple ``(acc, all_correct_samples_mask, all_correct_labels)``:
        the fraction of samples whose top prediction at the mask position
        equals the label (``0.0`` if no sample was seen), a list of booleans
        with one entry per evaluated sample, and the list of label tokens for
        the correctly predicted samples, in evaluation order.
    """
    all_correct = 0
    tot_samples = 0
    all_correct_samples_mask = []
    all_correct_labels = []

    with torch.no_grad():
        # Iterate the loader that was passed in; the original looped over the
        # notebook global `eval_dataloader` and ignored this argument.
        for batch in dataloader:
            # Unpack into fresh names instead of overwriting the batch in place.
            input_ids, attention_mask, mask_positions, label_ids = (
                tensor.to(device) for tensor in batch
            )
            logits = model(input_ids=input_ids, attention_mask=attention_mask).logits

            # Pick the vocabulary scores at each sample's mask position. Softmax
            # is monotonic, so taking argmax on the raw logits selects the same
            # token without the extra pass over the vocabulary.
            rows = torch.arange(logits.shape[0], device=logits.device)
            pred_token_ids = logits[rows, mask_positions].argmax(dim=-1)

            corr_mask = torch.eq(pred_token_ids, label_ids)
            all_correct += corr_mask.count_nonzero().item()
            all_correct_samples_mask += corr_mask.tolist()
            all_correct_labels += tokenizer.convert_ids_to_tokens(label_ids[corr_mask].tolist())
            tot_samples += input_ids.shape[0]

    # Guard against an empty loader instead of raising ZeroDivisionError.
    acc = all_correct / tot_samples if tot_samples else 0.0
    print(f"Total samples : {tot_samples}")
    print(f"Correctly predicted : {all_correct}")

    return acc, all_correct_samples_mask, all_correct_labels
                
            


In [3]:

# Test split that is evaluated in this notebook.
inp_file = "/home/zsarwar/NLP/Sorting-Through-The-Noise/data/Varying_key_entity/test.csv"

# Parallel lists of masked sentences and their expected fill-in labels.
sentences, labels = preparedata(inp_file)
labels = np.asarray(labels)

tokenizer = RobertaTokenizer.from_pretrained("roberta-large", cache_dir='/bigstor/zsarwar/models/cache/')

# Each label is encoded without special tokens and only its FIRST token id is
# kept, so a label that splits into several pieces is represented by its
# leading piece alone.
_label_encode_opts = dict(return_attention_mask=False, add_special_tokens=False, return_token_type_ids=False)
labels_tok_indices = torch.tensor(
    [
        tokenizer(label_text, **_label_encode_opts)['input_ids'][0]
        for label_text in labels
    ]
)


In [4]:
# Display the tensor of first-token ids computed for the labels.
labels_tok_indices

tensor([ 7716, 11121,  8084,  ...,   724,   422,  3799])

In [5]:
# Look up which vocabulary token corresponds to id 4825.
tokenizer.convert_ids_to_tokens([4825])

['fl']

In [6]:


# Encode every sentence once and record where its mask token sits.
input_ids, mask_token_indices = prep_inputs(sentences, tokenizer)
# One row per sample: token ids, attention mask, mask position, label token id.
eval_dataset = TensorDataset(input_ids['input_ids'],input_ids['attention_mask'], mask_token_indices, labels_tok_indices)
# NOTE(review): drop_last=True discards the final partial batch, so up to 63
# trailing samples are never scored; later cells slice `sentences` to match.
eval_dataloader = DataLoader(eval_dataset, sampler = SequentialSampler(eval_dataset), batch_size= 64, drop_last=True)
# Use the same absolute cache directory as the tokenizer; the previous value
# lacked the leading slash and therefore resolved relative to the current
# working directory.
model = RobertaForMaskedLM.from_pretrained('roberta-large', cache_dir='/bigstor/zsarwar/models/cache/')
model = model.to(device)


In [11]:

# Run the evaluation. Returns the accuracy, a per-sample list of booleans
# marking correct predictions, and the label tokens of the correct samples.
acc, correctly_classified_mask, correctly_classified_labels = get_predictions(model, eval_dataloader)


Total samples : 16768
Correctly predicted : 8761


In [12]:
# Convert to an ndarray so the boolean correctness mask can index it below.
sentences = np.asarray(sentences)

In [14]:
# Keep only as many sentences as the correctness mask has entries; the mask
# can be shorter because the DataLoader was built with drop_last=True.
sentences_short = sentences[0:len(correctly_classified_mask)]

In [15]:
# Select the sentences whose mask entry is True (correctly predicted).
corr_class_sent = sentences_short[correctly_classified_mask]

In [17]:
# Back to a plain Python list of strings.
corr_class_sent = corr_class_sent.tolist()

In [18]:
# Write one JSON object per correctly classified sentence. "Index" is the
# position within the correctly classified subset, and the label at the same
# position in `correctly_classified_labels` is stored alongside the text.
correct_out_path = "/home/zsarwar/NLP/Sorting-Through-The-Noise/data/Varying_key_entity/Correctly_classified_roberta_large.jsonl"
with js.open(correct_out_path, 'w') as out_file:
    for i, sample in enumerate(corr_class_sent):
        out = {
            "Index": i,
            "Text": sample,
            "Label": correctly_classified_labels[i],
        }
        out_file.write(out)
