In [1]:
# Mount Google Drive at /content/drive (Colab-only); the paths used by later
# cells all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Install PyTorch into the runtime (no version pin).
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
# Legacy BERT package, pinned to 0.6.1; the main import cell pulls
# file_utils / modeling / optimization from it.
! pip install pytorch_pretrained_bert==0.6.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
# NOTE(review): boto3 is not imported directly anywhere in this notebook;
# presumably it is a runtime dependency of pytorch_pretrained_bert - confirm.
!pip install boto3

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [5]:
# Switch the working directory to the project folder on Drive.
# NOTE(review): the later `from bertviz.bertviz import ...` is presumably
# resolved relative to this folder - confirm.
%cd "/content/drive/MyDrive/riya"

/content/drive/.shortcut-targets-by-id/1TCvtUjcTYKWgalUeZbJk_aFhLp1aLjr6/riya


In [6]:
# Hugging Face transformers (no version pin); source of the RoBERTa classes
# imported further down.
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [7]:
import csv
import logging
import os
import random
import sys

import numpy as np
import torch
from torch.utils.data import (DataLoader, RandomSampler, SequentialSampler,
                              TensorDataset)
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from pytorch_pretrained_bert.file_utils import PYTORCH_PRETRAINED_BERT_CACHE
from pytorch_pretrained_bert.modeling import BertForSequenceClassification, BertConfig, WEIGHTS_NAME, CONFIG_NAME
#from pytorch_pretrained_bert.tokenization import BertTokenizer
from pytorch_pretrained_bert.optimization import BertAdam, warmup_linear

from bertviz.bertviz import attention, visualization
from bertviz.bertviz.pytorch_pretrained_bert import BertModel, BertTokenizer

Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.
Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.


In [8]:
from transformers import (
    BertForSequenceClassification,
    BertTokenizer,
    RobertaModel,
    RobertaForSequenceClassification,
    RobertaTokenizer,
    AdamW)

In [9]:
# Notebook-wide logger.
logger = logging.getLogger(__name__)

# Location of the fine-tuned classifier checkpoint on Drive.
bert_classifier_model_dir = "/content/drive/MyDrive/riya/robertaOutput/roberta.pt"

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
n_gpu = torch.cuda.device_count()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
logger.info("device: {}, n_gpu {}".format(device, n_gpu))

In [10]:
# file paths
# Raw corpus locations: one file per split (train/dev/test/reference) and per
# style label (suffix .0 / .1).
data_dir = "/content/drive/MyDrive/riya/data"
dataset = "yelp"  # amazon / yelp / imagecaption
train_0 = os.path.join(data_dir, f"{dataset}/sentiment.train.0")
train_1 = os.path.join(data_dir, f"{dataset}/sentiment.train.1")
test_0 = os.path.join(data_dir, f"{dataset}/sentiment.test.0")
test_1 = os.path.join(data_dir, f"{dataset}/sentiment.test.1")
dev_0 = os.path.join(data_dir, f"{dataset}/sentiment.dev.0")
dev_1 = os.path.join(data_dir, f"{dataset}/sentiment.dev.1")
reference_0 = os.path.join(data_dir, f"{dataset}/reference.0")
reference_1 = os.path.join(data_dir, f"{dataset}/reference.1")

In [11]:
# file paths
# Destination files for the processed splits; everything is written below
# <data_dir>/<dataset>/processed_files_with_bert_with_best_head/.
data_dir = "/content/drive/MyDrive/riya/data"
dataset = "yelp"  # amazon / yelp / imagecaption
train_0_out = os.path.join(data_dir, f"{dataset}/processed_files_with_bert_with_best_head/sentiment_train_0.txt")
train_1_out = os.path.join(data_dir, f"{dataset}/processed_files_with_bert_with_best_head/sentiment_train_1.txt")
test_0_out = os.path.join(data_dir, f"{dataset}/processed_files_with_bert_with_best_head/sentiment_test_0.txt")
test_1_out = os.path.join(data_dir, f"{dataset}/processed_files_with_bert_with_best_head/sentiment_test_1.txt")
dev_0_out = os.path.join(data_dir, f"{dataset}/processed_files_with_bert_with_best_head/sentiment_dev_0.txt")
dev_1_out = os.path.join(data_dir, f"{dataset}/processed_files_with_bert_with_best_head/sentiment_dev_1.txt")
reference_0_out = os.path.join(data_dir, f"{dataset}/processed_files_with_bert_with_best_head/reference_0.txt")
reference_1_out = os.path.join(data_dir, f"{dataset}/processed_files_with_bert_with_best_head/reference_1.txt")

In [12]:
## Model for performing Classification
# Sequence-classification model: start from the pretrained "roberta-base"
# configuration and overwrite its weights with the checkpoint stored on Drive.
roberta_tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model_dict = torch.load(
    '/content/drive/MyDrive/riya/robertaOutput/roberta.pt',
    map_location=device,
)
model_cls = RobertaForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path='roberta-base',
    state_dict=model_dict,
).to(device)
# Inference mode (disables dropout); as the last expression this also
# displays the module summary.
model_cls.eval()

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
             

In [13]:
## Model to get the attention weights of all the heads
# Load the same checkpoint file as the classifier cell, this time into a bare
# RobertaModel (no classification head) so that attention outputs can be read.
# NOTE(review): `is_decoder=True` / `add_cross_attention=True` are decoder-style
# settings; in Hugging Face models `is_decoder` normally enables causal
# self-attention masking. run_attn_examples reads `.cross_attentions` from this
# model's output - confirm this configuration yields the attention maps intended.
model_dict = torch.load('/content/drive/MyDrive/riya/robertaOutput/roberta.pt', map_location=device)
model = RobertaModel.from_pretrained(pretrained_model_name_or_path='roberta-base', state_dict=model_dict, add_cross_attention=True, is_decoder = True)

# Same pretrained vocabulary as `roberta_tokenizer`; this handle is the one used
# for the common-word id lookup and for decoding in prepare_data.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
model.to(device)
# Inference mode; as the last expression this also displays the module summary.
model.eval()

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0-11): 12 x RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropou

In [14]:
max_seq_len=70 # Word-count threshold checked in run_attn_examples (the tokenizer call itself pads/truncates to 128 ids)
sm = torch.nn.Softmax(dim=-1) ## Softmax over the last dimension (dim=-1), not over the batch axis

In [15]:
# Function words and punctuation that must never be deleted from a sentence.
common_words=['is','are','was','were','has','have','had','a','an','the','this','that','these','those','there','how','i','we',
             'he','she','it','they','them','their','his','him','her','us','our', 'and','in','my','your','you', 'will', 'shall']
# RoBERTa uses byte-level BPE: a token that follows a space is stored with a
# leading "\u0120" marker (rendered as "Ġ"), so "is" at the start of a sentence
# and " is" in the middle of one have different ids. Register both spellings,
# otherwise the protection only ever matches sentence-initial words.
common_words_tokens = tokenizer.convert_tokens_to_ids(
    common_words + ["\u0120" + word for word in common_words]
)
not_to_remove_ids = tokenizer.convert_tokens_to_ids(
    ["<s>", "</s>", ".", "?", "!"] + ["\u0120" + mark for mark in (".", "?", "!")]
)
not_to_remove_ids += common_words_tokens

In [16]:
def read_file(file_path):
    """Return the lines of the text file at ``file_path`` as a list of strings.

    The whole file is read at once and split with ``str.splitlines``, so the
    returned entries carry no line terminators.
    """
    with open(file_path) as handle:
        contents = handle.read()
    return contents.splitlines()

In [17]:
def create_output_file(original_sentences, processed_sentences, output_file, sentiment="<POS>"):
    """Write one training line per (original, processed) sentence pair.

    Each line has the form
    ``<sentiment> <CON_START> <processed> <START> <original> <END>``.
    Pairs are formed with ``zip`` (so the shorter input bounds the output) and
    a pair is skipped when either side is ``None``.

    Args:
        original_sentences: iterable of the untouched sentences.
        processed_sentences: iterable of the corresponding processed sentences.
        output_file: path of the file to (over)write.
        sentiment: tag written at the start of every line.
    """
    with open(output_file, "w") as fp:
        for original, processed in zip(original_sentences, processed_sentences):
            # Identity test: the idiomatic (PEP 8) way to check for None.
            if original is None or processed is None:
                continue
            fp.write(sentiment + " <CON_START> " + processed + " <START> " + original + " <END>\n")

In [18]:
def create_ref_output_file(processed_sentences, output_file, sentiment="<POS>"):
    """Write one prompt line per processed reference sentence.

    Each line has the form ``<sentiment> <CON_START> <processed> <START>``;
    ``None`` entries are skipped. Progress is shown with ``tqdm``.

    Args:
        processed_sentences: iterable of processed sentences (or ``None``).
        output_file: path of the file to (over)write.
        sentiment: tag written at the start of every line.
    """
    with open(output_file, "w") as fp:
        for sen in tqdm(processed_sentences):
            # Identity test: the idiomatic (PEP 8) way to check for None.
            if sen is None:
                continue
            fp.write(sentiment + " <CON_START> " + sen + " <START>\n")

In [19]:
def concate_files(inp_files, out_files):
    """Concatenate the text files in ``inp_files`` into ``out_files``.

    Inputs are appended in the order given; the destination is overwritten.
    """
    with open(out_files, "w") as merged:
        for path in inp_files:
            with open(path) as part:
                # A file object iterates line by line, so this copies every line.
                merged.writelines(part)

In [20]:
def run_attn_examples(input_sentences, layer, head, bs=128, max_length=128):
    """
    Returns Attention weights for selected Layer and Head along with ids and tokens
    of the input_sentence

    Every sentence is encoded with the module-level ``roberta_tokenizer`` (padded
    and truncated to ``max_length`` ids) and pushed through the module-level
    ``model`` in batches of ``bs``. Batches are visited in input order, so entry
    ``k`` of each returned list belongs to ``input_sentences[k]``.

    Args:
        input_sentences: sized sequence of raw sentence strings.
        layer: index into the tuple the model returns as ``cross_attentions``.
        head: index applied right after the per-sample index.
        bs: batch size used for the forward passes.
        max_length: number of token ids each sentence is padded/truncated to.

    Returns:
        ``(attention_weights, ids_to_decode, tokens_to_decode)``, three lists
        with one entry per input sentence:
        ``attention_weights[k]`` is a CPU tensor read as
        ``cross_attentions[layer][0][j][head][0]`` for sample ``j`` of its batch;
        ``ids_to_decode[k]`` is the tokenizer's ``input_ids`` tensor;
        ``tokens_to_decode[k]`` is ``["<s>"] + words + ["</s>"]`` followed by
        ``"<pad>"`` entries, where ``words`` are the whitespace-split words.
    """
    n_sentences = len(input_sentences)
    if n_sentences == 0:
        # torch.cat() rejects an empty list, so answer directly.
        return [], [], []

    ids_to_decode = [None] * n_sentences
    tokens_to_decode = [None] * n_sentences
    attention_weights = [None] * n_sentences

    roberta_input_ids = []
    roberta_attention_masks = []

    ## RoBERTa pre-processing
    for j, sen in enumerate(tqdm(input_sentences)):
        roberta_encoded_dict = roberta_tokenizer.encode_plus(
            sen,                            # Sentence to encode.
            add_special_tokens=True,        # Add '<s>' and '</s>'
            max_length=max_length,          # Pad & truncate all sentences.
            padding="max_length",           # replaces the deprecated pad_to_max_length=True
            truncation=True,                # explicit; this was the implicit default before
            return_attention_mask=True,     # Construct attn. masks.
            return_tensors='pt',            # Return pytorch tensors.
        )
        roberta_input_ids.append(roberta_encoded_dict['input_ids'])
        roberta_attention_masks.append(roberta_encoded_dict['attention_mask'])
        ids_to_decode[j] = roberta_encoded_dict['input_ids']

        # Keep the word list separate from `sen`: the earlier version reassigned
        # `sen` to a list and then called `.split()` on it, which raised
        # AttributeError for long sentences.
        words = sen.split()
        if len(sen.split(' ')) >= max_seq_len - 2:
            words = words[:max_length - 4]
        n_pad = max_length - 2 - len(words)  # a negative count yields no padding
        tokens_to_decode[j] = ["<s>"] + words + ["</s>"] + ["<pad>"] * n_pad

    roberta_input_ids = torch.cat(roberta_input_ids, dim=0)
    roberta_attention_masks = torch.cat(roberta_attention_masks, dim=0)
    roberta_dataset = TensorDataset(roberta_input_ids, roberta_attention_masks)
    roberta_dataloader = DataLoader(
        roberta_dataset,
        # Results are stored below with a running counter, so batches MUST come
        # back in input order. A random sampler would pair each sentence with
        # the scores of some other sentence.
        sampler=SequentialSampler(roberta_dataset),
        batch_size=bs,
    )

    index = 0
    for batch in tqdm(roberta_dataloader):
        torch.cuda.empty_cache()
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        with torch.no_grad():
            attn = model(b_input_ids, b_input_mask, output_attentions=True).cross_attentions
        # NOTE(review): this reads `cross_attentions` (not `attentions`) and keeps
        # element [0] of the selected layer's entry before indexing by sample -
        # confirm that this really is the per-head attention distribution that
        # prepare_data expects.
        # Only the requested layer is ever used, so select it directly instead of
        # concatenating every layer first (same values).
        layer_values = attn[layer][0].detach()
        for j in range(len(b_input_ids)):
            attention_weights[index] = layer_values[j][head][0].to('cpu')
            index += 1

    return attention_weights, ids_to_decode, tokens_to_decode

In [21]:
def prepare_data(aw, ids_to_decode, tokens_to_decode):
    """Delete a few selected token positions from every sentence and decode the rest.

    For sentence ``i`` the function
      1. counts the ids before the first pad id (1), or all ids if there is none;
      2. takes that many top-scoring positions of ``aw[i]`` and sorts them by position;
      3. discards positions whose id is in the module-level ``not_to_remove_ids``;
      4. keeps the first 2, 4 or 6 remaining positions (for fewer than 4,
         fewer than 8, or 8 and more candidates respectively);
      5. decodes every non-pad id that is not at one of those positions with the
         module-level ``tokenizer`` (``skip_special_tokens=True``).

    Args:
        aw: list of 1-D score tensors, one per sentence.
        ids_to_decode: list of id tensors; row ``[0]`` of each entry is used.
        tokens_to_decode: list of token lists, consulted only for "##" markers.

    Returns:
        List of decoded strings, one per entry of ``aw``.
    """
    pad_id = 1
    # Plain ints in a set: O(1) membership instead of tensor-vs-list comparisons.
    protected_ids = set(not_to_remove_ids)

    out_sen = [None] * len(aw)
    for i in trange(len(aw)):
        # Convert once; indexing a tensor element by element in the loops below is slow.
        ids = ids_to_decode[i][0].tolist()
        tokens = tokens_to_decode[i]

        # A sequence that fills the whole window has no pad id; treat its full
        # length as content instead of failing in `.index()`.
        n_tokens = ids.index(pad_id) if pad_id in ids else len(ids)

        # topk cannot return more entries than the score vector holds.
        k = min(n_tokens, aw[i].size(-1))
        _, topi = aw[i].topk(k, largest=True)
        # NOTE(review): sorting by position before truncating means the positions
        # kept below are the earliest eligible ones, not the highest-scoring
        # ones - confirm this is intended.
        topi = sorted(topi.tolist())
        topi = [pos for pos in topi if ids[pos] not in protected_ids]  # keep special tokens and common words

        if 0 < len(topi) < 4:
            topi = topi[:2]
        elif len(topi) < 8:
            topi = topi[:4]
        else:
            topi = topi[:6]
        drop_positions = set(topi)

        kept_ids = []
        count = 0
        while count < n_tokens:
            if count in drop_positions:
                # Also skip directly following positions marked as word
                # continuations. NOTE(review): "##" is the WordPiece marker and
                # `tokens` holds whitespace-split words, so with the RoBERTa
                # tokenizer this presumably never triggers - confirm.
                nxt = count + 1
                while nxt < n_tokens and nxt < len(tokens) and "##" in tokens[nxt]:
                    nxt += 1
                count = nxt - 1
            else:
                kept_ids.append(ids[count])
            count += 1

        out_sen[i] = tokenizer.decode(kept_ids, skip_special_tokens=True)

    return out_sen

In [22]:
# Read every raw split into memory as a list of lines, in the order the paths
# are listed here.
(
    train_0_data,
    train_1_data,
    dev_0_data,
    dev_1_data,
    test_0_data,
    test_1_data,
    ref_0_data,
    ref_1_data,
) = [
    read_file(split_path)
    for split_path in (
        train_0,
        train_1,
        dev_0,
        dev_1,
        test_0,
        test_1,
        reference_0,
        reference_1,
    )
]

In [23]:
# train.0 split: score tokens with layer 8 / head 3, drop the selected tokens,
# then write "<NEG> <CON_START> processed <START> original <END>" lines to train_0_out.
aw, ids_to_decode, tokens_to_decode = run_attn_examples(train_0_data, layer=8, head=3, bs=128)
train_0_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(train_0_data, train_0_out_sen, train_0_out, sentiment="<NEG>")

  0%|          | 0/177218 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 177218/177218 [00:59<00:00, 2966.63it/s]
100%|██████████| 1385/1385 [20:13<00:00,  1.14it/s]
100%|██████████| 177218/177218 [10:11<00:00, 289.86it/s]


In [25]:
# train.1 split: same pipeline, lines tagged "<POS>" and written to train_1_out.
aw, ids_to_decode, tokens_to_decode = run_attn_examples(train_1_data, layer=8, head=3, bs=128)
train_1_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(train_1_data, train_1_out_sen, train_1_out, sentiment="<POS>")

100%|██████████| 266041/266041 [01:23<00:00, 3170.81it/s]
100%|██████████| 2079/2079 [30:32<00:00,  1.13it/s]
100%|██████████| 266041/266041 [13:23<00:00, 331.20it/s]


In [26]:
# dev.0 split: same pipeline, lines tagged "<NEG>" and written to dev_0_out.
aw, ids_to_decode, tokens_to_decode = run_attn_examples(dev_0_data, layer=8, head=3, bs=128)
dev_0_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(dev_0_data, dev_0_out_sen, dev_0_out, sentiment="<NEG>")

100%|██████████| 2000/2000 [00:00<00:00, 3922.83it/s]
100%|██████████| 16/16 [00:11<00:00,  1.33it/s]
100%|██████████| 2000/2000 [00:10<00:00, 193.84it/s]


In [27]:
# dev.1 split: same pipeline, lines tagged "<POS>" and written to dev_1_out.
aw, ids_to_decode, tokens_to_decode = run_attn_examples(dev_1_data, layer=8, head=3, bs=128)
dev_1_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(dev_1_data, dev_1_out_sen, dev_1_out, sentiment="<POS>")

100%|██████████| 2000/2000 [00:00<00:00, 3602.92it/s]
100%|██████████| 16/16 [00:12<00:00,  1.33it/s]
100%|██████████| 2000/2000 [00:06<00:00, 319.13it/s]


In [28]:
# test.1 split: same pipeline, lines tagged "<POS>" and written to test_1_out.
aw, ids_to_decode, tokens_to_decode = run_attn_examples(test_1_data, layer=8, head=3, bs=128)
test_1_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(test_1_data, test_1_out_sen, test_1_out, sentiment="<POS>")

100%|██████████| 500/500 [00:00<00:00, 4034.88it/s]
100%|██████████| 4/4 [00:03<00:00,  1.28it/s]
100%|██████████| 500/500 [00:01<00:00, 340.89it/s]


In [29]:
# test.0 split: same pipeline, lines tagged "<NEG>" and written to test_0_out.
aw, ids_to_decode, tokens_to_decode = run_attn_examples(test_0_data, layer=8, head=3, bs=128)
test_0_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_output_file(test_0_data, test_0_out_sen, test_0_out, sentiment="<NEG>")

100%|██████████| 500/500 [00:00<00:00, 3715.03it/s]
100%|██████████| 4/4 [00:03<00:00,  1.30it/s]
100%|██████████| 500/500 [00:02<00:00, 200.27it/s]


In [30]:
# reference.1: only the processed sentence is written ("<tag> <CON_START> ... <START>").
# The tag here is "<NEG>", the opposite of the one used for the *.1 train/dev/test
# splits - presumably the target style to generate; confirm.
aw, ids_to_decode, tokens_to_decode = run_attn_examples(ref_1_data, layer=8, head=3, bs=128)
ref_1_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_ref_output_file(ref_1_out_sen, reference_1_out, sentiment="<NEG>")

100%|██████████| 500/500 [00:00<00:00, 2559.95it/s]
100%|██████████| 4/4 [00:03<00:00,  1.27it/s]
100%|██████████| 500/500 [00:02<00:00, 186.27it/s]
100%|██████████| 500/500 [00:00<00:00, 805048.75it/s]


In [31]:
# reference.0: only the processed sentence is written ("<tag> <CON_START> ... <START>").
# The tag here is "<POS>", the opposite of the one used for the *.0 train/dev/test
# splits - presumably the target style to generate; confirm.
aw, ids_to_decode, tokens_to_decode = run_attn_examples(ref_0_data, layer=8, head=3, bs=128)
ref_0_out_sen = prepare_data(aw, ids_to_decode, tokens_to_decode)
create_ref_output_file(ref_0_out_sen, reference_0_out, sentiment="<POS>")

100%|██████████| 500/500 [00:00<00:00, 3107.85it/s]
100%|██████████| 4/4 [00:03<00:00,  1.26it/s]
100%|██████████| 500/500 [00:04<00:00, 119.58it/s]
100%|██████████| 500/500 [00:00<00:00, 481993.11it/s]
