In [17]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, AutoTokenizer
import torch
from tqdm import tqdm
model_name = 't5-3b'
cache_dir = '/work/09127/tomyoung/ls6/inconsistencies_project/t5-3b-cache'
tokenizer = T5Tokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, cache_dir=cache_dir).cuda()
    

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-3b automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [12]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
loss_fn = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id) #reduction='sum'
# loss = loss_fn(logits.view(-1, logits.shape[-1]), labels.view(-1))
loss_fn_sum = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id, reduction='sum') #reduction='sum'

In [13]:
import pickle
with open('/work/09127/tomyoung/ls6/data/pkls/dict_url_to_options_5_grams.pkl','rb') as f:
    dict_url_to_options_5_grams = pickle.load(f)
# get a list of punctuations
import string
punctuations = string.punctuation

In [21]:
dict_url_to_options_5_grams_keys = list(dict_url_to_options_5_grams.keys())
url_to_t5_5_gram_probs_dict = {}
# make sure it ends with a punctuation
for i in tqdm(range(2500)):
    key = dict_url_to_options_5_grams_keys[i]
    # print('key:', key)
    # print(dict_url_to_options_5_grams[key])
    # alternative: make sure it ends with a punctuation
    if dict_url_to_options_5_grams[key]['alternative'][-1] not in punctuations:
        continue
    # for 10-grams
    # get the input string 
    original_sentence_words = dict_url_to_options_5_grams[key]['original_sentence'].split(' ')
    # len should >= 15
    if len(original_sentence_words) < 15:
        continue
    input_for_10_grams = ' '.join(original_sentence_words[:-10]) + ' <extra_id_0>'
    input_for_10_grams_ids = tokenizer(input_for_10_grams, return_tensors="pt").input_ids.to("cuda")
    # original 10-gram
    original_10_gram = "<extra_id_0> " + ' '.join(original_sentence_words[-10:]) + ' <extra_id_1>'
    labels_original_10_gram = tokenizer(original_10_gram, return_tensors="pt").input_ids.to("cuda")
    labels_original_10_gram = labels_original_10_gram[:, :-1].contiguous() # remove the last token '</s>'
    outputs = model(input_for_10_grams_ids, labels=labels_original_10_gram)
    log_p_original_10_gram = -loss_fn_sum(outputs.logits[0][1:-1], labels_original_10_gram[0][1:-1]) # lose the <extra_id_0> and <extra_id_1>
    
    # probs = torch.nn.functional.softmax(outputs.logits, dim=-1)
    # for i in range(len(labels_original_10_gram[0])):
    #     print('label:', labels_original_10_gram[0][i], 'prob:', probs[0][i][labels_original_10_gram[0][i]])

    # continue
    # proposed 10-gram
    proposed_10_gram = "<extra_id_0> " + \
     ' '.join(original_sentence_words[-10:-5]) + ' ' + \
    dict_url_to_options_5_grams[key]['alternative'] + ' <extra_id_1>'
    labels_proposed_10_gram = tokenizer(proposed_10_gram, return_tensors="pt").input_ids.to("cuda")
    labels_proposed_10_gram = labels_proposed_10_gram[:, :-1].contiguous() # remove the last token '</s>'
    outputs = model(input_for_10_grams_ids, labels=labels_proposed_10_gram)
    log_p_proposed_10_gram = -loss_fn_sum(outputs.logits[0][1:-1], labels_proposed_10_gram[0][1:-1]) # lose the <extra_id_0> and <extra_id_1>
    # for 5-grams
    # get the input string
    input_for_5_grams = ' '.join(original_sentence_words[:-5]) + ' <extra_id_0>'
    input_for_5_grams_ids = tokenizer(input_for_5_grams, return_tensors="pt").input_ids.to("cuda")
    # original 5-gram
    original_5_gram = "<extra_id_0> " + ' '.join(original_sentence_words[-5:]) + ' <extra_id_1>'
    labels_original_5_gram = tokenizer(original_5_gram, return_tensors="pt").input_ids.to("cuda")
    labels_original_5_gram = labels_original_5_gram[:, :-1].contiguous() # remove the last token '</s>'
    outputs = model(input_for_5_grams_ids, labels=labels_original_5_gram)
    log_p_original_5_gram = -loss_fn_sum(outputs.logits[0][1:-1], labels_original_5_gram[0][1:-1]) # lose the <extra_id_0> and <extra_id_1>
    # proposed 5-gram
    proposed_5_gram = "<extra_id_0> " + dict_url_to_options_5_grams[key]['alternative'] + ' <extra_id_1>'
    labels_proposed_5_gram = tokenizer(proposed_5_gram, return_tensors="pt").input_ids.to("cuda")
    labels_proposed_5_gram = labels_proposed_5_gram[:, :-1].contiguous() # remove the last token '</s>'
    outputs = model(input_for_5_grams_ids, labels=labels_proposed_5_gram)
    log_p_proposed_5_gram = -loss_fn_sum(outputs.logits[0][1:-1], labels_proposed_5_gram[0][1:-1]) # lose the <extra_id_0> and <extra_id_1>
    # add them to the dictionary
    url_to_t5_5_gram_probs_dict[key] = {'proposed 5_gram': math.exp(log_p_proposed_5_gram.to(torch.float32).detach().cpu().numpy()),
                                         'original 5_gram': math.exp(log_p_original_5_gram.to(torch.float32).detach().cpu().numpy()),
                                         'proposed 10_gram': math.exp(log_p_proposed_10_gram.to(torch.float32).detach().cpu().numpy()),
                                         'original 10_gram': math.exp(log_p_original_10_gram.to(torch.float32).detach().cpu().numpy())}                                        
        
# save url_to_ul2_5_gram_probs_dict as a pkl
with open('/work/09127/tomyoung/ls6/data/pkls/url_to_' + model_name + '_5_gram_probs_dict.pkl', 'wb') as f:
    pickle.dump(url_to_t5_5_gram_probs_dict, f)

100%|██████████| 2500/2500 [06:39<00:00,  6.26it/s]


In [20]:
input_for_5_grams_ids

tensor([[   94,    19,     3,     9,  1709,   179,   533,   962,     5,    94,
            19,    29,    31,    17,   424,    24,    62,    31,    60,  2794,
            21,     3,     9, 10620,    21,     5, 32099,     1]],
       device='cuda:0')