In [1]:
print("I am fine")

I am fine


In [2]:
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [3]:
# from transformers import LlamaForCausalLM, LlamaTokenizer
from transformers import AutoTokenizer, AutoModelForCausalLM


# tokenizer = LlamaTokenizer.from_pretrained("/120040051/hf_llama2")
# model = LlamaForCausalLM.from_pretrained("/120040051/hf_llama2").to(device)

tokenizer = AutoTokenizer.from_pretrained('/120040051/vicuna-7b-v1.5')
model = AutoModelForCausalLM.from_pretrained('/120040051/vicuna-7b-v1.5').to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [01:10<00:00, 35.37s/it]


In [4]:
def calculate_perplexity_new(sentence, tokenizer, model, device):
    model.eval()
    # Tokenize the input sentence
    encodings = tokenizer(sentence, return_tensors="pt")

    input_ids = encodings.input_ids.to(device)
    attention_mask = encodings.attention_mask.to(device)
    # Note: No need to clone input_ids for labels as we're using them directly

    # Calculate Negative Log-Likelihood with no_grad context
    with torch.no_grad():
        outputs = model(input_ids, labels=input_ids, attention_mask=attention_mask)
        neg_log_likelihood = outputs.loss

    # Calculate Perplexity directly from NLL
    perplexity = torch.exp(neg_log_likelihood).item()

    return perplexity

In [5]:
def calculate_perplexity(sentence, tokenizer, model, device):
    model.eval()  # Ensure the model is in evaluation mode
    # Tokenize the input sentence
    encodings = tokenizer(sentence, return_tensors="pt")

    # Move encodings to the same device as the model
    input_ids = encodings.input_ids.to(device)
    target_ids = input_ids.clone()
    attention_mask = encodings.attention_mask.to(device)

    # Calculate Negative Log-Likelihood
    with torch.no_grad():
        outputs = model(input_ids, labels=target_ids, attention_mask=attention_mask)
        neg_log_likelihood = outputs.loss * input_ids.size(1)  # Multiply by sequence length

    # Calculate Perplexity and append to list
    ppl = torch.exp(neg_log_likelihood / input_ids.size(1))  # Divide by sequence length
    perplexity = ppl.item()

    return perplexity

### WARNING: High Computing Resources Required

In [None]:
import json
with open("./tmp_compositional_sents.json", 'r') as f:
    j_data = json.load(f)
print("Loading Complete")

sent_list = j_data["data"]
sent_list_ppl = []


i = 0
for sent_dict in sent_list:
    text = sent_dict["text"]
    ppl = calculate_perplexity(text, tokenizer, model, device)
    sent_dict["ppl"] = round(ppl, 2)
    sent_list_ppl.append(sent_dict)
    i += 1
if (i % 1000 == 0):
    print(f"Processed {i} sentences")
print(f"Total: Processed {i} sentences")
j_data["data"] = sent_list_ppl

outfile = "sentence_lookups.json"
print(f"Writing the results to {outfile}, ...")
with open(outfile, 'w') as f:
    json.dump(j_data, f)
print("Writing complete")

In [17]:
sent1 = "A shit is eating airplane in the toilet"
perplexity_values_1 = calculate_perplexity(sent1, tokenizer, model, device)
perplexity_values_2 = calculate_perplexity_new(sent1, tokenizer, model, device)

print(perplexity_values_1)
print(perplexity_values_2)

115.03160095214844
115.03153991699219
