In [1]:
import json
import random
import os
import math
import sys
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
from tqdm import tqdm
from typing import Any, List, Optional
import nnsight
from nnsight import CONFIG, LanguageModel
import numpy as np
from collections import defaultdict
from einops import einsum
import time
from einops import rearrange, reduce
import pandas as pd

sys.path.append("../")
from src.dataset import SampleV3, DatasetV3, STORY_TEMPLATES
from src.utils import env_utils
from utils import *

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
random.seed(10)

CONFIG.set_default_api_key("d9e00ab7d4f74643b3176de0913f24a7")
os.environ["HF_TOKEN"] = "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"

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
CONFIG.APP.REMOTE_LOGGING = False

%load_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm
env.yml not found in /disk/u/nikhil/mind!
Setting MODEL_ROOT="". Models will now be downloaded to conda env cache, if not already there
Other defaults are set to:
    DATA_DIR = "data"
    RESULTS_DIR = "results"
    HPARAMS_DIR = "hparams"


# Loading Raw Data

In [2]:
all_states = {}
all_containers= {}
all_characters = json.load(open(os.path.join(env_utils.DEFAULT_DATA_DIR, "synthetic_entities", "characters.json"), "r"))

for TYPE, DCT in {"states": all_states, "containers": all_containers}.items():
    ROOT = os.path.join(
        env_utils.DEFAULT_DATA_DIR, "synthetic_entities", TYPE
    )
    for file in os.listdir(ROOT):
        file_path = os.path.join(ROOT, file)
        with open(file_path, "r") as f:
            names = json.load(f)
        DCT[file.split(".")[0]] = names

# Loading model

In [3]:
# model = LanguageModel("meta-llama/Meta-Llama-3.1-405B")
model = LanguageModel("meta-llama/Meta-Llama-3-70B-Instruct", cache_dir="/disk/u/nikhil/.cache/huggingface/hub/", device_map="auto", load_in_4bit=True, torch_dtype=torch.float16, dispatch=True)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Loading checkpoint shards: 100%|██████████| 30/30 [00:49<00:00,  1.64s/it]


In [4]:
model.eval()
for param in model.parameters():
    param.requires_grad_(False)

# Loading Helper Functions

In [9]:
def get_ques_start_token_idx(tokenizer, prompt):
    if isinstance(prompt, list):
        
    input_tokens = tokenizer.encode(prompt, return_tensors="pt").squeeze()
    corrolary_token = tokenizer.encode(":", return_tensors="pt").squeeze()[-1].item()
    ques_start_idx = (input_tokens == corrolary_token).nonzero()[2].item()

    return ques_start_idx-1

In [10]:
def get_prompt_token_len(tokenizer, prompt):
    input_tokens = tokenizer.encode(prompt, return_tensors="pt").squeeze()
    return len(input_tokens)

In [11]:
def check_pred(pred, target, verbose=False):
    prompt = f"Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' \n\nGround truth: {target}\nPrediction: {pred}\nAnswer:"
    
    if verbose:
        print(prompt)

    with torch.no_grad():
        with model.generate(prompt, max_new_tokens=5, do_sample=False, num_return_sequences=1, pad_token_id=model.tokenizer.pad_token_id):
            out = model.generator.output.save()

    prompt_len = get_prompt_token_len(model.tokenizer, prompt)

    return model.tokenizer.decode(out[0][prompt_len:-1]).strip()

# Loading BigToM dataset

In [120]:
# Read a csv file
df_false = pd.read_csv("../data/bigtom/0_forward_belief_false_belief/stories.csv", delimiter=";")
df_true = pd.read_csv("../data/bigtom/0_forward_belief_true_belief/stories.csv", delimiter=";")

In [135]:
# For each row in the dataframe extract story, answer, and distractor
true_stories, false_stories = [], []
for i in range(len(df_true)):
    story = df_true.iloc[i]['story']
    question = df_true.iloc[i]['question']
    answer = df_true.iloc[i]['answer']
    distractor = df_true.iloc[i]['distractor']
    true_stories.append({"story": story, "question": question, "answer": answer, "distractor": distractor})

for i in range(len(df_false)):
    story = df_false.iloc[i]['story']
    question = df_true.iloc[i]['question']
    answer = df_false.iloc[i]['answer']
    distractor = df_false.iloc[i]['distractor']
    false_stories.append({"story": story, "question": question, "answer": answer, "distractor": distractor})

dataset = []
instruction = "1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any belief about the container or its content which they cannot observe directly. 4. To answer the question, predict only the final state of the queried container in fewest tokens possible, strictly based on the belief of the character, mentioned in the question. 5. Do not predict the entire sentence with character or container as the final output."

for i in range(min(len(true_stories), len(false_stories))):
    question = true_stories[i]['question']
    visible_prompt = f"Instructions: {instruction}\n\nStory: {true_stories[i]['story']}\nQuestion: {question}\nAnswer:"

    question = false_stories[i]['question']
    invisible_prompt = f"Instructions: {instruction}\n\nStory: {false_stories[i]['story']}\nQuestion: {question}\nAnswer:"

    visible_ans = true_stories[i]['answer'].split()
    invisible_ans = false_stories[i]['answer'].split()

    # Find the index of first word which is different in both answers
    diff_idx = 0
    for idx, (v, j) in enumerate(zip(visible_ans, invisible_ans)):
        if v != j:
            diff_idx = idx
            break
    
    visible_ans = " ".join(visible_ans[diff_idx:])[:-1]
    invisible_ans = " ".join(invisible_ans[diff_idx:])[:-1]

    visible_ans_one_word = " ".join(visible_ans[diff_idx:diff_idx+1])
    invisible_ans_one_word = " ".join(invisible_ans[diff_idx:diff_idx+1])

    visible_ans_one_word = visible_ans[:-1] if visible_ans[-1] == "." else visible_ans
    invisible_ans_one_word = invisible_ans[:-1] if invisible_ans[-1] == "." else invisible_ans

    dataset.append({
        "alt_story": true_stories[i]['story'],
        "alt_question": true_stories[i]['question'],
        "alt_prompt": visible_prompt,
        "alt_ans": visible_ans,
        "org_story": false_stories[i]['story'],
        "org_question": false_stories[i]['question'],
        "org_prompt": invisible_prompt,
        "org_ans": invisible_ans,
        "target": visible_ans,
    })

dataloader = DataLoader(dataset, batch_size=1, shuffle=False)

In [136]:
idx = 0
print(dataset[idx]['alt_prompt'], dataset[idx]['alt_ans'])
print(dataset[idx]['org_prompt'], dataset[idx]['org_ans'])
print(f"Target: {dataset[idx]['target']}")

Instructions: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any belief about the container or its content which they cannot observe directly. 4. To answer the question, predict only the final state of the queried container in fewest tokens possible, strictly based on the belief of the character, mentioned in the question. 5. Do not predict the entire sentence with character or container as the final output.

Story: Noor is working as a barista at a busy coffee shop. Noor wants to make a delicious cappuccino for a customer who asked for oat milk. Noor grabs a milk pitcher and fills it with oat milk. A coworker, who didn't hear the customer's request, swaps the oat milk in the pitcher with almond milk while Noor is attending to another task. Noor sees her coworker swapping the milk.
Question: Does Noor believe the milk pitch

In [137]:
train_size = 80
valid_size = 20
batch_size = 1

train_dataset = dataset[:train_size]
valid_dataset = dataset[train_size:train_size+valid_size]

train_dataloader = DataLoader(train_dataset, batch_size=4, shuffle=False)
valid_dataloader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False)

# Loading Custom Data

In [11]:
train_size = 40
valid_size = 20
batch_size = 4

train_dataset = get_visibility_align_exps(STORY_TEMPLATES,
                             all_characters,
                             all_containers,
                             all_states,
                             train_size,
                             question_type="belief_question",
                             diff_visibility=True)
valid_dataset = get_visibility_align_exps(STORY_TEMPLATES,
                             all_characters,
                             all_containers,
                             all_states,
                             valid_size,
                             question_type="belief_question",
                             diff_visibility=True)

train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
valid_dataloader = DataLoader(valid_dataset, batch_size=valid_size, shuffle=False)

In [12]:
idx = 0
print(train_dataset[idx]['corrupt_prompt'], train_dataset[idx]['corrupt_ans'])
print(train_dataset[idx]['clean_prompt'], train_dataset[idx]['clean_ans'])
print(train_dataset[idx]['target'])

Instruction: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any beliefs about the container and its contents which they cannot observe. 4. To answer the question, predict only what is inside the queried container, strictly based on the belief of the character, mentioned in the question. 5. If the queried character has no belief about the container in question, then predict 'unknown'. 6. Do not predict container or character as the final output.

Story: Max and Karen are working in a busy restaurant. To complete an order, Max grabs an opaque tun and fills it with coffee. Then Karen grabs another opaque dispenser and fills it with cocoa. They are working side by side and can clearly observe each other's actions.
Question: What does Karen believe the tun contains?
Answer: coffee
Instruction: 1. Track the belief of each charact

# Loading Singular Vectors

In [12]:
sing_vecs = defaultdict(dict)
for l in range(41):
    sing_vecs[l] = torch.load(f"../svd_results/bigtom/singular_vecs/{l}.pt").cpu()
    # sing_vecs[l] = torch.load(f"../svd_results/toy/singular_vecs/{l}.pt").cpu()

# DCM

In [21]:
patch_layers = [i for i in range(0, 30, 10)] + [i for i in range(22, 30, 2)]

In [155]:
alt_prompt

["Instructions: 1. Track the belief of each character as described in the story. 2. A character's belief is formed only when they perform an action themselves or can observe the action taking place. 3. A character does not have any belief about the container or its content which they cannot observe directly. 4. To answer the question, predict only the final state of the queried container in fewest tokens possible, strictly based on the belief of the character, mentioned in the question. 5. Do not predict the entire sentence with character or container as the final output.\n\nStory: Noor is working as a barista at a busy coffee shop. Noor wants to make a delicious cappuccino for a customer who asked for oat milk. Noor grabs a milk pitcher and fills it with oat milk. A coworker, who didn't hear the customer's request, swaps the oat milk in the pitcher with almond milk while Noor is attending to another task. Noor sees her coworker swapping the milk.\nQuestion: Does Noor believe the milk 

In [141]:
valid_accs, rank = {}, {}

for layer_idx in range(32, 34, 2):
    modules = [i for i in range(sing_vecs[0].shape[0])]
    mask = torch.ones(len(modules), requires_grad=True, device="cuda", dtype=torch.bfloat16)
    optimizer = torch.optim.Adam([mask], lr=1e-1)
    n_epochs = 3
    lamb = 0.05

    print(f"Training layer: {layer_idx}")
    for epoch in range(n_epochs):
        epoch_loss = 0
        
        for bi, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader)):
            alt_prompt = batch["alt_prompt"]
            org_prompt = batch["org_prompt"]
            target = batch["target"]
            target_token = model.tokenizer(target, return_tensors="pt", padding=True, padding_side="right").input_ids[:, 1:]
            # print(f"Target: {model.tokenizer.decode(target_token[0])}")
            batch_size = target_token.shape[0]
            # print(f"Batch size: {batch_size}")

            alt_ques_idx = get_ques_start_token_idx(model.tokenizer, alt_prompt)
            alt_prompt_len = get_prompt_token_len(model.tokenizer, alt_prompt)
            org_ques_idx = get_ques_start_token_idx(model.tokenizer, org_prompt)
            org_prompt_len = get_prompt_token_len(model.tokenizer, org_prompt)

            optimizer.zero_grad()
            
            with model.trace() as tracer:
                # Cache alternative activations
                alt_acts = defaultdict(dict)
                with tracer.invoke(alt_prompt):
                    for t_idx, t in enumerate(range(alt_ques_idx, alt_prompt_len)):
                        alt_acts[t_idx] = model.model.layers[layer_idx].output[0][:, t].clone()
                
                # Process original prompt with modifications
                with tracer.invoke(org_prompt):
                    sing_vec = sing_vecs[layer_idx].cuda()
                    # Apply mask and ensure gradients flow
                    masked_vec = sing_vec * mask.unsqueeze(-1)
                    proj_matrix = torch.matmul(masked_vec.t(), masked_vec).half()
                    
                    for t in enumerate(range(org_ques_idx, org_prompt_len)):
                        curr_output = model.model.layers[layer_idx].output[0][:, t].clone()
                        
                        # Compute projections while maintaining gradients
                        alt_proj = torch.matmul(alt_acts[t_idx], proj_matrix)
                        org_proj = torch.matmul(curr_output, proj_matrix)

                        modified_out = curr_output - org_proj + alt_proj
                        model.model.layers[layer_idx].output[0][:, t] = modified_out
                    
                    del sing_vec, proj_matrix, masked_vec
                    torch.cuda.empty_cache()
                    
                    logits = model.lm_head.output[:, -1].save()
            
            # Compute loss with L1 regularization
            target_logit = logits[torch.arange(batch_size), target_token]
            print(f"Target logit: {target_logit}")

            task_loss = -torch.mean(target_logit)
            l1_loss = lamb * torch.norm(mask, p=1)
            loss = task_loss + l1_loss.to(task_loss.device)
            
            epoch_loss += loss.item()
            
            if bi % (len(train_dataloader)//10) == 0:
                mean_loss = epoch_loss / (bi + 1)
                print(f"Epoch: {epoch}, Batch: {bi}, Task Loss: {task_loss.item():.4f}, "
                    f"L1 Loss: {l1_loss.item():.4f}, Total Loss: {mean_loss:.4f}")
                with torch.no_grad():
                    mask.data.clamp_(0, 1)
                    rounded = torch.round(mask)
                    print(f"#Causal SVs: {(rounded == 1).sum().item()}")
            
            loss.backward()
            optimizer.step()
            
            # Clamp after optimizer step
            with torch.no_grad():
                mask.data.clamp_(0, 1)

    print(f"Training complete for {layer_idx}!")

    print(f"Validation started for {layer_idx}")
    correct, total = 0, 0
    
    with torch.inference_mode():
        mask_data = mask.data.clone()
        mask_data.clamp_(0, 1)
        rounded = torch.round(mask_data)
        print(f"#Causal SVs: {(rounded == 1).sum().item()}")
        rank[layer_idx] = (rounded == 1).sum().item()
        # Save rounded on disk
        torch.save(rounded, f"../masks/bigtom/{layer_idx}.pt")

        for bi, batch in tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
            alt_prompt = batch["alt_prompt"][0]
            org_prompt = batch["org_prompt"][0]
            alt_ans = batch["alt_ans"][0]
            batch_size = 1

            alt_ques_idx = get_ques_start_token_idx(model.tokenizer, alt_prompt)
            alt_prompt_len = get_prompt_token_len(model.tokenizer, alt_prompt)
            org_ques_idx = get_ques_start_token_idx(model.tokenizer, org_prompt)
            org_prompt_len = get_prompt_token_len(model.tokenizer, org_prompt)

            with model.session() as session:
                # Cache alternative activations
                alt_acts = defaultdict(dict)
                with model.trace(alt_prompt):
                    for t_idx, t in enumerate(range(alt_ques_idx, alt_prompt_len)):
                        alt_acts[t_idx] = model.model.layers[layer_idx].output[0][:, t].save()

                # Process original prompt with modifications
                with model.generate(org_prompt, max_new_tokens=8, do_sample=False, num_return_sequences=1, pad_token_id=model.tokenizer.pad_token_id, eos_token_id=model.tokenizer.eos_token_id):
                    sing_vec = sing_vecs[layer_idx].cuda()
                    masked_vec = sing_vec.to(rounded.device) * rounded.unsqueeze(-1)
                    proj_matrix = torch.matmul(masked_vec.t(), masked_vec).half()

                    # sing_vec = sing_vecs[layer_idx][:100, :].t().half()
                    # proj_matrix = torch.matmul(sing_vec, sing_vec.t())

                    for t_idx, t in enumerate(range(org_ques_idx, org_prompt_len)):
                        curr_output = model.model.layers[layer_idx].output[0][:, t].clone()

                        # Compute projections while maintaining gradients
                        alt_proj = torch.matmul(alt_acts[t_idx], proj_matrix)
                        org_proj = torch.matmul(curr_output, proj_matrix)

                        modified_out = curr_output - org_proj + alt_proj
                        model.model.layers[layer_idx].output[0][:, t] = modified_out

                    out = model.generator.output.save()

                    del sing_vec, proj_matrix
                    torch.cuda.empty_cache()

            for i in range(batch_size):
                check = check_pred(model.tokenizer.decode(out[0][org_prompt_len:-1]), alt_ans, verbose=True)
                print(f"Check: {check}")
                if check == "Yes":
                    correct += 1
                total += 1
            
            del alt_acts, alt_prompt, org_prompt, alt_ans, out
            torch.cuda.empty_cache()

    print(f"Validation accuracy: {correct / total:.2f}\n")
    valid_accs[layer_idx] = round(correct / total, 2)

Training layer: 32


  0%|          | 0/20 [00:00<?, ?it/s]




ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`input_ids` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [133]:
torch.arange(batch_size)

tensor([0])

In [132]:
logits[torch.arange(batch_size), target_token]

tensor([[15.3672,  3.3418, 16.8438, 17.6406]], device='cuda:7',
       dtype=torch.float16, grad_fn=<IndexBackward0>)

In [116]:
correct, total = 0, 0

with torch.inference_mode():
    mask_data = mask.data.clone()
    mask_data.clamp_(0, 1)
    rounded = torch.round(mask_data)
    print(f"#Causal SVs: {(rounded == 1).sum().item()}")
    rank[layer_idx] = (rounded == 1).sum().item()
    # Save rounded on disk
    torch.save(rounded, f"../masks/bigtom/{layer_idx}.pt")

    for bi, batch in tqdm(enumerate(valid_dataloader), total=len(valid_dataloader)):
        alt_prompt = batch["alt_prompt"][0]
        org_prompt = batch["org_prompt"][0]
        alt_ans = batch["alt_ans"][0]
        batch_size = 1

        alt_ques_idx = get_ques_start_token_idx(model.tokenizer, alt_prompt)
        alt_prompt_len = get_prompt_token_len(model.tokenizer, alt_prompt)
        org_ques_idx = get_ques_start_token_idx(model.tokenizer, org_prompt)
        org_prompt_len = get_prompt_token_len(model.tokenizer, org_prompt)

        with model.session() as session:
            # Cache alternative activations
            alt_acts = defaultdict(dict)
            with model.trace(alt_prompt):
                for t_idx, t in enumerate(range(alt_ques_idx, alt_prompt_len)):
                    alt_acts[t_idx] = model.model.layers[layer_idx].output[0][:, t].save()

            # Process original prompt with modifications
            with model.generate(org_prompt, max_new_tokens=8, do_sample=False, num_return_sequences=1, eos_token_id=model.tokenizer.eos_token_id):
                sing_vec = sing_vecs[layer_idx].cuda()
                masked_vec = sing_vec.to(rounded.device) * rounded.unsqueeze(-1)
                proj_matrix = torch.matmul(masked_vec.t(), masked_vec).half()

                # sing_vec = sing_vecs[layer_idx][:100, :].t().half()
                # proj_matrix = torch.matmul(sing_vec, sing_vec.t())

                for t_idx, t in enumerate(range(org_ques_idx, org_prompt_len)):
                    curr_output = model.model.layers[layer_idx].output[0][:, t].clone()

                    # Compute projections while maintaining gradients
                    alt_proj = torch.matmul(alt_acts[t_idx], proj_matrix)
                    org_proj = torch.matmul(curr_output, proj_matrix)

                    modified_out = curr_output - org_proj + alt_proj
                    model.model.layers[layer_idx].output[0][:, t] = modified_out

                out = model.generator.output.save()

                del sing_vec, proj_matrix
                torch.cuda.empty_cache()

        for i in range(batch_size):
            check = check_pred(model.tokenizer.decode(out[0][org_prompt_len:-1]), alt_ans, verbose=True)
            print(f"Check: {check}")
            if check == "Yes":
                correct += 1
            total += 1
        
        del alt_acts, alt_prompt, org_prompt, alt_ans, out
        torch.cuda.empty_cache()
    
    print(f"Validation accuracy: {correct / total:.2f}\n")


#Causal SVs: 44


  0%|          | 0/20 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: the mole sauce is extremely spicy
Prediction:  extremely spicy
Answer:


  5%|▌         | 1/20 [00:07<02:19,  7.34s/it]

Check: Yes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: ruined with black paint
Prediction:  smooth and flawless
Answer:


 10%|█         | 2/20 [00:14<02:13,  7.42s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: the paintbrushes are ruined
Prediction:  ruined paintbrushes
Answer:


 15%|█▌        | 3/20 [00:22<02:09,  7.65s/it]

Check: Yes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: damaged by the sandstorm
Prediction:  Damaged by the sandstorm (
Answer:


 20%|██        | 4/20 [00:31<02:06,  7.93s/it]

Check: Yes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: filled with sand
Prediction:  Filled with honey.
Answer:
Check: No


 25%|██▌       | 5/20 [00:42<02:15,  9.04s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: washed away and diluted
Prediction:  Full and ready for harvest
Answer:


 30%|███       | 6/20 [00:51<02:07,  9.09s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: punctured and non-functional
Prediction:  functional
Answer:


 35%|███▌      | 7/20 [00:58<01:50,  8.51s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: disrupted by fallen leaves
Prediction:  Disrupted by fallen leaves.
Answer:


 40%|████      | 8/20 [01:06<01:40,  8.39s/it]

Check: Yes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: disrupted by wind and leaves
Prediction:  perfectly raked
Answer:


 45%|████▌     | 9/20 [01:16<01:35,  8.68s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: disarrayed by the wind
Prediction:  Disarrayed by the wind.
Answer:


 50%|█████     | 10/20 [01:26<01:33,  9.34s/it]

Check: Yes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: branches have been snapped off by the wind
Prediction:  Overgrown branches
Answer:


 55%|█████▌    | 11/20 [01:34<01:19,  8.84s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: torn
Prediction:  torn
Answer:


 60%|██████    | 12/20 [01:41<01:07,  8.39s/it]

Check: Yes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: there is a storm approaching
Prediction:  Perfect for sailing
Answer:


 65%|██████▌   | 13/20 [01:49<00:56,  8.14s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: cool and unsuitable for baking
Prediction:  cool and unsuitable for baking
Answer:


 70%|███████   | 14/20 [01:57<00:49,  8.20s/it]

Check: Yes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: has loosened straps
Prediction:  Working condition
Answer:


 75%|███████▌  | 15/20 [02:04<00:39,  7.86s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: 3 AM
Prediction:  5 AM
Answer:


 80%|████████  | 16/20 [02:13<00:32,  8.22s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: has hardened
Prediction:  soft and loose
Answer:
Check: No


 85%|████████▌ | 17/20 [02:23<00:25,  8.61s/it]Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: weakened and infested with termites
Prediction:  strong and free of imperfections
Answer:


 90%|█████████ | 18/20 [02:32<00:17,  8.79s/it]

Check: No


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: damaged with teeth marks
Prediction:  Damaged with teeth marks (No
Answer:


 95%|█████████▌| 19/20 [02:41<00:08,  8.68s/it]

Check: Yes


Setting `pad_token_id` to `eos_token_id`:128009 for open-end generation.


Instruction: Check if the following ground truth and prediction are same or different. If they are the same, then predict 'Yes', else 'No' 

Ground truth: blunt and damaged
Prediction:  Blunt and damaged
Answer:
Check: Yes


100%|██████████| 20/20 [02:52<00:00,  8.60s/it]


NameError: name 'rint' is not defined