- Server address : `https://fb90-165-132-144-84.ngrok-free.app`
- Endpoints:
    - `/api/inference`: For calling API to generate outputs from LMs
    - `/api/change_password`: Change your password. Arguments: `student_id`, `old_password`, `new_password`.

- Check your remaining credit at this page.[https://fb90-165-132-144-84.ngrok-free.app/student_credits] 

### Before we get started, let's change the password!

In [147]:
import requests
import os

server_address = "https://cot.ngrok.app"
os.environ['INTRO_AI_API_KEY'] ='kimys0613'

In [148]:
class LLM:
    def __init__(self, model_name, student_id, password):
        self.student_id = student_id
        self.password = password
        server_address = "https://cot.ngrok.app"
        self.api_url = f'{server_address}/api/inference'  # Replace with your server address
        self.model_name = model_name

    def generate(self, model_input, max_tokens=100, top_p=1.0, temperature=0.7, frequency_penalty=0, stop="\n\n"):
        data = {
            'model_name': self.model_name,
            'model_input': model_input,
            'student_id': self.student_id,
            'password': self.password,
            'max_tokens': max_tokens,
            'top_p': top_p,
            'temperature': temperature,
            'frequency_penalty': frequency_penalty,
            'stop': stop
        }
        response = requests.post(self.api_url, json=data)

        if response.status_code == 200:
            return response.json()  # Assuming the server returns a JSON response
        else:
            return f"Error: {response.text}"

In [149]:
student_id = "2022148083"
password = os.environ['INTRO_AI_API_KEY']

available_models = [
    "meta-llama/Llama-2-7b-hf",
    "facebook/opt-350m",
    "facebook/opt-1.3b",
    "facebook/opt-2.7b"
]
opt = LLM(available_models[1], student_id, password)

### Generate your first output by calling the API

In [150]:
model_input = "Hello World!"
result = opt.generate([model_input], stop=None)

In [151]:
print(result)

{'generations': [{'generation_info': {'finish_reason': 'stop', 'logprobs': None}, 'text': '\n\nWelcome to our website!\n\nWelcome to our website! We are constantly working to make our site better by helping you find what you’re looking for. If you have any questions you should have a chat with us. We’ll be happy to help.'}], 'llm_output': {'model_name': 'facebook/opt-350m', 'token_usage': {'completion_tokens': 58, 'prompt_tokens': 4, 'total_tokens': 62}}, 'run_info': [{'run_id': '6690b6aa-e924-4fb9-83f6-73b817d32f6c'}]}


### Load GSM8K dataset

In [152]:
from datasets import load_dataset

gsm8k = load_dataset("gsm8k", "main")['test']

### An example from GSM8k

In [153]:
print("Question:")
print(gsm8k['question'][0])
print("="*100)
print("Answer:")
print(gsm8k['answer'][0])

Question:
Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Answer:
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


## **Experiment 1**: Zero-shot prompting v.s. Few-shot prompting

### Constructing prompts

In [154]:
import random
random.seed(0)

def construct_prompt(num_exemplars):
    # Load train set of GSM8K1
    gsm8k = load_dataset("gsm8k", "main")['train']

    sampled_indices = random.sample([i for i in range(len(gsm8k['question']))], num_exemplars)


    instruction = "Instruction:\nSolve the following mathematical question and generate the answer after a tag, 'Answer:'."

    # Constructing a prompt with few-shot demonstrations from GSM8K
    prompt = instruction
    for i in range(num_exemplars):
        cur_question = gsm8k['question'][i]
        cur_answer = gsm8k['answer'][i].split("####")[-1].strip()
        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += f"\n[Example {num_exemplars+1}]\n"
    prompt += "Question:\n{question}\nAnswer:"

    # Write the prompt to a .txt file
    with open(f"prompt_{num_exemplars}shot.txt", "w") as f:
        f.write(prompt)



In [155]:
construct_prompt(0)
construct_prompt(3)

### Load the prompt for zero-shot prompting

In [156]:
with open("prompt_0shot.txt", "r") as f:
    prompt_0shot = f.read()

In [161]:
from tqdm import tqdm
import json

llama = LLM("meta-llama/Llama-2-7b-hf", student_id, password)
results_collected = []
pass_collected = []
for i in tqdm(range(50)):
    cur_question = gsm8k['question'][i]
    cur_anwer = gsm8k['answer'][i].split("####")[-1].strip()
    cur_model_input = prompt_0shot.format(question=cur_question)
    result = llama.generate([cur_model_input], stop=["\n\n", "[Example 2]"])
    if "Error:" in result:
        print(result)
        break
    cur_prediction = result['generations'][0]['text']
    # print(cur_prediction)

    pass_collected.append(cur_prediction.strip().replace("$", "0") == cur_anwer)
    results_collected.append({"question": cur_question, "answer": cur_anwer, "prediction": cur_prediction})
    print(f"Acc: {sum(pass_collected)/ len(pass_collected)}")

with open("result_0shot_direct.json", "w") as f:
    json.dump(results_collected, f, indent=4)

  2%|▏         | 1/50 [00:00<00:42,  1.15it/s]

Acc: 0.0


  4%|▍         | 2/50 [00:01<00:30,  1.59it/s]

Acc: 0.5


  6%|▌         | 3/50 [00:01<00:24,  1.92it/s]

Acc: 0.3333333333333333


  8%|▊         | 4/50 [00:02<00:21,  2.17it/s]

Acc: 0.25


 10%|█         | 5/50 [00:02<00:19,  2.26it/s]

Acc: 0.2


 12%|█▏        | 6/50 [00:02<00:18,  2.33it/s]

Acc: 0.16666666666666666


 14%|█▍        | 7/50 [00:03<00:17,  2.40it/s]

Acc: 0.14285714285714285


 16%|█▌        | 8/50 [00:03<00:17,  2.39it/s]

Acc: 0.125


 18%|█▊        | 9/50 [00:04<00:18,  2.19it/s]

Acc: 0.1111111111111111


 20%|██        | 10/50 [00:04<00:17,  2.32it/s]

Acc: 0.1


 22%|██▏       | 11/50 [00:05<00:16,  2.41it/s]

Acc: 0.09090909090909091


 24%|██▍       | 12/50 [00:05<00:15,  2.47it/s]

Acc: 0.08333333333333333


 26%|██▌       | 13/50 [00:05<00:14,  2.49it/s]

Acc: 0.07692307692307693


 28%|██▊       | 14/50 [00:06<00:20,  1.72it/s]

Acc: 0.07142857142857142


 30%|███       | 15/50 [00:07<00:18,  1.86it/s]

Acc: 0.06666666666666667


 32%|███▏      | 16/50 [00:07<00:16,  2.00it/s]

Acc: 0.0625


 34%|███▍      | 17/50 [00:08<00:16,  2.06it/s]

Acc: 0.058823529411764705


 36%|███▌      | 18/50 [00:10<00:38,  1.20s/it]

Acc: 0.05555555555555555


 38%|███▊      | 19/50 [00:11<00:29,  1.05it/s]

Acc: 0.05263157894736842


 40%|████      | 20/50 [00:11<00:24,  1.25it/s]

Acc: 0.05


 42%|████▏     | 21/50 [00:12<00:19,  1.48it/s]

Acc: 0.047619047619047616


 44%|████▍     | 22/50 [00:12<00:16,  1.69it/s]

Acc: 0.045454545454545456


 46%|████▌     | 23/50 [00:13<00:17,  1.56it/s]

Acc: 0.043478260869565216


 48%|████▊     | 24/50 [00:13<00:14,  1.78it/s]

Acc: 0.041666666666666664


 50%|█████     | 25/50 [00:14<00:13,  1.84it/s]

Acc: 0.04


 52%|█████▏    | 26/50 [00:14<00:11,  2.01it/s]

Acc: 0.038461538461538464


 54%|█████▍    | 27/50 [00:14<00:10,  2.11it/s]

Acc: 0.037037037037037035


 56%|█████▌    | 28/50 [00:15<00:13,  1.69it/s]

Acc: 0.03571428571428571


 58%|█████▊    | 29/50 [00:16<00:11,  1.89it/s]

Acc: 0.034482758620689655


 60%|██████    | 30/50 [00:16<00:10,  1.87it/s]

Acc: 0.03333333333333333


 62%|██████▏   | 31/50 [00:17<00:09,  2.05it/s]

Acc: 0.03225806451612903


 64%|██████▍   | 32/50 [00:17<00:08,  2.19it/s]

Acc: 0.03125


 66%|██████▌   | 33/50 [00:17<00:07,  2.30it/s]

Acc: 0.030303030303030304


 68%|██████▊   | 34/50 [00:18<00:06,  2.58it/s]

Acc: 0.029411764705882353


 70%|███████   | 35/50 [00:18<00:05,  2.61it/s]

Acc: 0.02857142857142857


 72%|███████▏  | 36/50 [00:18<00:05,  2.68it/s]

Acc: 0.027777777777777776


 74%|███████▍  | 37/50 [00:19<00:04,  2.65it/s]

Acc: 0.02702702702702703


 76%|███████▌  | 38/50 [00:19<00:04,  2.47it/s]

Acc: 0.02631578947368421


 78%|███████▊  | 39/50 [00:20<00:04,  2.36it/s]

Acc: 0.02564102564102564


 80%|████████  | 40/50 [00:20<00:04,  2.17it/s]

Acc: 0.025


 82%|████████▏ | 41/50 [00:21<00:04,  2.19it/s]

Acc: 0.024390243902439025


 84%|████████▍ | 42/50 [00:21<00:03,  2.04it/s]

Acc: 0.023809523809523808


 86%|████████▌ | 43/50 [00:22<00:03,  1.90it/s]

Acc: 0.023255813953488372


 88%|████████▊ | 44/50 [00:22<00:03,  1.94it/s]

Acc: 0.022727272727272728


 90%|█████████ | 45/50 [00:23<00:02,  1.91it/s]

Acc: 0.022222222222222223


 92%|█████████▏| 46/50 [00:26<00:05,  1.30s/it]

Acc: 0.021739130434782608


 94%|█████████▍| 47/50 [00:26<00:03,  1.02s/it]

Acc: 0.02127659574468085


 96%|█████████▌| 48/50 [00:27<00:01,  1.11it/s]

Acc: 0.020833333333333332


 98%|█████████▊| 49/50 [00:27<00:00,  1.35it/s]

Acc: 0.02040816326530612


100%|██████████| 50/50 [00:28<00:00,  1.76it/s]

Acc: 0.02





In [163]:
with open("prompt_3shot.txt", "r") as f:
    prompt_3shot = f.read()
    
from tqdm import tqdm

llama = LLM("meta-llama/Llama-2-7b-hf", student_id, password)
results_collected = []
pass_collected = []
for i in tqdm(range(50)):
    cur_question = gsm8k['question'][i]
    cur_anwer = gsm8k['answer'][i].split("####")[-1].strip()
    cur_model_input = prompt_3shot.format(question=cur_question)
    result = llama.generate([cur_model_input], stop=["\n\n"])
    if "Error:" in result:
        print(result)
        break
    cur_prediction = result['generations'][0]['text']
    # print(cur_prediction)

    pass_collected.append(cur_prediction.strip().replace("$","") == cur_anwer)
    results_collected.append({"question": cur_question, "answer": cur_anwer, "prediction": cur_prediction})
    print(f"Acc: {sum(pass_collected)/ len(pass_collected)}")
    
with open("result_3shot_direct.json", "w") as f:
    json.dump(results_collected, f, indent=4)

  2%|▏         | 1/50 [00:00<00:20,  2.36it/s]

Acc: 0.0


  4%|▍         | 2/50 [00:00<00:18,  2.62it/s]

Acc: 0.5


  6%|▌         | 3/50 [00:01<00:20,  2.34it/s]

Acc: 0.3333333333333333


  8%|▊         | 4/50 [00:01<00:19,  2.36it/s]

Acc: 0.25


 10%|█         | 5/50 [00:02<00:19,  2.36it/s]

Acc: 0.2


 12%|█▏        | 6/50 [00:02<00:18,  2.42it/s]

Acc: 0.16666666666666666


 14%|█▍        | 7/50 [00:02<00:17,  2.44it/s]

Acc: 0.14285714285714285


 16%|█▌        | 8/50 [00:03<00:17,  2.41it/s]

Acc: 0.125


 18%|█▊        | 9/50 [00:03<00:17,  2.33it/s]

Acc: 0.1111111111111111


 20%|██        | 10/50 [00:04<00:16,  2.41it/s]

Acc: 0.1


 22%|██▏       | 11/50 [00:04<00:16,  2.44it/s]

Acc: 0.09090909090909091


 24%|██▍       | 12/50 [00:04<00:15,  2.41it/s]

Acc: 0.08333333333333333


 26%|██▌       | 13/50 [00:05<00:15,  2.35it/s]

Acc: 0.07692307692307693


 28%|██▊       | 14/50 [00:05<00:14,  2.42it/s]

Acc: 0.07142857142857142


 30%|███       | 15/50 [00:06<00:15,  2.32it/s]

Acc: 0.06666666666666667


 32%|███▏      | 16/50 [00:06<00:15,  2.23it/s]

Acc: 0.0625


 34%|███▍      | 17/50 [00:07<00:14,  2.26it/s]

Acc: 0.11764705882352941


 36%|███▌      | 18/50 [00:07<00:14,  2.13it/s]

Acc: 0.1111111111111111


 38%|███▊      | 19/50 [00:08<00:13,  2.28it/s]

Acc: 0.10526315789473684


 40%|████      | 20/50 [00:08<00:12,  2.39it/s]

Acc: 0.1


 42%|████▏     | 21/50 [00:08<00:12,  2.38it/s]

Acc: 0.09523809523809523


 44%|████▍     | 22/50 [00:09<00:11,  2.47it/s]

Acc: 0.09090909090909091


 46%|████▌     | 23/50 [00:09<00:10,  2.46it/s]

Acc: 0.08695652173913043


 48%|████▊     | 24/50 [00:10<00:10,  2.55it/s]

Acc: 0.08333333333333333


 50%|█████     | 25/50 [00:10<00:10,  2.46it/s]

Acc: 0.08


 52%|█████▏    | 26/50 [00:10<00:09,  2.44it/s]

Acc: 0.07692307692307693


 54%|█████▍    | 27/50 [00:11<00:09,  2.40it/s]

Acc: 0.07407407407407407


 56%|█████▌    | 28/50 [00:11<00:09,  2.35it/s]

Acc: 0.07142857142857142


 58%|█████▊    | 29/50 [00:12<00:08,  2.34it/s]

Acc: 0.06896551724137931


 60%|██████    | 30/50 [00:12<00:08,  2.40it/s]

Acc: 0.06666666666666667


 62%|██████▏   | 31/50 [00:13<00:07,  2.44it/s]

Acc: 0.06451612903225806


 64%|██████▍   | 32/50 [00:13<00:07,  2.46it/s]

Acc: 0.0625


 66%|██████▌   | 33/50 [00:13<00:06,  2.52it/s]

Acc: 0.06060606060606061


 68%|██████▊   | 34/50 [00:14<00:06,  2.56it/s]

Acc: 0.058823529411764705


 70%|███████   | 35/50 [00:14<00:06,  2.46it/s]

Acc: 0.05714285714285714


 72%|███████▏  | 36/50 [00:15<00:05,  2.45it/s]

Acc: 0.05555555555555555


 74%|███████▍  | 37/50 [00:15<00:05,  2.43it/s]

Acc: 0.05405405405405406


 76%|███████▌  | 38/50 [00:15<00:04,  2.48it/s]

Acc: 0.07894736842105263


 78%|███████▊  | 39/50 [00:16<00:04,  2.46it/s]

Acc: 0.07692307692307693


 80%|████████  | 40/50 [00:16<00:04,  2.46it/s]

Acc: 0.075


 82%|████████▏ | 41/50 [00:17<00:03,  2.47it/s]

Acc: 0.07317073170731707


 84%|████████▍ | 42/50 [00:17<00:03,  2.36it/s]

Acc: 0.07142857142857142


 86%|████████▌ | 43/50 [00:17<00:02,  2.39it/s]

Acc: 0.06976744186046512


 88%|████████▊ | 44/50 [00:18<00:02,  2.42it/s]

Acc: 0.06818181818181818


 90%|█████████ | 45/50 [00:18<00:02,  2.46it/s]

Acc: 0.06666666666666667


 92%|█████████▏| 46/50 [00:19<00:01,  2.33it/s]

Acc: 0.06521739130434782


 94%|█████████▍| 47/50 [00:19<00:01,  2.36it/s]

Acc: 0.06382978723404255


 96%|█████████▌| 48/50 [00:20<00:00,  2.37it/s]

Acc: 0.0625


 98%|█████████▊| 49/50 [00:20<00:00,  2.38it/s]

Acc: 0.061224489795918366


100%|██████████| 50/50 [00:20<00:00,  2.40it/s]

Acc: 0.06





## **Experiment 2**: CoT prompting vs Standard prompting

In [164]:
import random

def generate_chain_of_thought(prompt, num_chains=2):
    chains = []

    for _ in range(num_chains):
        # Create a chain of thought
        chain = ""
        tokens = prompt.split()
        for i in range(len(tokens)):
            # Randomly decide whether to include each token in the chain
            if random.choice([True, False]):
                chain += tokens[i] + " "
        chains.append(chain.strip())

    return chains

def construct_chain_of_thought_prompt(num_chains):
    # Load train set of GSM8K1
    gsm8k = load_dataset("gsm8k", "main")['train']

    sampled_indices = random.sample([i for i in range(len(gsm8k['question']))], num_chains)

    instruction = "Instruction:\nSolve the following mathematical question and generate the answer after a tag, 'Answer:'."
    prompt = instruction

    for i in range(num_chains):
        cur_question = gsm8k['question'][i]
        cur_answer = gsm8k['answer'][i].split("####")[-1].strip()

        # Generate chain of thought for each example
        chains = generate_chain_of_thought(f"{cur_question} Answer: {cur_answer}", num_chains=2)

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        
        for j, chain in enumerate(chains):
            prompt += f"Chain of Thought {j+1}:\n{chain}\n"

        prompt += f"Answer:{cur_answer}\n"

    # Write the prompt to a .txt file
    with open(f"prompt_chain_of_thought_{num_chains}chains.txt", "w") as f:
        f.write(prompt)


In [165]:
construct_chain_of_thought_prompt(3)

In [None]:
from tqdm import tqdm
import json

# Assuming gsm8k is a valid dataset containing questions and answers
gsm8k = load_dataset("gsm8k", "main")['train']

# Assuming prompt_chain_of_thought_3chains.txt is the file you created using the provided code
with open("prompt_chain_of_thought_3chains.txt", "r") as f:
    prompt_chain_of_thought = f.read()

llama = LLM("meta-llama/Llama-2-7b-hf", student_id, password)
results_collected = []
pass_collected = []

# Number of chains of thought to consider for each example
num_chains = 2

for i in tqdm(range(50)):
    cur_question = gsm8k['question'][i]
    cur_answer = gsm8k['answer'][i].split("####")[-1].strip()
    cur_model_input = prompt_chain_of_thought.format(question=cur_question)
    
    # Generate chains of thought
    chains_of_thought = generate_chain_of_thought(cur_question, num_chains)
    
    # Combine chains of thought with the original model input
    cur_model_input_with_chains = f"{cur_model_input}\n\n"
    for j, chain in enumerate(chains_of_thought):
        cur_model_input_with_chains += f"Chain of Thought {j + 1}:\n{chain}\n"
    
    # Assuming generate method of LLM class supports a list of inputs
    result = llama.generate([cur_model_input_with_chains], stop=["\n\n", "[Example 4]"])
    
    if "Error:" in result:
        print(result)
        break
    
    # Extracting the generated answer from the result
    generated_text = result['generations'][0]['text']
    
    # Check if "Answer:" is present in the generated text
    if "Answer:" in generated_text:
        cur_prediction = generated_text.split("Answer:")[1].strip()
        
        # Compare the prediction with the answer
        pass_collected.append(cur_prediction.strip().replace("$", "") == cur_answer)
        
        results_collected.append({
            "question": cur_question,
            "answer": cur_answer,
            "prediction": f"Answer:{cur_prediction}"
        })
    else:
        print(f"Error: 'Answer:' not found in the generated text.")
    
    print(f"Acc: {sum(pass_collected) / len(pass_collected)}")

with open("result_chain_of_thought_3shot_direct.json", "w") as f:
    json.dump(results_collected, f, indent=4)


## **Experiment 3**: Applying CoT prompting on models of various sizes

In [168]:
from tqdm import tqdm
import json

# Assuming gsm8k is a valid dataset containing questions and answers
gsm8k = load_dataset("gsm8k", "main")['train']

# Assuming prompt_chain_of_thought_3chains.txt is the file you created using the provided code
with open("prompt_chain_of_thought_3chains.txt", "r") as f:
    prompt_chain_of_thought = f.read()

llama = LLM("meta-llama/Llama-2-7b-hf", student_id, password)
results_collected = []
pass_collected = []

# Number of chains of thought to consider for each example
num_chains = 2

for i in tqdm(range(50)):
    cur_question = gsm8k['question'][i]
    cur_answer = gsm8k['answer'][i].split("####")[-1].strip()
    cur_model_input = prompt_chain_of_thought.format(question=cur_question)
    
    # Generate chains of thought
    chains_of_thought = generate_chain_of_thought(cur_question, num_chains)
    
    # Combine chains of thought with the original model input
    cur_model_input_with_chains = f"{cur_model_input}\n\n"
    for j, chain in enumerate(chains_of_thought):
        cur_model_input_with_chains += f"Chain of Thought {j + 1}:\n{chain}\n"
    
    # Assuming generate method of LLM class supports a list of inputs
    result = llama.generate([cur_model_input_with_chains], stop=["\n\n", "[Example 4]"])
    
    if "Error:" in result:
        print(result)
        break
    
    # Extracting the generated answer from the result
    generated_text = result['generations'][0]['text']
    
    # Check if "Answer:" is present in the generated text
    if "Answer:" in generated_text:
        cur_prediction = generated_text.split("Answer:")[1].strip()
        
        # Compare the prediction with the answer
        pass_collected.append(cur_prediction.strip().replace("$", "") == cur_answer)
        
        results_collected.append({
            "question": cur_question,
            "answer": cur_answer,
            "prediction": f"Answer:{cur_prediction}"
        })
    else:
        print(f"Error: 'Answer:' not found in the generated text.")
    
    print(f"Acc: {sum(pass_collected) / len(pass_collected)}")

with open("LLaMA_CoT.json", "w") as f:
    json.dump(results_collected, f, indent=4)


  2%|▏         | 1/50 [00:01<01:34,  1.94s/it]

Acc: 1.0


  4%|▍         | 2/50 [00:02<00:52,  1.09s/it]

Acc: 0.5


  6%|▌         | 3/50 [00:02<00:37,  1.24it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.5


  8%|▊         | 4/50 [00:03<00:35,  1.30it/s]

Acc: 0.3333333333333333


 10%|█         | 5/50 [00:04<00:30,  1.48it/s]

Acc: 0.25


 12%|█▏        | 6/50 [00:04<00:27,  1.60it/s]

Acc: 0.2


 14%|█▍        | 7/50 [00:06<00:40,  1.07it/s]

Acc: 0.16666666666666666


 16%|█▌        | 8/50 [00:06<00:34,  1.22it/s]

Acc: 0.14285714285714285


 18%|█▊        | 9/50 [00:07<00:31,  1.32it/s]

Acc: 0.125


 20%|██        | 10/50 [00:08<00:31,  1.27it/s]

Acc: 0.1111111111111111


 22%|██▏       | 11/50 [00:08<00:28,  1.35it/s]

Acc: 0.1


 24%|██▍       | 12/50 [00:09<00:25,  1.46it/s]

Acc: 0.09090909090909091


 26%|██▌       | 13/50 [00:09<00:23,  1.59it/s]

Acc: 0.08333333333333333


 28%|██▊       | 14/50 [00:10<00:23,  1.53it/s]

Acc: 0.07692307692307693


 30%|███       | 15/50 [00:11<00:22,  1.57it/s]

Acc: 0.07142857142857142


 32%|███▏      | 16/50 [00:11<00:20,  1.66it/s]

Acc: 0.06666666666666667


 34%|███▍      | 17/50 [00:12<00:19,  1.70it/s]

Acc: 0.0625


 36%|███▌      | 18/50 [00:13<00:19,  1.64it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.0625


 38%|███▊      | 19/50 [00:13<00:17,  1.73it/s]

Acc: 0.058823529411764705


 40%|████      | 20/50 [00:15<00:28,  1.07it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.058823529411764705


 42%|████▏     | 21/50 [00:15<00:24,  1.19it/s]

Acc: 0.05555555555555555


 44%|████▍     | 22/50 [00:16<00:21,  1.32it/s]

Acc: 0.05263157894736842


 46%|████▌     | 23/50 [00:17<00:21,  1.27it/s]

Acc: 0.05


 48%|████▊     | 24/50 [00:17<00:18,  1.40it/s]

Acc: 0.047619047619047616


 50%|█████     | 25/50 [00:18<00:18,  1.34it/s]

Acc: 0.045454545454545456


 52%|█████▏    | 26/50 [00:19<00:16,  1.42it/s]

Acc: 0.043478260869565216


 54%|█████▍    | 27/50 [00:19<00:14,  1.54it/s]

Acc: 0.041666666666666664


 56%|█████▌    | 28/50 [00:20<00:13,  1.62it/s]

Acc: 0.04


 58%|█████▊    | 29/50 [00:20<00:12,  1.64it/s]

Acc: 0.038461538461538464


 60%|██████    | 30/50 [00:21<00:11,  1.72it/s]

Acc: 0.037037037037037035


 62%|██████▏   | 31/50 [00:21<00:10,  1.78it/s]

Acc: 0.03571428571428571


 64%|██████▍   | 32/50 [00:22<00:09,  1.81it/s]

Acc: 0.034482758620689655


 66%|██████▌   | 33/50 [00:23<00:09,  1.78it/s]

Acc: 0.03333333333333333


 68%|██████▊   | 34/50 [00:23<00:08,  1.78it/s]

Acc: 0.03225806451612903


 70%|███████   | 35/50 [00:24<00:08,  1.84it/s]

Acc: 0.03125


 72%|███████▏  | 36/50 [00:24<00:07,  1.85it/s]

Acc: 0.030303030303030304


 74%|███████▍  | 37/50 [00:25<00:06,  1.88it/s]

Acc: 0.029411764705882353


 76%|███████▌  | 38/50 [00:25<00:06,  1.88it/s]

Acc: 0.02857142857142857


 78%|███████▊  | 39/50 [00:26<00:06,  1.72it/s]

Acc: 0.027777777777777776


 80%|████████  | 40/50 [00:26<00:05,  1.75it/s]

Acc: 0.02702702702702703


 82%|████████▏ | 41/50 [00:27<00:04,  1.82it/s]

Acc: 0.02631578947368421


 84%|████████▍ | 42/50 [00:28<00:04,  1.72it/s]

Acc: 0.02564102564102564


 86%|████████▌ | 43/50 [00:31<00:09,  1.35s/it]

Error: 'Answer:' not found in the generated text.
Acc: 0.02564102564102564


 88%|████████▊ | 44/50 [00:31<00:06,  1.10s/it]

Acc: 0.025


 90%|█████████ | 45/50 [00:32<00:04,  1.01it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.025


 92%|█████████▏| 46/50 [00:33<00:03,  1.10it/s]

Acc: 0.024390243902439025


 94%|█████████▍| 47/50 [00:33<00:02,  1.25it/s]

Acc: 0.023809523809523808


 96%|█████████▌| 48/50 [00:34<00:01,  1.41it/s]

Acc: 0.023255813953488372


 98%|█████████▊| 49/50 [00:34<00:00,  1.48it/s]

Acc: 0.022727272727272728


100%|██████████| 50/50 [00:35<00:00,  1.41it/s]

Acc: 0.022222222222222223





In [179]:
from tqdm import tqdm
import json

# Assuming gsm8k is a valid dataset containing questions and answers
gsm8k = load_dataset("gsm8k", "main")['train']

# Assuming prompt_chain_of_thought_3chains.txt is the file you created using the provided code
with open("prompt_chain_of_thought_3chains.txt", "r") as f:
    prompt_chain_of_thought = f.read()

llama = LLM("facebook/opt-350m", student_id, password)
results_collected = []
pass_collected = []

# Number of chains of thought to consider for each example
num_chains = 2

for i in tqdm(range(50)):
    cur_question = gsm8k['question'][i]
    cur_answer = gsm8k['answer'][i].split("####")[-1].strip()
    cur_model_input = prompt_chain_of_thought.format(question=cur_question)
    
    # Generate chains of thought
    chains_of_thought = generate_chain_of_thought(cur_question, num_chains)
    
    # Combine chains of thought with the original model input
    cur_model_input_with_chains = f"{cur_model_input}\n\n"
    for j, chain in enumerate(chains_of_thought):
        cur_model_input_with_chains += f"Chain of Thought {j + 1}:\n{chain}\n"
    
    # Assuming generate method of LLM class supports a list of inputs
    result = llama.generate([cur_model_input_with_chains], stop=["\n\n", "[Example 4]"])
    
    if "Error:" in result:
        print(result)
        break
    
    # Extracting the generated answer from the result
    generated_text = result['generations'][0]['text']
    
    # Check if "Answer:" is present in the generated text
    if "Answer:" in generated_text:
        cur_prediction = generated_text.split("Answer:")[1].strip()
        
        # Compare the prediction with the answer
        pass_collected.append(cur_prediction.strip().replace("$", "") == cur_answer)
        
        results_collected.append({
            "question": cur_question,
            "answer": cur_answer,
            "prediction": f"Answer:{cur_prediction}"
        })
    else:
        print(f"Error: 'Answer:' not found in the generated text.")
    
    # Check if pass_collected is not empty before calculating accuracy
    if pass_collected:
        print(f"Acc: {sum(pass_collected) / len(pass_collected)}")

with open("opt-350m.json", "w") as f:
    json.dump(results_collected, f, indent=4)


  2%|▏         | 1/50 [00:01<01:00,  1.24s/it]

Error: 'Answer:' not found in the generated text.


  4%|▍         | 2/50 [00:02<00:49,  1.03s/it]

Acc: 0.0


  6%|▌         | 3/50 [00:02<00:33,  1.39it/s]

Acc: 0.5


  8%|▊         | 4/50 [00:02<00:27,  1.67it/s]

Acc: 0.3333333333333333


 10%|█         | 5/50 [00:03<00:23,  1.89it/s]

Acc: 0.25


 12%|█▏        | 6/50 [00:04<00:36,  1.21it/s]

Acc: 0.2


 14%|█▍        | 7/50 [00:04<00:27,  1.54it/s]

Acc: 0.16666666666666666


 16%|█▌        | 8/50 [00:05<00:22,  1.90it/s]

Acc: 0.14285714285714285


 18%|█▊        | 9/50 [00:06<00:30,  1.34it/s]

Acc: 0.125


 20%|██        | 10/50 [00:06<00:23,  1.67it/s]

Acc: 0.1111111111111111


 22%|██▏       | 11/50 [00:06<00:19,  2.02it/s]

Acc: 0.1


 24%|██▍       | 12/50 [00:08<00:27,  1.40it/s]

Acc: 0.09090909090909091


 26%|██▌       | 13/50 [00:08<00:21,  1.72it/s]

Acc: 0.08333333333333333


 28%|██▊       | 14/50 [00:08<00:17,  2.04it/s]

Acc: 0.07692307692307693


 30%|███       | 15/50 [00:09<00:14,  2.38it/s]

Acc: 0.07142857142857142


 32%|███▏      | 16/50 [00:09<00:12,  2.64it/s]

Acc: 0.06666666666666667


 34%|███▍      | 17/50 [00:10<00:17,  1.90it/s]

Acc: 0.0625


 36%|███▌      | 18/50 [00:10<00:14,  2.23it/s]

Acc: 0.058823529411764705


 38%|███▊      | 19/50 [00:11<00:18,  1.66it/s]

Acc: 0.05555555555555555


 40%|████      | 20/50 [00:12<00:19,  1.54it/s]

Acc: 0.05263157894736842


 42%|████▏     | 21/50 [00:13<00:23,  1.21it/s]

Acc: 0.05


 44%|████▍     | 22/50 [00:13<00:19,  1.46it/s]

Acc: 0.047619047619047616


 46%|████▌     | 23/50 [00:14<00:18,  1.43it/s]

Acc: 0.045454545454545456


 48%|████▊     | 24/50 [00:14<00:14,  1.74it/s]

Acc: 0.043478260869565216


 50%|█████     | 25/50 [00:15<00:12,  2.05it/s]

Acc: 0.041666666666666664


 52%|█████▏    | 26/50 [00:15<00:10,  2.28it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.041666666666666664


 54%|█████▍    | 27/50 [00:16<00:13,  1.72it/s]

Acc: 0.04


 56%|█████▌    | 28/50 [00:16<00:11,  1.93it/s]

Acc: 0.038461538461538464


 58%|█████▊    | 29/50 [00:17<00:10,  1.95it/s]

Acc: 0.037037037037037035


 60%|██████    | 30/50 [00:18<00:14,  1.37it/s]

Acc: 0.03571428571428571


 62%|██████▏   | 31/50 [00:19<00:16,  1.14it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.03571428571428571


 64%|██████▍   | 32/50 [00:19<00:12,  1.43it/s]

Acc: 0.034482758620689655


 66%|██████▌   | 33/50 [00:20<00:09,  1.73it/s]

Acc: 0.03333333333333333


 68%|██████▊   | 34/50 [00:20<00:07,  2.07it/s]

Acc: 0.03225806451612903


 70%|███████   | 35/50 [00:21<00:10,  1.43it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.03225806451612903


 72%|███████▏  | 36/50 [00:22<00:09,  1.51it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.03225806451612903


 74%|███████▍  | 37/50 [00:23<00:09,  1.37it/s]

Acc: 0.03125


 76%|███████▌  | 38/50 [00:24<00:11,  1.08it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.03125


 78%|███████▊  | 39/50 [00:24<00:08,  1.36it/s]

Acc: 0.030303030303030304


 80%|████████  | 40/50 [00:25<00:05,  1.68it/s]

Acc: 0.029411764705882353


 82%|████████▏ | 41/50 [00:26<00:06,  1.37it/s]

Acc: 0.02857142857142857


 84%|████████▍ | 42/50 [00:26<00:05,  1.54it/s]

Acc: 0.027777777777777776


 86%|████████▌ | 43/50 [00:27<00:04,  1.46it/s]

Acc: 0.02702702702702703


 88%|████████▊ | 44/50 [00:27<00:03,  1.57it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.02702702702702703


 90%|█████████ | 45/50 [00:29<00:04,  1.24it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.02702702702702703


 92%|█████████▏| 46/50 [00:29<00:02,  1.54it/s]

Acc: 0.02631578947368421


 94%|█████████▍| 47/50 [00:29<00:01,  1.87it/s]

Acc: 0.02564102564102564


 96%|█████████▌| 48/50 [00:30<00:01,  1.35it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.02564102564102564


 98%|█████████▊| 49/50 [00:31<00:00,  1.67it/s]

Acc: 0.025


100%|██████████| 50/50 [00:32<00:00,  1.55it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.025





In [176]:
from tqdm import tqdm
import json

# Assuming gsm8k is a valid dataset containing questions and answers
gsm8k = load_dataset("gsm8k", "main")['train']

# Assuming prompt_chain_of_thought_3chains.txt is the file you created using the provided code
with open("prompt_chain_of_thought_3chains.txt", "r") as f:
    prompt_chain_of_thought = f.read()

llama = LLM("facebook/opt-1.3b", student_id, password)
results_collected = []
pass_collected = []

# Number of chains of thought to consider for each example
num_chains = 2

for i in tqdm(range(50)):
    cur_question = gsm8k['question'][i]
    cur_answer = gsm8k['answer'][i].split("####")[-1].strip()
    cur_model_input = prompt_chain_of_thought.format(question=cur_question)
    
    # Generate chains of thought
    chains_of_thought = generate_chain_of_thought(cur_question, num_chains)
    
    # Combine chains of thought with the original model input
    cur_model_input_with_chains = f"{cur_model_input}\n\n"
    for j, chain in enumerate(chains_of_thought):
        cur_model_input_with_chains += f"Chain of Thought {j + 1}:\n{chain}\n"
    
    # Assuming generate method of LLM class supports a list of inputs
    result = llama.generate([cur_model_input_with_chains], stop=["\n\n", "[Example 4]"])
    
    if "Error:" in result:
        print(result)
        break
    
    # Extracting the generated answer from the result
    generated_text = result['generations'][0]['text']
    
    # Check if "Answer:" is present in the generated text
    if "Answer:" in generated_text:
        cur_prediction = generated_text.split("Answer:")[1].strip()
        
        # Compare the prediction with the answer
        pass_collected.append(cur_prediction.strip().replace("$", "") == cur_answer)
        
        results_collected.append({
            "question": cur_question,
            "answer": cur_answer,
            "prediction": f"Answer:{cur_prediction}"
        })
    else:
        print(f"Error: 'Answer:' not found in the generated text.")
    
    print(f"Acc: {sum(pass_collected) / len(pass_collected)}")

with open("opt-1.3b.json", "w") as f:
    json.dump(results_collected, f, indent=4)


  2%|▏         | 1/50 [00:00<00:18,  2.66it/s]

Acc: 1.0


  4%|▍         | 2/50 [00:00<00:15,  3.20it/s]

Acc: 0.5


  6%|▌         | 3/50 [00:01<00:34,  1.36it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.5


  8%|▊         | 4/50 [00:02<00:25,  1.82it/s]

Acc: 0.3333333333333333


 10%|█         | 5/50 [00:02<00:21,  2.06it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.3333333333333333


 12%|█▏        | 6/50 [00:02<00:18,  2.42it/s]

Acc: 0.25


 14%|█▍        | 7/50 [00:03<00:15,  2.73it/s]

Acc: 0.2


 16%|█▌        | 8/50 [00:03<00:14,  2.90it/s]

Acc: 0.16666666666666666


 18%|█▊        | 9/50 [00:03<00:13,  3.05it/s]

Acc: 0.14285714285714285


 20%|██        | 10/50 [00:03<00:12,  3.18it/s]

Acc: 0.125


 22%|██▏       | 11/50 [00:05<00:24,  1.59it/s]

Acc: 0.1111111111111111


 24%|██▍       | 12/50 [00:06<00:33,  1.14it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.1111111111111111


 26%|██▌       | 13/50 [00:07<00:25,  1.44it/s]

Acc: 0.1


 28%|██▊       | 14/50 [00:07<00:21,  1.68it/s]

Acc: 0.09090909090909091


 30%|███       | 15/50 [00:07<00:17,  2.00it/s]

Acc: 0.08333333333333333


 32%|███▏      | 16/50 [00:07<00:14,  2.29it/s]

Acc: 0.07692307692307693


 34%|███▍      | 17/50 [00:08<00:13,  2.52it/s]

Acc: 0.07142857142857142


 36%|███▌      | 18/50 [00:08<00:11,  2.76it/s]

Acc: 0.06666666666666667


 38%|███▊      | 19/50 [00:08<00:10,  2.92it/s]

Acc: 0.0625


 40%|████      | 20/50 [00:10<00:20,  1.49it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.0625


 42%|████▏     | 21/50 [00:10<00:16,  1.80it/s]

Acc: 0.058823529411764705


 44%|████▍     | 22/50 [00:10<00:13,  2.10it/s]

Acc: 0.05555555555555555


 46%|████▌     | 23/50 [00:12<00:20,  1.30it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.05555555555555555


 48%|████▊     | 24/50 [00:12<00:16,  1.60it/s]

Acc: 0.05263157894736842


 50%|█████     | 25/50 [00:14<00:21,  1.15it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.05263157894736842


 52%|█████▏    | 26/50 [00:14<00:16,  1.42it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.05263157894736842


 54%|█████▍    | 27/50 [00:14<00:13,  1.73it/s]

Acc: 0.05


 56%|█████▌    | 28/50 [00:15<00:15,  1.38it/s]

Acc: 0.047619047619047616


 58%|█████▊    | 29/50 [00:15<00:12,  1.68it/s]

Acc: 0.045454545454545456


 60%|██████    | 30/50 [00:16<00:10,  2.00it/s]

Acc: 0.043478260869565216


 62%|██████▏   | 31/50 [00:16<00:08,  2.24it/s]

Acc: 0.041666666666666664


 64%|██████▍   | 32/50 [00:16<00:07,  2.54it/s]

Acc: 0.04


 66%|██████▌   | 33/50 [00:18<00:10,  1.57it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.04


 68%|██████▊   | 34/50 [00:18<00:08,  1.89it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.04


 70%|███████   | 35/50 [00:18<00:08,  1.80it/s]

Acc: 0.038461538461538464


 72%|███████▏  | 36/50 [00:19<00:07,  1.96it/s]

Acc: 0.037037037037037035


 74%|███████▍  | 37/50 [00:19<00:05,  2.24it/s]

Acc: 0.03571428571428571


 76%|███████▌  | 38/50 [00:20<00:08,  1.48it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.03571428571428571


 78%|███████▊  | 39/50 [00:21<00:06,  1.78it/s]

Acc: 0.034482758620689655


 80%|████████  | 40/50 [00:22<00:07,  1.31it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.034482758620689655


 82%|████████▏ | 41/50 [00:22<00:05,  1.62it/s]

Acc: 0.03333333333333333


 84%|████████▍ | 42/50 [00:22<00:04,  1.95it/s]

Acc: 0.03225806451612903


 86%|████████▌ | 43/50 [00:23<00:03,  2.20it/s]

Acc: 0.03125


 88%|████████▊ | 44/50 [00:23<00:02,  2.26it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.03125


 90%|█████████ | 45/50 [00:24<00:03,  1.49it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.03125


 92%|█████████▏| 46/50 [00:25<00:02,  1.81it/s]

Acc: 0.030303030303030304


 94%|█████████▍| 47/50 [00:25<00:01,  2.10it/s]

Acc: 0.029411764705882353


 96%|█████████▌| 48/50 [00:26<00:01,  1.42it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.029411764705882353


 98%|█████████▊| 49/50 [00:26<00:00,  1.72it/s]

Acc: 0.02857142857142857


100%|██████████| 50/50 [00:27<00:00,  1.83it/s]

Acc: 0.027777777777777776





In [178]:
from tqdm import tqdm
import json

# Assuming gsm8k is a valid dataset containing questions and answers
gsm8k = load_dataset("gsm8k", "main")['train']

# Assuming prompt_chain_of_thought_3chains.txt is the file you created using the provided code
with open("prompt_chain_of_thought_3chains.txt", "r") as f:
    prompt_chain_of_thought = f.read()

llama = LLM("facebook/opt-2.7b", student_id, password)
results_collected = []
pass_collected = []

# Number of chains of thought to consider for each example
num_chains = 2

for i in tqdm(range(50)):
    cur_question = gsm8k['question'][i]
    cur_answer = gsm8k['answer'][i].split("####")[-1].strip()
    cur_model_input = prompt_chain_of_thought.format(question=cur_question)
    
    # Generate chains of thought
    chains_of_thought = generate_chain_of_thought(cur_question, num_chains)
    
    # Combine chains of thought with the original model input
    cur_model_input_with_chains = f"{cur_model_input}\n\n"
    for j, chain in enumerate(chains_of_thought):
        cur_model_input_with_chains += f"Chain of Thought {j + 1}:\n{chain}\n"
    
    # Assuming generate method of LLM class supports a list of inputs
    result = llama.generate([cur_model_input_with_chains], stop=["\n\n", "[Example 4]"])
    
    if "Error:" in result:
        print(result)
        break
    
    # Extracting the generated answer from the result
    generated_text = result['generations'][0]['text']
    
    # Check if "Answer:" is present in the generated text
    if "Answer:" in generated_text:
        cur_prediction = generated_text.split("Answer:")[1].strip()
        
        # Compare the prediction with the answer
        pass_collected.append(cur_prediction.strip().replace("$", "") == cur_answer)
        
        results_collected.append({
            "question": cur_question,
            "answer": cur_answer,
            "prediction": f"Answer:{cur_prediction}"
        })
    else:
        print(f"Error: 'Answer:' not found in the generated text.")
    
    print(f"Acc: {sum(pass_collected) / len(pass_collected)}")

with open("opt-2.7b.json", "w") as f:
    json.dump(results_collected, f, indent=4)


  2%|▏         | 1/50 [00:00<00:17,  2.75it/s]

Acc: 0.0


  4%|▍         | 2/50 [00:00<00:15,  3.07it/s]

Acc: 0.0


  6%|▌         | 3/50 [00:00<00:15,  3.09it/s]

Acc: 0.0


  8%|▊         | 4/50 [00:01<00:15,  2.89it/s]

Acc: 0.0


 10%|█         | 5/50 [00:02<00:22,  2.01it/s]

Acc: 0.0


 12%|█▏        | 6/50 [00:02<00:21,  2.04it/s]

Acc: 0.0


 14%|█▍        | 7/50 [00:02<00:18,  2.32it/s]

Acc: 0.0


 16%|█▌        | 8/50 [00:04<00:27,  1.54it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.0


 18%|█▊        | 9/50 [00:04<00:22,  1.80it/s]

Acc: 0.0


 20%|██        | 10/50 [00:05<00:26,  1.53it/s]

Acc: 0.0


 22%|██▏       | 11/50 [00:05<00:21,  1.81it/s]

Acc: 0.0


 24%|██▍       | 12/50 [00:07<00:32,  1.18it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.0


 26%|██▌       | 13/50 [00:07<00:25,  1.47it/s]

Acc: 0.0


 28%|██▊       | 14/50 [00:08<00:31,  1.15it/s]

Acc: 0.0


 30%|███       | 15/50 [00:08<00:24,  1.44it/s]

Acc: 0.0


 32%|███▏      | 16/50 [00:09<00:19,  1.71it/s]

Acc: 0.0


 34%|███▍      | 17/50 [00:09<00:17,  1.90it/s]

Acc: 0.0


 36%|███▌      | 18/50 [00:10<00:19,  1.61it/s]

Acc: 0.0


 38%|███▊      | 19/50 [00:11<00:20,  1.51it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.0


 40%|████      | 20/50 [00:11<00:19,  1.56it/s]

Acc: 0.0


 42%|████▏     | 21/50 [00:13<00:23,  1.26it/s]

Acc: 0.0


 44%|████▍     | 22/50 [00:13<00:18,  1.48it/s]

Acc: 0.0


 46%|████▌     | 23/50 [00:14<00:22,  1.21it/s]

Acc: 0.0


 48%|████▊     | 24/50 [00:15<00:18,  1.42it/s]

Acc: 0.0


 50%|█████     | 25/50 [00:15<00:19,  1.29it/s]

Acc: 0.0


 52%|█████▏    | 26/50 [00:16<00:19,  1.25it/s]

Acc: 0.0


 54%|█████▍    | 27/50 [00:17<00:15,  1.49it/s]

Acc: 0.0


 56%|█████▌    | 28/50 [00:17<00:12,  1.71it/s]

Acc: 0.0


 58%|█████▊    | 29/50 [00:19<00:18,  1.16it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.0


 60%|██████    | 30/50 [00:19<00:13,  1.43it/s]

Acc: 0.0


 62%|██████▏   | 31/50 [00:19<00:11,  1.72it/s]

Acc: 0.0


 64%|██████▍   | 32/50 [00:20<00:13,  1.32it/s]

Acc: 0.0


 66%|██████▌   | 33/50 [00:21<00:10,  1.57it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.0


 68%|██████▊   | 34/50 [00:21<00:08,  1.85it/s]

Acc: 0.0


 70%|███████   | 35/50 [00:22<00:07,  1.99it/s]

Acc: 0.0


 72%|███████▏  | 36/50 [00:23<00:11,  1.23it/s]

Acc: 0.0


 74%|███████▍  | 37/50 [00:23<00:08,  1.48it/s]

Acc: 0.0


 76%|███████▌  | 38/50 [00:24<00:07,  1.71it/s]

Acc: 0.0


 78%|███████▊  | 39/50 [00:24<00:05,  1.98it/s]

Acc: 0.0


 80%|████████  | 40/50 [00:25<00:06,  1.64it/s]

Acc: 0.0


 82%|████████▏ | 41/50 [00:26<00:05,  1.54it/s]

Acc: 0.0


 84%|████████▍ | 42/50 [00:26<00:04,  1.82it/s]

Acc: 0.0


 86%|████████▌ | 43/50 [00:26<00:03,  2.06it/s]

Acc: 0.0


 88%|████████▊ | 44/50 [00:27<00:03,  1.56it/s]

Acc: 0.0


 90%|█████████ | 45/50 [00:28<00:03,  1.28it/s]

Acc: 0.0


 92%|█████████▏| 46/50 [00:29<00:02,  1.50it/s]

Acc: 0.0


 94%|█████████▍| 47/50 [00:29<00:01,  1.78it/s]

Acc: 0.0


 96%|█████████▌| 48/50 [00:29<00:00,  2.08it/s]

Acc: 0.0


 98%|█████████▊| 49/50 [00:31<00:00,  1.26it/s]

Error: 'Answer:' not found in the generated text.
Acc: 0.0


100%|██████████| 50/50 [00:31<00:00,  1.57it/s]

Acc: 0.0



