In [None]:
!pip install groq python-dotenv numpy tqdm datasets

[1;32m2[0m[1;32m channel Terms of Service accepted[0m
Channels:
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /home/ungeun/miniconda

  added / updated specs:
    - datasets
    - numpy
    - python-dotenv
    - tqdm


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    aiohappyeyeballs-2.4.4     |  py313h06a4308_0          27 KB
    aiohttp-3.11.10            |  py313h5eee18b_0         945 KB
    aiosignal-1.2.0            |     pyhd3eb1b0_0          12 KB
    arrow-cpp-19.0.0           |       h8011034_4        12.4 MB
    attrs-24.3.0               |  py313h06a4308_0         175 KB
    aws-c-auth-0.9.0           |       h5eee18b_1         119 KB
    aws-c-cal-0.9.2            |       hdb10cc7_0          50 KB
    aws-c-common-0.12.3        |       h5eee18b_0         254 KB
    aws

In [1]:
from groq import Groq
from dotenv import load_dotenv
from datasets import load_dataset

import os
from tqdm import tqdm
import re
import random
import pprint

from typing import List, Dict, Any

load_dotenv()
random.seed(0)

client = Groq()
gsm8k_dataset = load_dataset("gsm8k", "main")

gsm8k_train = gsm8k_dataset["train"]
gsm8k_test  = gsm8k_dataset["test"]

In [2]:
def generate_response_using_Llama(
        prompt: str,
        model: str = "llama3-8b-8192"
    ):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "system",
                    "content": "You are a helpful assistant that solves math problems."
                },
                {
                    "role": "user", 
                    "content": prompt
                }
            ],
            model=model,
            temperature=0.3, ### 수정해도 됩니다!
            stream=False
        )
        return chat_completion.choices[0].message.content
    
    except Exception as e:
        print(f"API call error: {str(e)}")
        return None

#### 응답 잘 나오는지 확인해보기

In [3]:
response = generate_response_using_Llama(
    prompt="Hello world!",
)
print(response)

Hello! I'm excited to help you with any math problems you have. What kind of math are you working on? Do you have a specific problem you're stuck on or a concept you're trying to understand? Let me know and I'll do my best to assist you!


#### GSM8K 데이터셋 확인해보기

In [4]:
print("[Question]")
for l in gsm8k_test['question'][0].split("."):
    print(l)
print("="*100)
print("[Answer]")
print(gsm8k_test['answer'][0])

[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18


#### Util 함수들
- extract_final_answer: LLM의 응답을 parse하여 최종 결과만 추출 (정답과 비교하기 위해)
- run_benchmark_test: 벤치마크 테스트
- save_final_result: 결과물 제출을 위한 함수

In [5]:
### 수정해도 됩니다!
def extract_final_answer(response: str):
    regex = r"(?:Answer:|Model response:)\s*\$?([0-9,]+)\b|([0-9,]+)\s*(meters|cups|miles|minutes)"
    matches = re.finditer(regex, response, re.MULTILINE)
    results = [match.group(1) if match.group(1) else match.group(2).replace(",", "") for match in matches]

    if len(results) == 0:
        additional_regex = r"\$?([0-9,]+)"
        additional_matches = re.finditer(additional_regex, response, re.MULTILINE)
        results.extend([match.group(1).replace(",", "") for match in additional_matches])

    return results[-1] if results else None

In [None]:
### 수정해도 됩니다!
def run_benchmark_test(
        dataset,
        prompt: str,
        model: str = "llama3-8b-8192",
        num_samples: int = 50,
        VERBOSE: bool = False
    ):
    correct = 0
    total   = 0
    results = []

    for i in tqdm(range(min(num_samples, len(dataset)))):
        question = dataset[i]["question"]
        correct_answer = float(re.findall(r'\d+(?:\.\d+)?', dataset[i]["answer"].split('####')[-1])[0])

        response = generate_response_using_Llama(
            prompt=prompt.format(question=question),
            model=model
        )

        if response:
            if VERBOSE:
                print("="*50)
                print(response)
                print("="*50)
            predicted_answer = extract_final_answer(response)

            if isinstance(predicted_answer, str):
                # 말잘듣는것도 실력이다
                try:
                    predicted_answer = float(predicted_answer.replace(",", ""))
                except ValueError:
                    predicted_answer = 0
            
            diff = abs(predicted_answer - correct_answer)
            is_correct = diff < 1e-5 if predicted_answer is not None else False
            
            if is_correct:
                correct += 1
            total += 1
            
            results.append({
                'question': question,
                'correct_answer': correct_answer,
                'predicted_answer': predicted_answer,
                'response': response,
                'correct': is_correct
            })

            if (i + 1) % 5 == 0:
                current_acc = correct/total if total > 0 else 0
                print(f"Progress: [{i+1}/{num_samples}]")
                print(f"Current Acc.: [{current_acc:.2%}]")

    return results, correct/total if total > 0 else 0

In [7]:
def save_final_result(results: List[Dict[str, Any]], accuracy: float, filename: str) -> None:
    result_str = f"====== ACCURACY: {accuracy} ======\n\n"
    result_str += f"[Details]\n"
    
    for idx, result in enumerate(results):
        result_str += f"Question {idx+1}: {result['question']}\n"
        result_str += f"Correct Answer: {result['correct_answer']}\n"
        result_str += f"Predicted Answer: {result['predicted_answer']}\n"
        result_str += f"Correct: {result['correct']}\n\n"
    
    with open(filename, "w", encoding="utf-8") as f:
        f.write(result_str)

#### Direct prompting with few-shot example

In [10]:
def construct_direct_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )

    prompt = "Instruction:\nSolve the following mathematical question and generate ONLY the answer after a tag, 'Answer:' without any rationale.\n"

    for i in range(num_examples):
        cur_question = train_dataset['question'][i]
        cur_answer = train_dataset['answer'][i].split("####")[-1].strip()

        prompt += f"\n[Example {i+1}]\n"
        prompt += f"Question:\n{cur_question}\n"
        prompt += f"Answer:{cur_answer}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"

    return prompt

In [11]:
### 어떤 방식으로 저장되는지 확인해보세요!
PROMPT = construct_direct_prompt(3)
VERBOSE = False

results, accuracy = run_benchmark_test(
    dataset=gsm8k_test,
    prompt=PROMPT,
    VERBOSE=VERBOSE,
    num_samples=10
)
save_final_result(results, accuracy, "example.txt")

 50%|█████     | 5/10 [00:02<00:02,  2.00it/s]

Progress: [5/10]
Current Acc.: [20.00%]


100%|██████████| 10/10 [00:05<00:00,  1.70it/s]

Progress: [10/10]
Current Acc.: [20.00%]





In [10]:
# TODO: 0 shot, 3 shot, 5 shot direct prompting을 통해 벤치마크 테스트를 한 후, 각각 direct_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 direct_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    PROMPT = construct_direct_prompt(shot)
    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        VERBOSE=VERBOSE,
        num_samples=50
    )
    save_final_result(results, accuracy, f"direct_prompting_{shot}.txt")
    print(f"{shot} shot completed with {accuracy:.2%} accuracy")


 10%|█         | 5/50 [00:03<00:32,  1.40it/s]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:05<00:22,  1.82it/s]

Progress: [10/50]
Current Acc.: [30.00%]


 30%|███       | 15/50 [00:08<00:17,  1.95it/s]

Progress: [15/50]
Current Acc.: [20.00%]


 40%|████      | 20/50 [00:10<00:14,  2.03it/s]

Progress: [20/50]
Current Acc.: [25.00%]


 50%|█████     | 25/50 [00:13<00:17,  1.41it/s]

Progress: [25/50]
Current Acc.: [24.00%]


 60%|██████    | 30/50 [00:16<00:11,  1.75it/s]

Progress: [30/50]
Current Acc.: [26.67%]


 70%|███████   | 35/50 [00:30<00:35,  2.35s/it]

Progress: [35/50]
Current Acc.: [22.86%]


 80%|████████  | 40/50 [00:44<00:26,  2.64s/it]

Progress: [40/50]
Current Acc.: [20.00%]


 90%|█████████ | 45/50 [00:56<00:13,  2.62s/it]

Progress: [45/50]
Current Acc.: [20.00%]


100%|██████████| 50/50 [01:09<00:00,  1.40s/it]


Progress: [50/50]
Current Acc.: [20.00%]
0 shot completed with 20.00% accuracy


 10%|█         | 5/50 [00:12<01:55,  2.58s/it]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:26<01:50,  2.76s/it]

Progress: [10/50]
Current Acc.: [30.00%]


 30%|███       | 15/50 [00:39<01:31,  2.60s/it]

Progress: [15/50]
Current Acc.: [26.67%]


 40%|████      | 20/50 [00:53<01:20,  2.67s/it]

Progress: [20/50]
Current Acc.: [25.00%]


 50%|█████     | 25/50 [01:06<01:08,  2.73s/it]

Progress: [25/50]
Current Acc.: [24.00%]


 60%|██████    | 30/50 [01:20<00:55,  2.76s/it]

Progress: [30/50]
Current Acc.: [20.00%]


 70%|███████   | 35/50 [01:34<00:41,  2.74s/it]

Progress: [35/50]
Current Acc.: [17.14%]


 80%|████████  | 40/50 [01:48<00:27,  2.76s/it]

Progress: [40/50]
Current Acc.: [15.00%]


 90%|█████████ | 45/50 [02:01<00:12,  2.44s/it]

Progress: [45/50]
Current Acc.: [17.78%]


100%|██████████| 50/50 [02:14<00:00,  2.70s/it]


Progress: [50/50]
Current Acc.: [20.00%]
3 shot completed with 20.00% accuracy


 10%|█         | 5/50 [00:14<02:09,  2.87s/it]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:27<01:48,  2.72s/it]

Progress: [10/50]
Current Acc.: [20.00%]


 30%|███       | 15/50 [00:41<01:33,  2.68s/it]

Progress: [15/50]
Current Acc.: [13.33%]


 40%|████      | 20/50 [00:54<01:17,  2.58s/it]

Progress: [20/50]
Current Acc.: [20.00%]


 50%|█████     | 25/50 [01:08<01:06,  2.66s/it]

Progress: [25/50]
Current Acc.: [24.00%]


 60%|██████    | 30/50 [01:21<00:51,  2.59s/it]

Progress: [30/50]
Current Acc.: [23.33%]


 70%|███████   | 35/50 [01:35<00:41,  2.79s/it]

Progress: [35/50]
Current Acc.: [20.00%]


 80%|████████  | 40/50 [01:49<00:27,  2.77s/it]

Progress: [40/50]
Current Acc.: [17.50%]


 90%|█████████ | 45/50 [02:03<00:14,  2.85s/it]

Progress: [45/50]
Current Acc.: [17.78%]


100%|██████████| 50/50 [02:16<00:00,  2.73s/it]

Progress: [50/50]
Current Acc.: [18.00%]
5 shot completed with 18.00% accuracy





### Chain-of-Thought prompting with few-shot example
```text
[Question]
Janet’s ducks lay 16 eggs per day
 She eats three for breakfast every morning and bakes muffins for her friends every day with four
 She sells the remainder at the farmers' market daily for $2 per fresh duck egg
 How much in dollars does she make every day at the farmers' market?
====================================================================================================
[Answer]
Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.
She makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.
#### 18
```

[Answer] 아래의 정답을 도출하는 과정을 예시로 달아주면 CoT의 few shot이 되겠죠?

In [20]:
def construct_CoT_prompt(example_list: List[str], num_examples: int = 3):
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    prompt = "For each question, 1) Show a clear chain of thought. 2) Verify every arithmetic step. 3) In the LAST LINE, your text MUST be 'Answer:' without any rationale.\nAlso, make sure that you are correct.\n"

    for i in range(num_examples):
        idx = sampled_indices[i]
        q = train_dataset['question'][idx]
        a = re.findall(r'\d+(?:\.\d+)?', train_dataset['answer'][idx].split("####")[-1])[0]

        prompt += f"Q: {q}\n"
        prompt += "A: Let me think step by step.\n"
        prompt += "1) Extract numbers.\n"
        prompt += "2) Compute intermediate result.\n"
        prompt += f"3) Final: {a}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"
    prompt += "Check your work:\n"
    prompt += "1) Recompute key steps.\n"
    prompt += "2) Matches previous result?\n"
    prompt += "3) If not, correct the answer.\n"
    prompt += "Answer:"

    return prompt


In [None]:
# TODO: 0 shot, 3 shot, 5 shot CoT prompting을 통해 벤치마크 테스트를 한 후, 각각 CoT_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 CoT_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    PROMPT = construct_CoT_prompt([], shot)
    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        VERBOSE=VERBOSE,
        num_samples=50
    )
    save_final_result(results, accuracy, f"CoT_prompting_{shot}.txt")
    print(f"{shot} shot completed with {accuracy:.2%} accuracy")

 10%|█         | 5/50 [00:27<04:28,  5.97s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [01:00<04:25,  6.63s/it]

Progress: [10/50]
Current Acc.: [40.00%]


 30%|███       | 15/50 [01:44<05:36,  9.62s/it]

Progress: [15/50]
Current Acc.: [33.33%]


 40%|████      | 20/50 [02:15<03:22,  6.75s/it]

Progress: [20/50]
Current Acc.: [30.00%]


 50%|█████     | 25/50 [02:45<02:30,  6.02s/it]

Progress: [25/50]
Current Acc.: [28.00%]


 60%|██████    | 30/50 [03:14<01:58,  5.93s/it]

Progress: [30/50]
Current Acc.: [30.00%]


 70%|███████   | 35/50 [03:44<01:30,  6.05s/it]

Progress: [35/50]
Current Acc.: [31.43%]


 80%|████████  | 40/50 [04:17<01:06,  6.62s/it]

Progress: [40/50]
Current Acc.: [32.50%]


 90%|█████████ | 45/50 [04:50<00:33,  6.69s/it]

Progress: [45/50]
Current Acc.: [31.11%]


100%|██████████| 50/50 [05:21<00:00,  6.44s/it]


Progress: [50/50]
Current Acc.: [30.00%]
0 shot completed with 30.00% accuracy


 10%|█         | 5/50 [00:33<05:13,  6.96s/it]

Progress: [5/50]
Current Acc.: [60.00%]


 20%|██        | 10/50 [01:07<04:47,  7.19s/it]

Progress: [10/50]
Current Acc.: [50.00%]


 30%|███       | 15/50 [01:42<04:11,  7.18s/it]

Progress: [15/50]
Current Acc.: [46.67%]


 40%|████      | 20/50 [02:15<03:22,  6.76s/it]

Progress: [20/50]
Current Acc.: [45.00%]


 50%|█████     | 25/50 [02:45<02:26,  5.84s/it]

Progress: [25/50]
Current Acc.: [40.00%]


 60%|██████    | 30/50 [03:18<02:09,  6.48s/it]

Progress: [30/50]
Current Acc.: [36.67%]


 70%|███████   | 35/50 [03:47<01:29,  5.98s/it]

Progress: [35/50]
Current Acc.: [34.29%]


 80%|████████  | 40/50 [04:20<01:04,  6.49s/it]

Progress: [40/50]
Current Acc.: [37.50%]


 90%|█████████ | 45/50 [04:58<00:37,  7.41s/it]

Progress: [45/50]
Current Acc.: [33.33%]


100%|██████████| 50/50 [05:28<00:00,  6.56s/it]


Progress: [50/50]
Current Acc.: [34.00%]
3 shot completed with 34.00% accuracy


 10%|█         | 5/50 [00:30<04:32,  6.06s/it]

Progress: [5/50]
Current Acc.: [100.00%]


 20%|██        | 10/50 [01:04<04:43,  7.09s/it]

Progress: [10/50]
Current Acc.: [50.00%]


 30%|███       | 15/50 [01:38<03:47,  6.50s/it]

Progress: [15/50]
Current Acc.: [46.67%]


 40%|████      | 20/50 [02:04<02:59,  5.99s/it]

Progress: [20/50]
Current Acc.: [50.00%]


 50%|█████     | 25/50 [02:35<02:31,  6.06s/it]

Progress: [25/50]
Current Acc.: [44.00%]


 60%|██████    | 30/50 [03:08<02:10,  6.51s/it]

Progress: [30/50]
Current Acc.: [43.33%]


 70%|███████   | 35/50 [03:38<01:32,  6.17s/it]

Progress: [35/50]
Current Acc.: [45.71%]


 80%|████████  | 40/50 [04:10<01:04,  6.47s/it]

Progress: [40/50]
Current Acc.: [42.50%]


 90%|█████████ | 45/50 [04:44<00:32,  6.50s/it]

Progress: [45/50]
Current Acc.: [42.22%]


100%|██████████| 50/50 [05:16<00:00,  6.33s/it]

Progress: [50/50]
Current Acc.: [38.00%]
5 shot completed with 38.00% accuracy





### Construct your prompt!!

목표: 본인만의 프롬프트를 통해 정답률을 더 끌어올려보기!
- gsm8k의 train 데이터셋에서 예시를 가져온 다음 (자유롭게!)
- 그 예시들에 대한 풀이 과정을 만들어주세요!
- 모든 것들이 자유입니다! Direct Prompting, CoT Prompting을 한 결과보다 정답률만 높으면 돼요.

In [8]:
def construct_my_prompt(num_examples: int = 3) -> str:
    train_dataset = gsm8k_train

    sampled_indices = random.sample(
        [i for i in range(len(train_dataset['question']))],
        num_examples
    )
    prompt = "Instruction: Solve each problem step by step, showing your chain of thought. In the last row of your output, your text MUST be 'Answer:' without any rationale.\n"

    for i in range(num_examples):
        idx = sampled_indices[i]
        q = train_dataset['question'][idx]
        a = re.findall(r'\d+(?:\.\d+)?', train_dataset['answer'][idx].split("####")[-1])[0]

        prompt += f"Q: {q}\n"
        prompt += "A: Let me think step by step.\n"
        prompt += "1) Understand the problem.\n"
        prompt += "2) Perform the calculations step by step.\n"
        prompt += f"3) The final answer is: {a}\n"

    prompt += "\nQuestion:\n{question}\nAnswer:"
    
    return prompt

In [12]:
# TODO: 만든 0 shot, 3 shot, 5 shot example과 프롬프트를 통해 벤치마크 테스트를 한 후, 각각 My_prompting_{shot: int}.txt로 저장해주세요!
# 예시: shot이 5인 경우 My_prompting_5.txt
# 항상 num_samples=50 입니다!

for shot in [0, 3, 5]:
    PROMPT = construct_my_prompt(shot)
    results, accuracy = run_benchmark_test(
        dataset=gsm8k_test,
        prompt=PROMPT,
        VERBOSE=VERBOSE,
        num_samples=50
    )
    save_final_result(results, accuracy, f"My_prompting_{shot}.txt")
    print(f"{shot} shot completed with {accuracy:.2%} accuracy")

 10%|█         | 5/50 [00:04<00:36,  1.23it/s]

Progress: [5/50]
Current Acc.: [40.00%]


 20%|██        | 10/50 [00:08<00:34,  1.17it/s]

Progress: [10/50]
Current Acc.: [50.00%]


 30%|███       | 15/50 [00:21<01:36,  2.74s/it]

Progress: [15/50]
Current Acc.: [53.33%]


 40%|████      | 20/50 [00:26<00:37,  1.23s/it]

Progress: [20/50]
Current Acc.: [55.00%]


 50%|█████     | 25/50 [00:39<01:00,  2.43s/it]

Progress: [25/50]
Current Acc.: [48.00%]


 60%|██████    | 30/50 [00:53<00:55,  2.80s/it]

Progress: [30/50]
Current Acc.: [50.00%]


 70%|███████   | 35/50 [01:06<00:37,  2.49s/it]

Progress: [35/50]
Current Acc.: [57.14%]


 80%|████████  | 40/50 [01:23<00:36,  3.61s/it]

Progress: [40/50]
Current Acc.: [55.00%]


 90%|█████████ | 45/50 [01:38<00:15,  3.10s/it]

Progress: [45/50]
Current Acc.: [57.78%]


100%|██████████| 50/50 [01:56<00:00,  2.33s/it]


Progress: [50/50]
Current Acc.: [58.00%]
0 shot completed with 58.00% accuracy


 10%|█         | 5/50 [00:35<05:29,  7.33s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [01:15<05:15,  7.89s/it]

Progress: [10/50]
Current Acc.: [60.00%]


 30%|███       | 15/50 [01:53<04:40,  8.00s/it]

Progress: [15/50]
Current Acc.: [53.33%]


 40%|████      | 20/50 [02:33<03:58,  7.94s/it]

Progress: [20/50]
Current Acc.: [55.00%]


 50%|█████     | 25/50 [03:13<03:08,  7.53s/it]

Progress: [25/50]
Current Acc.: [48.00%]


 60%|██████    | 30/50 [03:51<02:33,  7.65s/it]

Progress: [30/50]
Current Acc.: [53.33%]


 70%|███████   | 35/50 [04:28<01:53,  7.58s/it]

Progress: [35/50]
Current Acc.: [57.14%]


 80%|████████  | 40/50 [05:06<01:16,  7.69s/it]

Progress: [40/50]
Current Acc.: [55.00%]


 90%|█████████ | 45/50 [05:44<00:38,  7.60s/it]

Progress: [45/50]
Current Acc.: [51.11%]


100%|██████████| 50/50 [06:26<00:00,  7.73s/it]


Progress: [50/50]
Current Acc.: [50.00%]
3 shot completed with 50.00% accuracy


 10%|█         | 5/50 [00:39<05:53,  7.85s/it]

Progress: [5/50]
Current Acc.: [80.00%]


 20%|██        | 10/50 [01:23<05:52,  8.81s/it]

Progress: [10/50]
Current Acc.: [50.00%]


 30%|███       | 15/50 [02:11<05:18,  9.11s/it]

Progress: [15/50]
Current Acc.: [40.00%]


 40%|████      | 20/50 [02:41<03:49,  7.66s/it]

Progress: [20/50]
Current Acc.: [40.00%]


 50%|█████     | 25/50 [03:24<03:23,  8.15s/it]

Progress: [25/50]
Current Acc.: [40.00%]


 62%|██████▏   | 31/50 [04:06<01:51,  5.84s/it]

Progress: [30/50]
Current Acc.: [36.67%]
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499906, Requested 675. Please try again in 1m40.3916s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 66%|██████▌   | 33/50 [04:06<00:50,  2.95s/it]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499905, Requested 704. Please try again in 1m45.2268s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499904, Requested 684. Please try again in 1m41.5938s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 70%|███████   | 35/50 [04:07<00:23,  1.55s/it]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499903, Requested 672. Please try again in 1m39.3432s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499902, Requested 683. Please try again in 1m41.05s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 74%|███████▍  | 37/50 [04:07<00:11,  1.18it/s]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499901, Requested 688. Please try again in 1m41.736s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499900, Requested 676. Please try again in 1m39.4894s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 78%|███████▊  | 39/50 [04:07<00:05,  1.98it/s]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499899, Requested 700. Please try again in 1m43.457599999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499898, Requested 682. Please try again in 1m40.1722s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 82%|████████▏ | 41/50 [04:08<00:02,  3.04it/s]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499897, Requested 720. Please try again in 1m46.5836s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499896, Requested 687. Please try again in 1m40.724199999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 86%|████████▌ | 43/50 [04:08<00:01,  3.99it/s]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499895, Requested 781. Please try again in 1m56.8124s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499894, Requested 730. Please try again in 1m47.815599999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 90%|█████████ | 45/50 [04:08<00:01,  4.67it/s]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499893, Requested 699. Please try again in 1m42.2838s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499892, Requested 709. Please try again in 1m43.8338s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 94%|█████████▍| 47/50 [04:09<00:00,  3.91it/s]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499890, Requested 755. Please try again in 1m51.333599999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499889, Requested 738. Please try again in 1m48.214999999s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


 98%|█████████▊| 49/50 [04:09<00:00,  4.71it/s]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499888, Requested 687. Please try again in 1m39.2332s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499887, Requested 683. Please try again in 1m38.374s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}


100%|██████████| 50/50 [04:10<00:00,  5.00s/it]

API call error: Error code: 429 - {'error': {'message': 'Rate limit reached for model `llama3-8b-8192` in organization `org_01k1mj2907erjv98ets7sd74x3` service tier `on_demand` on tokens per day (TPD): Limit 500000, Used 499886, Requested 690. Please try again in 1m39.4146s. Need more tokens? Upgrade to Dev Tier today at https://console.groq.com/settings/billing', 'type': 'tokens', 'code': 'rate_limit_exceeded'}}
5 shot completed with 36.67% accuracy





### 보고서 작성하기
#### 아래의 내용이 포함되면 됩니다!

1. Direct Prompting, CoT Prompting, My Prompting을 0 shot, 3 shot, 5 shot 정답률을 표로 보여주세요!
2. CoT Prompting이 Direct Prompting에 비해 왜 좋을 수 있는지에 대해서 서술해주세요!
3. 본인이 작성한 프롬프트 기법이 CoT에 비해서 왜 더 좋을 수 있는지에 대해서 설명해주세요!
4. 최종적으로, `PROMPTING.md`에 보고서를 작성해주세요!