In [1]:
import huggingface
from datasets import load_dataset
import requests
import json
import os
from dotenv import load_dotenv

load_dotenv()

True

In [130]:
gsm8k = load_dataset("gsm8k", "main")


def _parse_gsm8k_answer(solution: str) -> int:
    """Return the integer written on the final ``#### <n>`` line of a GSM8K solution.

    Thousands separators are removed before conversion (``#### 1,200`` -> 1200).

    Raises:
        ValueError: if the solution has no ``####`` line, or the text after the
            marker is not an integer. Failing loudly keeps the parsed answers
            index-aligned with the questions; silently skipping a row would
            shift every later answer by one.
    """
    marked = [ln.strip() for ln in solution.split("\n") if ln.strip().startswith("####")]
    if not marked:
        raise ValueError(f"no '####' answer line found in solution: {solution!r}")
    return int(marked[-1].replace("####", "").strip().replace(",", ""))


# Exactly one parsed answer per training example, so gsm8k_answers[i] always
# belongs to gsm8k["train"][i]. Reading the column once avoids materialising
# every row individually.
gsm8k_answers = [_parse_gsm8k_answer(solution) for solution in gsm8k["train"]["answer"]]


In [22]:
# Fetch the MATH-500 test split once and keep only the two columns that the
# evaluation cells slice: reference answers and problem statements.
_math500_test = load_dataset("HuggingFaceH4/MATH-500")["test"]
math500_answers = _math500_test["answer"]
math500 = _math500_test["problem"]  # note: `math500` names the problem column, not the dataset

In [27]:
# Fetch the gpqa_main configuration once and keep only the question text and
# the correct-answer column of its train split.
_gpqa_train = load_dataset("Idavidrein/gpqa", 'gpqa_main')["train"]
gpqa_answers = _gpqa_train["Correct Answer"]
gpqa = _gpqa_train["Question"]  # note: `gpqa` names the question column, not the dataset

In [None]:
# Load LiveCodeBench's code-generation dataset, requesting the release_v5 tag.
# NOTE(review): `lcb_codegen` is not referenced by any other cell visible in
# this notebook — confirm whether it is still needed.
lcb_codegen = load_dataset("livecodebench/code_generation_lite", version_tag="release_v5")

### FUNCTIONS

In [60]:
def _extract_tagged_answer(text: str) -> str:
    """Return the stripped text inside the first <ANSWER>...</ANSWER> pair, or "" if there is no opening tag."""
    _, opening, remainder = text.partition("<ANSWER>")
    if not opening:
        return ""
    return remainder.split("<ANSWER>")[0].split("</ANSWER>")[0].strip()


def get_model_response(
    prompt: str,
    answer: str,
    filename: str = "response.txt",
    model: str = "anthropic/claude-3.5-sonnet",
    timeout: float = 120.0,
) -> "str | None":
    """Ask a model (via OpenRouter) to solve ``prompt`` and return its tagged final answer.

    The full exchange is printed and appended to ``filename``.

    Args:
        prompt: The problem statement, appended to the fixed instruction text.
        answer: The reference answer. Only used for printing/logging; it is
            converted with ``str()``, so integer answers are accepted.
        filename: Log file that the question, raw response, extracted answer
            and reference answer are appended to.
        model: OpenRouter model identifier.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error.

    Returns:
        The whitespace-stripped text between the ``<ANSWER>`` tags; an empty
        string if the response contains no ``<ANSWER>`` tag; ``None`` if the
        API returns a non-200 status or a payload without a message.
    """
    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}",
        },
        data=json.dumps({
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": (
                        """
                        Solve the math problem provided below. 
                        At the very end of your message, provide the answer to the problem in <ANSWER> </ANSWER> tags:
                        In the answer tags, ONLY provide the answer to the problem, no other text.
                        """
                        + prompt
                    )
                }
            ]
        }),
        # Without a timeout a stalled connection would hang the whole evaluation loop.
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None

    data = response.json()
    try:
        model_response = data['choices'][0]['message']['content'] or ""
    except (KeyError, IndexError, TypeError):
        # A 200 response can still carry an error object instead of choices.
        print(f"Error: unexpected response payload: {data}")
        return None

    model_answer = _extract_tagged_answer(model_response)
    if not model_answer:
        print("WARNING: no answer found inside <ANSWER> tags")

    correct_answer = str(answer)  # reference answers may be ints (e.g. GSM8K)
    print("CORRECT ANSWER: ", correct_answer)
    print("MODEL ANSWER: ", model_answer)
    # Responses routinely contain non-ASCII math symbols, so pin the encoding
    # instead of relying on the platform default.
    with open(filename, 'a', encoding="utf-8") as f:
        f.write("QUESTION: " + prompt + "\n\n")
        f.write("MODEL RESPONSE: " + model_response + "\n\n")
        f.write("MODEL ANSWER: " + model_answer + "\n\n")
        f.write("CORRECT ANSWER: " + correct_answer + "\n\n")

    return model_answer

In [107]:
def checking_answer(
    model_answer: str,
    correct_answer: str,
    filename: str,
    model: str = "anthropic/claude-3.5-sonnet",
    timeout: float = 120.0,
) -> bool:
    """Ask a model (via OpenRouter) whether two answers are equivalent.

    The judge is told to reply with YES or NO inside ``<ANSWER>`` tags. The
    verdict and the judge's raw reply are appended to ``filename``.

    Args:
        model_answer: The answer under test. ``None`` (no answer was produced)
            is treated as not equivalent without calling the API.
        correct_answer: The reference answer; converted with ``str()``.
        filename: Log file the comparison is appended to.
        model: OpenRouter model identifier used as the judge.
        timeout: Seconds to wait for the HTTP request before ``requests``
            raises a timeout error.

    Returns:
        ``True`` only if the judge's tagged verdict is "YES" (case-insensitive).
        ``False`` for "NO", for a reply without ``<ANSWER>`` tags, for a
        non-200 response, or for an unexpected payload.
    """
    if model_answer is None:
        print("Error: no model answer to check")
        return False

    model_answer = str(model_answer)
    correct_answer = str(correct_answer)

    response = requests.post(
        url="https://openrouter.ai/api/v1/chat/completions",
        headers={
            "Authorization": f"Bearer {os.getenv('OPENROUTER_API_KEY')}",
        },
        data=json.dumps({
            "model": model,
            "messages": [
                {
                    "role": "user",
                    "content": (
                        """
                        Check whether or not these two answers are equivalent.
                        At the very end of your message, provide the answer to the problem in <ANSWER> </ANSWER> tags:
                        If the answers are equivalent, there should only be "YES" in the answer tags.
                        If they aren't equivalent, there should only be "NO" in the answer tags.
                        Just a quick check is fine.
                        No need for elaborate reasoning.
                        First answer: """ + model_answer + "\n" + "Second answer: " + correct_answer
                    )
                }
            ]
        }),
        # Without a timeout a stalled connection would hang the whole evaluation loop.
        timeout=timeout,
    )
    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return False

    data = response.json()
    try:
        model_response = data['choices'][0]['message']['content'] or ""  # raw judge reply
    except (KeyError, IndexError, TypeError):
        # A 200 response can still carry an error object instead of choices.
        print(f"Error: unexpected response payload: {data}")
        return False

    # Text between the first <ANSWER> tag and its closing tag; "" if untagged.
    _, opening, remainder = model_response.partition("<ANSWER>")
    verdict = remainder.split("<ANSWER>")[0].split("</ANSWER>")[0].strip() if opening else ""
    final_answer = verdict.upper() == "YES"

    # Answers routinely contain LaTeX / non-ASCII math symbols, so pin the encoding.
    with open(filename, 'a', encoding="utf-8") as f:
        f.write("CORRECT ANSWER: " + correct_answer + '\n')
        f.write("MODEL OUTPUT: " + model_answer + '\n')
        f.write("COMPARISON ANSWER : ")
        f.write(("TRUE" if final_answer else "FALSE") + '\n')
        f.write(model_response + '\n')

    return final_answer

In [100]:
# Manual smoke test of the LLM judge: an identical pair, then a mismatched pair.
# Each call posts to the OpenRouter API and appends to test.txt. Only the
# second call's return value is echoed as the cell output.
checking_answer("6-5i","6-5i","test.txt")
checking_answer("5i","6-5i","test.txt")

False

In [64]:
# Hand-recorded indices of problems the model answered incorrectly.
# NOTE(review): the dataset/slice these indices refer to is not recorded here — confirm.
# The header line matches this cell's recorded output and the identical cell
# later in the notebook.
print("INDICES OF PROBLEMS THE MODEL GETS WRONG:")
wrong_indices = [59, 63, 73, 108, 118, 167]
print(wrong_indices)

INDICES OF PROBLEMS THE MODEL GETS WRONG:
[59, 63, 73, 108, 118, 167]


## OUTPUTS

MODEL_RESPONSE:  Let me help solve this problem step by step.

1. First, let's identify how many people need trail mix:
   * Baseball team members: 13
   * Coaches: 3
   * Helpers: 2
   * Total people: 13 + 3 + 2 = 18 people

2. Each pack contains 6 pouches of trail mix

3. To find how many packs Roger needs:
   * 18 people ÷ 6 pouches per pack = 3 packs

   However, 3 packs only provides 18 pouches, which is exactly enough. It's usually good to have a few extra, so Roger should buy 4 packs to be safe.

<ANSWER>4</ANSWER>
ANSWER:  4
Response saved to response.txt
CORRECT ANSWER:  3
MODEL_RESPONSE:  Let me help solve this step by step:

MODEL_RESPONSE:  Let me help solve this step by step.

1. First, let's identify Shawna's total goal:
   * Total goal = 30 situps

2. Now, let's count how many situps she's done so far:
   * Monday: 12 situps
   * Tuesday: 19 situps
   * Total done = 12 + 19 = 31 situps

3. To find out how many more she needs to meet her goal:
   * Need = Total goal - Total done
   * Need = 30 - 31
   * Need = -1

Since Shawna has already done 31 situps over Monday and Tuesday, which is more than her goal of 30, she doesn't need to do any additional situps on Wednesday to meet her minimum goal.

<ANSWER>0</ANSWER>
ANSWER:  0
Response saved to response.txt
CORRECT ANSWER:  59
MODEL_RESPONSE:  Let me help solve this step by step.

MODEL_RESPONSE:  Let me help solve this step by step.

1. First, let's identify Shawna's total goal:
   * Total goal = 30 situps

2. Now, let's count how many situps she's done so far:
   * Monday: 12 situps
   * Tuesday: 19 situps
   * Total done = 12 + 19 = 31 situps

3. To find out how many more she needs to meet her goal:
   * Need = Total goal - Total done
   * Need = 30 - 31
   * Need = -1

Since Shawna has already done 31 situps over Monday and Tuesday, which is more than her goal of 30, she doesn't need to do any additional situps on Wednesday to meet her minimum goal.

<ANSWER>0</ANSWER>
ANSWER:  0
Response saved to response.txt
CORRECT ANSWER:  59
MODEL_RESPONSE:  Let me help solve this step by step.

1. Henry took 9 pills per day for 14 days
   * Total pills = 9 × 14 = 126 pills total

2. Out of 9 daily pills:
   * 4 pills cost $1.50 each
   * 5 pills cost $1.50 + $5.50 = $7.00 each

3. Calculate total cost for cheaper pills:
   * Number of cheaper pills = 4 pills × 14 days = 56 pills
   * Cost = 56 × $1.50 = $84.00

4. Calculate total cost for expensive pills:
   * Number of expensive pills = 5 pills × 14 days = 70 pills
   * Cost = 70 × $7.00 = $490.00

5. Total cost for all pills:
   * Total = $84.00 + $490.00 = $574.00

<ANSWER>574</ANSWER>
ANSWER:  574
Response saved to response.txt
CORRECT ANSWER:  41

MODEL_RESPONSE:  Let me help solve this step by step:

1. First hour collection:
   * Joanne collected 15 coins

2. Next two hours:
   * She collected 35 coins

3. Fourth hour:
   * She collected 50 coins
   * She gave 15 coins to her coworker

4. Let's add it all up:
   * First hour: 15 coins
   * Next two hours: 35 coins
   * Fourth hour: 50 coins - 15 coins = 35 coins

5. Total coins = 15 + 35 + 35 = 85 coins

<ANSWER>85</ANSWER>
ANSWER:  85
Response saved to response.txt
CORRECT ANSWER:  120
MODEL_RESPONSE:  Let me help solve this step by step:

## Running the evaluations

In [None]:
# Single-attempt accuracy on GSM8K training problems 100-149.
train_subset_questions = gsm8k["train"][100:150]['question']
train_subset_answers = gsm8k_answers[100:150]


print("RUNNING ACCURACY TESTS:")

correct = 0
for question, expected in zip(train_subset_questions, train_subset_answers):
    # get_model_response needs a log file and logs the reference answer as
    # text, so pass the filename explicitly and the integer answer as a string.
    raw_answer = get_model_response(question, str(expected), "response.txt")
    try:
        predicted = int(str(raw_answer).strip().replace(",", ""))
    except ValueError:
        # Missing or non-integer answer: count it as wrong rather than
        # aborting the remaining samples.
        predicted = None
    if predicted == expected:
        correct += 1

print("ACCURACY for regular: ", correct/len(train_subset_answers))

RUNNING ACCURACY TESTS:
CORRECT ANSWER:  15
MODEL ANSWER:  15
Response appended to response.txt
CORRECT ANSWER:  36
MODEL ANSWER:  36
Response appended to response.txt
CORRECT ANSWER:  25
MODEL ANSWER:  25
Response appended to response.txt
CORRECT ANSWER:  258
MODEL ANSWER:  258
Response appended to response.txt
CORRECT ANSWER:  96
MODEL ANSWER:  96
Response appended to response.txt
CORRECT ANSWER:  320
MODEL ANSWER:  320
Response appended to response.txt
CORRECT ANSWER:  50
MODEL ANSWER:  50
Response appended to response.txt
CORRECT ANSWER:  97
MODEL ANSWER:  97
Response appended to response.txt
CORRECT ANSWER:  41
MODEL ANSWER:  574
Response appended to response.txt
CORRECT ANSWER:  20
MODEL ANSWER:  20
Response appended to response.txt
CORRECT ANSWER:  140
MODEL ANSWER:  140
Response appended to response.txt
CORRECT ANSWER:  6
MODEL ANSWER:  6
Response appended to response.txt
CORRECT ANSWER:  920
MODEL ANSWER:  920
Response appended to response.txt
CORRECT ANSWER:  60
MODEL ANSWER:

In [None]:
# Fail@k on GSM8K training problems 100-149: a problem only counts as correct
# if the model answers it correctly on every one of 4 independent attempts.
train_subset_questions = gsm8k["train"][100:150]['question']
train_subset_answers = gsm8k_answers[100:150]

print("RUNNING FAIL@K ACCURACY TESTS:")

correct = 0  # number of problems that passed all attempts
for i in range(len(train_subset_questions)):
    passing = True
    print(f"Running the {i}th sample")
    for j in range(4):
        print(f"{j}th try")
        # The reference answer is logged as text, so pass it as a string.
        answer = get_model_response(train_subset_questions[i], str(train_subset_answers[i]), "fail.txt")
        try:
            answer = int(str(answer).strip().replace(",", ""))
        except ValueError:
            # Missing or non-integer answer counts as a failed attempt.
            answer = None
        if answer != train_subset_answers[i]:
            passing = False
            print(f"Failed at the {j}th example")
            break
    # Count each problem at most once. Incrementing per successful attempt as
    # well would push the reported accuracy above 1.0.
    if passing:
        correct += 1

print("ACCURACY FOR FAIL@K: ", correct/len(train_subset_answers))

RUNNING FAIL@K ACCURACY TESTS:
Running the 0th sample
0th try
CORRECT ANSWER:  15
MODEL ANSWER:  15
1th try
CORRECT ANSWER:  15
MODEL ANSWER:  15
2th try
CORRECT ANSWER:  15
MODEL ANSWER:  15
3th try
CORRECT ANSWER:  15
MODEL ANSWER:  15
Running the 1th sample
0th try
CORRECT ANSWER:  36
MODEL ANSWER:  36
1th try
CORRECT ANSWER:  36
MODEL ANSWER:  36
2th try
CORRECT ANSWER:  36
MODEL ANSWER:  36
3th try
CORRECT ANSWER:  36
MODEL ANSWER:  36
Running the 2th sample
0th try
CORRECT ANSWER:  25
MODEL ANSWER:  25
1th try
CORRECT ANSWER:  25
MODEL ANSWER:  25
2th try
CORRECT ANSWER:  25
MODEL ANSWER:  25
3th try
CORRECT ANSWER:  25
MODEL ANSWER:  25
Running the 3th sample
0th try
CORRECT ANSWER:  258
MODEL ANSWER:  258
1th try
CORRECT ANSWER:  258
MODEL ANSWER:  258
2th try
CORRECT ANSWER:  258
MODEL ANSWER:  258
3th try
CORRECT ANSWER:  258
MODEL ANSWER:  258
Running the 4th sample
0th try
CORRECT ANSWER:  96
MODEL ANSWER:  96
1th try
CORRECT ANSWER:  96
MODEL ANSWER:  96
2th try
CORRECT AN

Fail@4 accuracy for the run above: 0.94 (3 of the 50 problems failed at least one of their four attempts).

In [None]:
# Hand-maintained list of missed problem indices, shown under a header line.
wrong_indices = [59, 63, 73, 108, 118, 167]
print("INDICES OF PROBLEMS THE MODEL GETS WRONG:", wrong_indices, sep="\n")

In [102]:
# Single-attempt accuracy on the first `length` MATH-500 problems, graded by
# the LLM judge in checking_answer.
length = 30
math500_subset_questions = math500[:length]
math500_subset_answers = math500_answers[:length]

# Truncate both log files so they only hold this run.
for log_path in ('math500.txt', 'checking_math500.txt'):
    with open(log_path, 'w') as f:
        f.write('')


print("RUNNING ACCURACY TESTS:")

correct_count = 0
for i in range(length):
    question = math500_subset_questions[i]
    reference = math500_subset_answers[i]

    answer = get_model_response(question, reference, 'math500.txt')
    correct = checking_answer(answer, reference, "checking_math500.txt")

    if not correct:
        print("WRONG")
        continue
    correct_count += 1
    print("CORRECT")


print("ACCURACY for regular: ", correct_count/len(math500_subset_answers))

RUNNING ACCURACY TESTS:
CORRECT ANSWER:  \left( 3, \frac{\pi}{2} \right)
MODEL ANSWER:  (3,π/2)
CORRECT
CORRECT ANSWER:  p - q
MODEL ANSWER:  p - q
CORRECT
CORRECT ANSWER:  \frac{14}{3}
MODEL ANSWER:  14/3
CORRECT
CORRECT ANSWER:  9
MODEL ANSWER:  9
CORRECT
CORRECT ANSWER:  \text{Evelyn}
MODEL ANSWER:  Evelyn
CORRECT
CORRECT ANSWER:  42
MODEL ANSWER:  42
CORRECT
CORRECT ANSWER:  27
MODEL ANSWER:  27
CORRECT
CORRECT ANSWER:  90^\circ
MODEL ANSWER:  72
WRONG
CORRECT ANSWER:  3\sqrt{13}
MODEL ANSWER:  3√13
CORRECT
CORRECT ANSWER:  4
MODEL ANSWER:  2
WRONG
CORRECT ANSWER:  2220
MODEL ANSWER:  2220
CORRECT
CORRECT ANSWER:  \frac{3}{56}
MODEL ANSWER:  8/63
WRONG
CORRECT ANSWER:  284
MODEL ANSWER:  284
CORRECT
CORRECT ANSWER:  5
MODEL ANSWER:  5
CORRECT
CORRECT ANSWER:  \sqrt{51}
MODEL ANSWER:  √51
CORRECT
CORRECT ANSWER:  6 - 5i
MODEL ANSWER:  6 - 5i
CORRECT
CORRECT ANSWER:  -50
MODEL ANSWER:  -50
CORRECT
CORRECT ANSWER:  \pi
MODEL ANSWER:  π
CORRECT
CORRECT ANSWER:  28
MODEL ANSWER:  56
WRO

In [None]:
# Fail@4 on the first `length` MATH-500 problems: a problem passes only if the
# judge accepts the model's answer on all four attempts.
length = 30
math500_subset_questions = math500[:length]
math500_subset_answers = math500_answers[:length]

# Truncate both log files so they only hold this run.
for log_path in ('math500_fail.txt', 'checking_math500_fail.txt'):
    with open(log_path, 'w') as f:
        f.write('')


print("RUNNING TESTS FOR FAIL@4:")

correct_count = 0
for i in range(length):
    print(f"Running the {i}th sample")
    question = math500_subset_questions[i]
    reference = math500_subset_answers[i]
    for j in range(4):
        print(f"{j}th try")
        answer = get_model_response(question, reference, "math500_fail.txt")
        correct = checking_answer(answer, reference, "checking_math500_fail.txt")

        if not correct:
            print(f"Failed at the {j}th try")
            break
        print("correct")
    else:
        # Reached only when no attempt broke out of the loop, i.e. all four passed.
        print(f"model passed the {i}th sample")
        correct_count += 1


print("ACCURACY for FAIL@4: ", correct_count/len(math500_subset_answers))

RUNNING TESTS FOR FAIL@4:
Running the 0th sample
0th try
CORRECT ANSWER:  \left( 3, \frac{\pi}{2} \right)
MODEL ANSWER:  (3, π/2)
correct
1th try
CORRECT ANSWER:  \left( 3, \frac{\pi}{2} \right)
MODEL ANSWER:  (3,π/2)
correct
2th try
CORRECT ANSWER:  \left( 3, \frac{\pi}{2} \right)
MODEL ANSWER:  (3,π/2)
correct
3th try
CORRECT ANSWER:  \left( 3, \frac{\pi}{2} \right)
MODEL ANSWER:  (3,1.5708)
correct
model passed the 0th sample
Running the 1th sample
0th try
CORRECT ANSWER:  p - q
MODEL ANSWER:  p - q
correct
1th try
CORRECT ANSWER:  p - q
MODEL ANSWER:  p - q
correct
2th try
CORRECT ANSWER:  p - q
MODEL ANSWER:  p-q
correct
3th try
CORRECT ANSWER:  p - q
MODEL ANSWER:  p-q
correct
model passed the 1th sample
Running the 2th sample
0th try
CORRECT ANSWER:  \frac{14}{3}
MODEL ANSWER:  14/3
correct
1th try
CORRECT ANSWER:  \frac{14}{3}
MODEL ANSWER:  14/3
correct
2th try
CORRECT ANSWER:  \frac{14}{3}
MODEL ANSWER:  14/3
correct
3th try
CORRECT ANSWER:  \frac{14}{3}
MODEL ANSWER:  14/3
c

Fail@4 accuracy for the run above: 57%.

### Claude 3.5 Sonnet on a 30-problem subset of MATH-500: 73% single-attempt accuracy vs. 57% fail@4 (all four attempts correct)