In [1]:
# OUTPUT_FILE = "teacher_finetune_tweak_dataset.jsonl"

In [2]:
import os
import pandas as pd
import glob
import json
import re
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
from dotenv import load_dotenv
import requests

In [3]:
# Load environment variables from .env file
load_dotenv()

True

## Load Questions similar to ta

In [4]:
def load_all_jsonl_to_dataframe(file_path):
    """Load all JSONL files from a folder into a pandas DataFrame."""
    
    try:
        # Read file into dataframe
        df = pd.read_json(file_path, lines=True)
        # df['source_file'] = os.path.basename(file_path)  # Optional: track source file
        print(f"Loaded {len(df)} rows from {os.path.basename(file_path)}")
        return df
    except Exception as e:
        print(f"Error loading {file_path}: {e}")

# Usage
file_path = "../top_600_ripe_0_similar/ripe-0-similar-ripe-epochs-30/grpo.jsonl"
raw_data = load_all_jsonl_to_dataframe(file_path)

Loaded 211 rows from grpo.jsonl


## Display sample data

In [5]:
# Display the first few rows
raw_data.head(100)

Unnamed: 0,problem,answer
0,\nConsider all possible quadratic polynomials ...,50
1,\nDecide whether there is an integer \( n > 1 ...,561
2,\nWe wrote the numbers from 1 to 2009 on a pie...,2009
3,$(1)$ $f(n)$ is a function defined on the set ...,364
4,$(1)$ State the operation laws or rules used i...,-\frac{1}{2}
...,...,...
95,"For the set $M$, define the function $f_M(x) =...",16
96,For what is the smallest $n$ such that there e...,44
97,"Four different natural numbers, of which one i...",40
98,"Four people, A, B, C, and D, stand on a stairc...",2394


In [6]:
print(raw_data.iloc[0]['problem'])


Consider all possible quadratic polynomials $x^2 + px + q$ with a positive discriminant, where the coefficients $p$ and $q$ are integers divisible by 5. Find the largest natural number $n$ such that for any polynomial with the described properties, the sum of the hundredth powers of the roots is an integer divisible by $5^n$. Let's think step by step and output the final answer within \boxed{}.


## Solution Extraction Utilities

Functions to extract solutions from model outputs and compute correctness.

In [7]:
def remove_boxed(s):
    if s is None:
        return None

    left = "\\boxed{"

    if s[:len(left)] != left:
        return None
    if s[-1] != "}":
        return None

    return s[len(left):-1]


def last_boxed_only_string(string):
    idx = string.rfind("\\boxed")
    if "\\boxed " in string:
        return "\\boxed " + string.split("\\boxed ")[-1].split("$")[0]
    if idx < 0:
        idx = string.rfind("\\fbox")
        if idx < 0:
            return None

    i = idx
    right_brace_idx = None
    num_left_braces_open = 0
    while i < len(string):
        if string[i] == "{":
            num_left_braces_open += 1
        if string[i] == "}":
            num_left_braces_open -= 1
            if num_left_braces_open == 0:
                right_brace_idx = i
                break
        i += 1

    if right_brace_idx is None:
        retval = None
    else:
        retval = string[idx:right_brace_idx + 1]

    return retval

def extract_solution(text):
    return remove_boxed(last_boxed_only_string(text))

def correctness_reward_func(response: str, actual_answers: str) -> float:
    extracted_answer = extract_solution(response)
    return 1.0 if extracted_answer == actual_answers else 0.0

## Get Teacher Reasoning Traces

## Example

In [10]:
# Simply change the API Key to get started

from openai import OpenAI

client = OpenAI(
    api_key=os.environ['BASETEN_API_KEY'],
    base_url="https://model-rwn19e03.api.baseten.co/environments/production/sync/v1"
)

response = client.chat.completions.create(
    model="placeholder",
    messages=[{"role":"user","content":"Tell me everything you know about optimized inference."}],
)

print(response.choices[0].message.content)

Okay, I need to explain everything I know about optimized inference. Let me start by recalling what inference is in machine learning. Inference refers to the process of using a trained model to make predictions on new data. Optimized inference would then be about


In [8]:
import os
from azure.ai.inference import ChatCompletionsClient
from azure.ai.inference.models import SystemMessage, UserMessage
from azure.core.credentials import AzureKeyCredential

In [9]:
def get_reasoning_trace(problem):  # 5 minutes default timeout
    """Get a reasoning trace for a single problem using Groq Flex API"""
    
    try:
        endpoint = "https://ai-xiiiiiiiiii4559ai500267034787.services.ai.azure.com/models"
        model_name = "DeepSeek-R1"
        
        client = ChatCompletionsClient(
            endpoint=endpoint,
            credential=AzureKeyCredential(os.getenv('AZURRE_API_KEY')),
        )
        
        response = client.complete(
            messages=[
                UserMessage(content=problem)
            ],
            max_tokens=24576,
            model=model_name,
            timeout=600  # 10 minutes in seconds
        )
        
        # print(response.choices[0].message.content)
        # return out
        
        return response.choices[0].message.content

    except Exception as e:
        print(f"Error making API request: {str(e)}")
        return None


In [10]:
# def get_reasoning_trace_request(problem, model="qwen-qwq-32b", timeout=300):  # 5 minutes default timeout
#     """Get a reasoning trace for a single problem using Groq Flex API"""
    
#     try:
#         response = requests.post(
#             "https://api.groq.com/openai/v1/chat/completions",
#             headers={
#                 "Content-Type": "application/json",
#                 "Authorization": f"Bearer {os.getenv('GROQ_API_KEY')}"
#             },
#             json={
#                 "service_tier": "on_demand",
#                 "model": model,
#                 "messages": [
#                     {"role": "user", "content": problem}
#                 ],
#                 "temperature": 0.8,  # Increased temperature for more diverse reasoning paths
#                 "top_p": 0.95,
#                 "max_tokens": 24576
#             },
#             timeout=timeout  # Add timeout parameter here (in seconds)
#         )
        
#         if response.status_code != 200:
#             print(f"Error: API returned status code {response.status_code}")
#             print(f"Response: {response.text}")
#             return None
            
#         return response.json()
#     except Exception as e:
#         print(f"Error making API request: {str(e)}")
#         return None

In [11]:
ORIG_SYSTEM_PROMPT = "Let's think step by step and output the final answer within \\boxed{}."
NEW_SYSTEM_PROMPT = 'Plan your answer. Check your answer against everything in the question and if it is not correct, go back to your last checked answer. Think step by step and output the final answer within \\boxed{}.'

tweak_dataset = [
    {
        'problem': r['problem'],
        'answer': str(r['answer']),
    }
    for index, r in raw_data.reset_index().iterrows()
]

for r in tweak_dataset:
    if ORIG_SYSTEM_PROMPT in r['problem']:
        r['problem'] = r['problem'].replace(ORIG_SYSTEM_PROMPT, NEW_SYSTEM_PROMPT)

# len(tweak_dataset)
# tweak_dataset[:3]

In [12]:
print(tweak_dataset[0]['problem'])


Consider all possible quadratic polynomials $x^2 + px + q$ with a positive discriminant, where the coefficients $p$ and $q$ are integers divisible by 5. Find the largest natural number $n$ such that for any polynomial with the described properties, the sum of the hundredth powers of the roots is an integer divisible by $5^n$. Plan your answer. Check your answer against everything in the question and if it is not correct, go back to your last checked answer. Think step by step and output the final answer within \boxed{}.


In [13]:
tweak_dataset[0]['answer']

'50'

In [14]:
# response = get_reasoning_trace(tweak_dataset[0]['problem'])
# print(response)
# str(extract_solution(response))

## Get All Teacher Reasoning Traces

In [15]:
# Debug
# limit = 1
# tweak_dataset = tweak_dataset[:limit]

In [16]:
len(tweak_dataset)

211

In [17]:
for i, r in enumerate(tweak_dataset):
    print(f"{i}: {r['problem']}")
    try:
        if r.get('reward', 0.0) != 1.0:
            # print(f"{i}: {r.get('reward', 0.0)}")
            r['teacher_trace'] = get_reasoning_trace(r['problem'])
    except Exception as e:
        print(f"Error making API request: {str(e)}")
        # r['teacher_trace'] = None

# tweak_dataset

0: 
Consider all possible quadratic polynomials $x^2 + px + q$ with a positive discriminant, where the coefficients $p$ and $q$ are integers divisible by 5. Find the largest natural number $n$ such that for any polynomial with the described properties, the sum of the hundredth powers of the roots is an integer divisible by $5^n$. Plan your answer. Check your answer against everything in the question and if it is not correct, go back to your last checked answer. Think step by step and output the final answer within \boxed{}.
Error making API request: (Timeout) The operation was timeout.
Code: Timeout
Message: The operation was timeout.
1: 
Decide whether there is an integer \( n > 1 \) with the following properties:

(a) \( n \) is not a prime number.

(b) For all integers \( a \), \( a^{n} - a \) is divisible by \( n \). Plan your answer. Check your answer against everything in the question and if it is not correct, go back to your last checked answer. Think step by step and output the

In [18]:
for i, r in enumerate(tweak_dataset):
    if tweak_dataset[i].get('teacher_trace', None) is None:
        print(f"{i} skipped")
        continue
    r['extracted_solution'] = str(extract_solution(tweak_dataset[i]['teacher_trace']))
    r['reward'] = correctness_reward_func(tweak_dataset[i]['teacher_trace'], tweak_dataset[i]['answer'])
    print(f"{i} Gold answer {tweak_dataset[i]['answer']} extracted answer {r['extracted_solution']} reward {r['reward']}")

0 skipped
1 Gold answer 561 extracted answer 561 reward 1.0
2 skipped
3 Gold answer 364 extracted answer 364 reward 1.0
4 Gold answer -\frac{1}{2} extracted answer -\dfrac{1}{2} reward 0.0
5 Gold answer 432 extracted answer 33075 reward 0.0
6 Gold answer 202500 extracted answer 680 reward 0.0
7 Gold answer 16 extracted answer 16 reward 1.0
8 Gold answer 750 extracted answer -750 reward 0.0
9 skipped
10 Gold answer 2100 extracted answer 65 reward 0.0
11 Gold answer 20 extracted answer 20 reward 1.0
12 Gold answer 36 extracted answer 36 reward 1.0
13 skipped
14 Gold answer 6 extracted answer 8 reward 0.0
15 Gold answer 20 extracted answer 20 reward 1.0
16 skipped
17 Gold answer 125 extracted answer 125 reward 1.0
18 Gold answer 6\% extracted answer 6\% reward 1.0
19 skipped
20 skipped
21 skipped
22 skipped
23 Gold answer 5 extracted answer 5 reward 1.0
24 Gold answer 20 extracted answer 20 reward 1.0
25 skipped
26 Gold answer 107\% extracted answer 107.06\% reward 0.0
27 skipped
28 Gold 

In [19]:
correct = [r for r in tweak_dataset if r.get('reward', None) == 1.0]
len(correct)

74

In [20]:
# Create directory if it doesn't exist
dirname = os.path.dirname(OUTPUT_FILE)
if len(dirname.strip()) > 0:
    os.makedirs(dirname, exist_ok=True)

# Save to JSONL file
with open(OUTPUT_FILE, 'w') as f:
    for item in correct:
        f.write(json.dumps(item) + '\n')

print(f"Saved {len(correct)} records to {OUTPUT_FILE}")

Saved 74 records to teacher_finetune_tweak_dataset.jsonl
