In [1]:
try:
    import peft
    LOCAL = True
    MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"
    from functions import *
except:
    LOCAL = False
    MODEL_PATH = "/kaggle/input/deepseek-math"
    from functions_math import *
    import gc


import torch
if not LOCAL:torch.backends.cuda.enable_mem_efficient_sdp(False)
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map = "auto",
    torch_dtype="auto",
    trust_remote_code = True,
    use_flash_attention_2=LOCAL,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [2]:
if LOCAL:
    import json
    with open('../Data/AMC/aime_normal.json', 'r') as file:
        data = json.load(file)
    # to have consistent format as in Kaggle
    data = pd.DataFrame(data)
    data.rename(columns={'question': 'problem'}, inplace=True)
else:
    data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
    if len(data) < 5:
        data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
        PRIVATE = False
    else:
        PRIVATE = True

In [3]:
# no_repeat_processor,digits_processor -> rescore parse
# data = pd.read_csv("../llmOutputs/model1"+'.csv')
# data['final_answer'] = data.y
# data = data[data.yhat == 'parsing error']
# data.shape -> (102, 5)

In [4]:
def gen_prompt(problem):
    
    return f"""
### Instruction:\n{problem}\n\n
### Response: Let's think step by step. The final response should be a single number in the last line of your response.
"""

In [5]:
outs = []
no_repeat_processor = [NoRepeatTokenLogitsProcessor()]
# digits_processor = [DigitsOnlyLogitsProcessor(tokenizer)]
token2answer = tokenizer.encode("\nthe answer is:", return_tensors="pt",add_special_tokens=False).to('cuda')
for index, row in data.iterrows():
    problem = row['problem']
    query_prompt = gen_prompt(problem)
    
    messages = [
        {
            "role": "user",
            "content": query_prompt
        }
    ]

    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
    with torch.no_grad():
        encoded_output = model.generate(inputs, max_new_tokens=1500, do_sample=False, pad_token_id=tokenizer.eos_token_id,\
                                        logits_processor=no_repeat_processor)

    decoded_output = tokenizer.decode(encoded_output[0], skip_special_tokens=True).replace(query_prompt, '')
    try:
        answer = decoded_output.split('\n')[-1]
        answer = naive_parse(answer) % 1000
    except:
        print(decoded_output)
        answer = 'parsing error'
    if LOCAL:
        outs.append((problem,decoded_output,answer,int(row['final_answer'][0])))
    else:
        outs.append(37 if answer == 'parsing error' else answer)
        torch.cuda.empty_cache()
        gc.collect()
        if not PRIVATE:
            print(decoded_output)
            print(f'\nanswer is {answer}')

User: 

The first equation is $\sqrt{\log_b n} = \log_b \sqrt{n}$.
Squaring both sides, we get $\log_b n = \log_b n$, which is true.
So the first equation is satisfied for all $n > 0$.
The second equation is $b \cdot \log_b n = \log_b (bn)$.
Using the property of logarithms, we can rewrite this as $\log_b n^b = \log_b (bn)$.
This simplifies to $\log_b n^b = \log_b n + \log_b b = \log_b n + 1$.
So we have $\log_b n^b = \log_b n + 1$.
This is equivalent to $n^b = b \cdot n$, which simplifies to $n^{b-1} = b$.
Taking the logarithm base $b$ of both sides, we get $(b-1) \log_b n = 1$.
So $\log_b n = \frac{1}{b-1}$.
Using the definition of logarithms, we have $n = b^{\frac{1}{b-1}}$.
We are given that $n = \frac{j}{k}$, where $j$ and $k$ are relatively prime positive integers.
So we have $\frac{j}{k} = b^{\frac{1}{b-1}}$.
Rearranging, we get $j \cdot b^{b-1} = k \cdot b$.
Since $j$ and $k$ are relatively prime, we must have $j = k \cdot b$.
Substituting this into the equation, we get $k \cdo

In [8]:
if LOCAL:
    outs_df = pd.DataFrame(outs,columns=['problem','output','yhat','y'])
    print(f"correct: {sum(outs_df.yhat == outs_df.y)}")
    print(f"parse error: {sum(outs_df.yhat =='parsing error')}")
    out_path = create_next_model_folder('../llmOutputs')
    print(out_path) # ../llmOutputs/model1
    outs_df.to_csv(out_path+'/generations.csv', header=True, index=False)
else:
    if not PRIVATE:
        answers = data.answer.tolist()
        correct = sum([y==yhat for y,yhat in zip(answers,outs)])
        print(f'{correct} correct answers')    
    data['answer'] = outs
    data[['id','answer']].to_csv("submission.csv", header=True, index=False)

51
../llmOutputs/model1


In [10]:
sum(outs_df.yhat=='parsing error')

58