In [1]:
# Environment probe. The first branch is taken when vllm and the local
# `functions` helper module both import cleanly; LOCAL is then True and the
# model is referenced by its hub identifier.
try:
    from vllm import LLM, SamplingParams
    LOCAL = True
    MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"
    from functions import *
    dtype = 'auto'
    gpu_memory_utilization = 0.95

# NOTE(review): this bare except is reached on ANY failure above (including a
# missing `functions` module), not only when vllm is absent.
# The fallback installs vllm offline from a wheel directory under
# /kaggle/input, then uses the Kaggle model path and half precision.
except:
    %pip uninstall -y torch -q
    %pip install --no-index --find-links=/kaggle/input/vllm-whl -U vllm -q
    from vllm import LLM, SamplingParams
    LOCAL = False
    MODEL_PATH = "/kaggle/input/deepseek-math"
    dtype = 'half'
    gpu_memory_utilization = 0.99
    from functions_math import *


import torch
import pandas as pd
import subprocess
import sys
import gc
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# Generation settings shared by the later cells.
iterations = 3      # number of code-generation rounds run by the driver loop
repeats = 6         # initial samples per problem; the driver cell reassigns this each round
timeout = 7         # seconds allowed for each generated script (subprocess.run timeout in process_code)
temperature = 0.7   # sampling temperature handed to SamplingParams
max_tokens = 1600   # per-sample generation cap handed to SamplingParams
normalize = True    # when True, scores use cumulative_logprob divided by the token count
# Build the vLLM engine from the path/dtype/memory settings chosen in the first cell.
llm = LLM(model=MODEL_PATH,
          dtype=dtype,
          enforce_eager=True,
          gpu_memory_utilization=gpu_memory_utilization,
          swap_space=8,
          max_model_len=2048,
          kv_cache_dtype="fp8_e5m2",
          tensor_parallel_size=1)
tokenizer = llm.get_tokenizer()
# Generation stops at the tokenizer's end-of-sequence token.
stop_words = [tokenizer.eos_token]
# One completion per submitted prompt; the driver cell gets several samples per
# problem by repeating each prompt rather than by raising n.
sampling_params = SamplingParams(n = 1, best_of= 1,
                                 temperature=temperature,
                                 max_tokens=max_tokens,
                                 stop=stop_words)

INFO 04-23 14:03:41 utils.py:253] CUDA_HOME is not found in the environment. Using /usr/local/cuda as CUDA_HOME.
INFO 04-23 14:03:41 config.py:381] Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO 04-23 14:03:41 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='deepseek-ai/deepseek-math-7b-rl', tokenizer='deepseek-ai/deepseek-math-7b-rl', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-23 14:03:42 selector.py:16] Using FlashAttention backend.
INFO 04-23 14:03:42 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 04-23 14:03:44 model_runner.py:104] Loading model weights took 12.8725 GB
INFO 04-23 14:03:44 gpu_executor.py:94] # GPU blocks: 2309, # CPU blocks: 2184


In [3]:
# Load the problem set into `data`, a DataFrame with a `problem` column.
if LOCAL:
    import json
    with open('../Data/AMC/aime_normal.json', 'r') as file:
        data = json.load(file)
    # to have consistent format as in Kaggle
    data = pd.DataFrame(data)
    data.rename(columns={'question': 'problem'}, inplace=True)
else:
    data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
    # NOTE(review): a test file with fewer than 5 rows is treated as the public
    # placeholder, presumably so the notebook can be scored on train.csv - confirm.
    if len(data) < 5:
        data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
        PRIVATE = False
    else:
        PRIVATE = True
# PRIVATE is only defined on the non-LOCAL branch; the last cell reads it only
# when LOCAL is False.

In [4]:
def is_integer(num):
    """Return True when ``num`` is a whole number stored as an int or a float.

    Args:
        num: Any object, typically the result of evaluating program output.

    Returns:
        bool: True for ints and for floats with no fractional part; False for
        everything else, including booleans.
    """
    # bool is a subclass of int, so without this guard a printed True/False
    # would be accepted as the numeric answer 1/0.
    if isinstance(num, bool):
        return False
    if isinstance(num, float):
        return num.is_integer()
    return isinstance(num, int)
    
def is_between_0_and_999(num):
    """Tell whether ``num`` lies in the closed interval [0, 999]."""
    if not 0 <= num:
        return False
    return num <= 999

def gen_prompt_codeIn1(problem):
    """Build the first-round prompt asking for reasoning followed by Python code.

    Args:
        problem: Problem statement to embed in the prompt.

    Returns:
        str: The prompt text, with the problem under a "### Problem:" heading
        and the instructions under a "### Response:" heading.
    """
    # Whitespace (including the trailing space before the second line break)
    # is part of the prompt, so every piece is spelled out explicitly.
    pieces = (
        f"\n### Problem:\n{problem}\n\n",
        "### Response: Let's first reason step by step. Then write Python code to solve the problem, using brute force enumeration if necessary. \n",
        "Limit the search space using logical deductions. The code should be enclosed between ```python\n actual code...``` and should only print the final answer.\n",
        "The final answer is an integer between 0 and 999.\n",
    )
    return "".join(pieces)

def parse_err(input):
    """Build the retry prompt used when no fenced code block could be extracted.

    Args:
        input: The model response that failed to parse; echoed back verbatim.

    Returns:
        str: The response followed by a reminder about the expected code fence.
    """
    reminder = "    The code should be enclosed between ```python\n actual code...``` Please fix.\n"
    return f" \n{input}\n\n" + reminder

def time_err(code):
    """Build the retry prompt used when the generated program timed out.

    Args:
        code: Source of the program that exceeded the time limit.

    Returns:
        str: The code followed by advice on shrinking the search space.
    """
    advice = "    code timed out. Try to use math reasoning to limit the search space and find ways to exit loop early where appropriate.\n"
    return f" \n{code}\n\n" + advice

def code_err(code, err):
    """Build the retry prompt used when the generated program wrote to stderr.

    Args:
        code: Source of the program that failed.
        err: Error text reported by the program.

    Returns:
        str: The code, the error text, and a request to fix it.
    """
    parts = [
        f" \n{code}\n\n",
        f"    The code encounters this error: {err}\n.\n",
        "    Please fix.\n",
    ]
    return "".join(parts)

def eval_err(code):
    """Build the retry prompt used when the program output could not be evaluated.

    Args:
        code: Source of the program whose output was not a usable number.

    Returns:
        str: The code followed by a reminder to print only the final number.
    """
    reminder = "    code should only print the final number.\n"
    return f" \n{code}\n\n" + reminder

def number_range_type_err(problem,input,answer):
    """Build the retry prompt used when the answer is not an integer in [0, 999].

    Args:
        problem: Original problem statement.
        input: The model's previous full response.
        answer: The value that was obtained; rendered with ``str``.

    Returns:
        str: Problem, previous solution, the offending value, and a request
        for corrected Python code.
    """
    # Each segment reproduces one physical line of the prompt, including its
    # leading indentation and the lone space that follows the problem text.
    segments = [
        f" \nThis is the original problem: {problem}\n \n",
        f"    \nThis is the first solution: {input}\n\n",
        f"    However, answer should be a integer between 0 and 999 but got {answer!s}. Please revisit the logics and\n",
        "    generate python code to answer the question.\n",
        "    The code should be enclosed between ```python\n actual code...``` and should only print the final answer.\n",
    ]
    return "".join(segments)
def process_inputs(inputs,prompt_fun):
    """Turn problem strings into chat-formatted prompt texts.

    Args:
        inputs: Iterable of problem strings.
        prompt_fun: Callable mapping one problem to its user-message text.

    Returns:
        list: One untokenized chat-template string per problem, in input
        order, produced with the module-level ``tokenizer``.
    """
    def _as_chat_text(problem):
        # A single user turn carrying the generated prompt.
        conversation = [{"role": "user","content": prompt_fun(problem)}]
        return tokenizer.apply_chat_template(conversation, tokenize=False)

    return [_as_chat_text(problem) for problem in inputs]

from functools import partial
def get_value_code(output,normalize):
    """Score one generation so the most promising code sample can be picked.

    Args:
        output: Object whose ``outputs[0]`` exposes ``token_ids``, ``text``
            and ``cumulative_logprob``.
        normalize: When truthy, divide the log-probability by the token count.

    Returns:
        float: ``-inf`` for empty generations and for texts that have neither
        a python code fence nor exactly one ``print(`` call; otherwise the
        (optionally length-normalized) cumulative log-probability.
    """
    sample = output.outputs[0]
    n_tokens = len(sample.token_ids)
    if n_tokens == 0:
        return float('-inf')

    generated = sample.text
    has_fence = "```python" in generated
    single_print = generated.count("print(") == 1
    if not (has_fence or single_print):
        return float('-inf')

    logprob = sample.cumulative_logprob
    return logprob / n_tokens if normalize else logprob
    
def process_code(inputs,problems,repeats,normalize):
    """Run the best code sample for each problem and classify the outcome.

    The flat list of generations is split into groups of ``repeats`` samples
    (one group per problem). The sample with the highest ``get_value_code``
    score is selected, its first fenced block is written to ``code.py`` and
    executed in a subprocess bounded by the module-level ``timeout``.

    Args:
        inputs: Flat list of generation results, ``repeats`` per problem.
        problems: Problem statements, aligned with the groups.
        repeats: Number of samples generated per problem.
        normalize: Forwarded to ``get_value_code``.

    Returns:
        list: One entry per problem. An ``int`` is an accepted answer in
        [0, 999]; a ``str`` is a follow-up prompt describing what went wrong
        (parse failure, timeout, runtime error, bad output, bad range/type).
    """
    grouped = group_elements(inputs, repeats) # [[out1_re1, out1_re2,...],[out2_re1,out2_re2...],...]
    outs = []
    for samples,problem in zip(grouped,problems):
        # samples is a list of vllm.outputs.RequestOutput
        best = max(samples,key=partial(get_value_code,normalize=normalize))
        response = best.outputs[0].text
        # parse err: no fenced block means split() yields a single element.
        try:
            # [7:] drops the 7-character "python" + newline header of the fence.
            code = response.split('```')[1][7:]
        except IndexError:
            outs.append(parse_err(response))
            continue
        # execute code; Python reads source files as UTF-8, so write it that way.
        with open('code.py', 'w', encoding='utf-8') as fout:
            fout.write(code)
        # timeout err
        try:
            process = subprocess.run([sys.executable, 'code.py'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
        except subprocess.TimeoutExpired:
            outs.append(time_err(code))
            continue
        if process.stderr:# code.py err
            # errors='replace': arbitrary program output must not abort the run.
            stderr = process.stderr.decode('utf8', errors='replace')
            outs.append(code_err(code, stderr))
            continue
        stdout = process.stdout.decode('utf8', errors='replace')
        try:
            # NOTE(security): eval() runs whatever the generated program
            # printed. That program already ran unsandboxed above, so this
            # adds no new exposure, but the whole step should only be used in
            # a disposable environment.
            answer = eval(stdout)
            if is_integer(answer) and is_between_0_and_999(answer):
                outs.append(int(answer))
            else:
                outs.append(number_range_type_err(problem,response,answer))
        except Exception:
            # Exception (not a bare except) so Ctrl-C still interrupts the run.
            outs.append(eval_err(code))
    return outs

def repeat_elements(lst, k):
    """Return a list in which every element of ``lst`` appears ``k`` times in a row."""
    repeated = []
    for item in lst:
        repeated.extend([item] * k)
    return repeated

def group_elements(lst, k):
    """Split ``lst`` into consecutive slices of length ``k`` (the last may be shorter)."""
    chunks = []
    for start in range(0, len(lst), k):
        chunks.append(lst[start:start + k])
    return chunks

def gen_prompt_pure(problem):
    """Build the prompt for the code-free, pure-reasoning attempt.

    Args:
        problem: Problem statement (must be a ``str``; it is concatenated).

    Returns:
        str: The prompt asking for step-by-step reasoning with a boxed
        integer answer between 0 and 999.
    """
    header = "\n### Problem:\n"
    footer = (
        "\n\n### Response: Let's think step by step and do not use Python code. The final answer should be a single integer in the last line of your response. The integer should be between 0 and 999.\n"
        "the answer should be enclosed within \\boxed{}.\n"
    )
    return header + problem + footer
import re
def extract_number(text):
    """Pull the answer string that follows "The answer is" out of ``text``.

    Three forms are tried in order: a boxed value, a number wrapped in dollar
    signs, and a bare number.

    Args:
        text: Text to search.

    Returns:
        str: The captured group of the first form that matches, or the
        sentinel ``'parse err'`` when none does.
    """
    for pattern in (
        r'The answer is.*\\boxed\{(.*?)\}',
        r"The answer is[:\s]*\$([0-9]+)\$",
        r"The answer is[:\s]*([0-9]+)",
    ):
        found = re.search(pattern, text)
        if found is not None:
            return found.group(1)
    return 'parse err'

def parse_strict(text):
    """Extract a valid final answer from the last line of a model response.

    Args:
        text: Full response text; only the part after the last newline is
            inspected.

    Returns:
        int or str: The answer as an ``int`` when the last line yields an
        integer in [0, 999]; otherwise the sentinel ``'parse err'``.
    """
    try:
        last_line = text.split('\n')[-1]
        extracted = extract_number(last_line)
        if extracted == 'parse err':
            return 'parse err'
        # NOTE(security): eval() executes model-generated text. Only run this
        # in a disposable environment, or replace it with a literal-only parser.
        answer = eval(extracted)
        if is_integer(answer) and is_between_0_and_999(answer):
            return int(answer)
        return 'parse err'
    except Exception:
        # Any parsing/evaluation failure counts as "no answer". Exception
        # (rather than a bare except) lets KeyboardInterrupt through so a long
        # run can still be stopped.
        return 'parse err'

def group_and_sum(A, B):
    """Accumulate the values of ``B`` per distinct key of ``A``.

    Args:
        A: Iterable of hashable keys.
        B: Iterable of values, paired positionally with ``A``.

    Returns:
        list: ``(key, total)`` pairs in order of each key's first appearance.
    """
    totals = {}
    for key, value in zip(A, B):
        if key not in totals:
            totals[key] = value
        else:
            totals[key] += value
    return [(key, total) for key, total in totals.items()]

from math import exp
def get_max(input,normalize):
    """Choose one answer from several reasoning samples by weighted voting.

    Every sample whose text parses to a valid answer votes for it with weight
    ``exp(logprob)`` (length-normalized when ``normalize`` is truthy); the
    exponential keeps weights positive so repeated answers add up.

    Args:
        input: List of generation results; ``outputs[0]`` of each is read.
        normalize: Whether to divide the log-probability by the token count.

    Returns:
        int or str: The answer with the largest total weight, or
        ``'parse err'`` when no sample produced a valid answer.
    """
    answers, weights = [], []
    for request in input:
        sample = request.outputs[0]
        parsed = parse_strict(sample.text)
        if parsed == 'parse err' or len(sample.token_ids) <= 0:
            continue
        logprob = sample.cumulative_logprob
        if normalize:
            logprob = logprob / len(sample.token_ids)
        answers.append(parsed)
        weights.append(exp(logprob))

    if not answers:
        return 'parse err'
    best_answer, _ = max(group_and_sum(answers, weights), key=lambda pair: pair[1])
    return best_answer
    
def process_pure(inputs,repeats,normalize):
    """Reduce the flat list of reasoning samples to one answer per problem.

    Args:
        inputs: Flat list of generation results, ``repeats`` per problem.
        repeats: Number of samples generated per problem.
        normalize: Forwarded to ``get_max``.

    Returns:
        list: The ``get_max`` result for each consecutive group of samples.
    """
    return [get_max(batch, normalize) for batch in group_elements(inputs, repeats)]

In [5]:
### code ###
# prepare inputs
problems = data.problem.tolist()
if LOCAL: 
    solutions = data.final_answer.map(lambda x:int(x[0])).tolist()
inputs = process_inputs(problems,gen_prompt_codeIn1)
# Fixed sampling budget per round: as problems get resolved, the remaining
# ones each receive capacity // (number still unresolved) samples.
capacity = len(inputs) * repeats

# generation
final_answers = []
if LOCAL: monitors = []
for iteration in range(iterations):
    if not inputs:
        # Every problem already has a valid answer; another round would
        # divide by zero and generate nothing.
        break
    repeats = capacity//len(inputs)
    inputs = repeat_elements(inputs,repeats)
    raw_outputs = llm.generate(inputs, sampling_params)
    outs = process_code(raw_outputs,problems,repeats,normalize)
    final_answers.append(outs)
    if LOCAL:
        monitors.extend([(iteration, p, j.text, s, a) \
                            for a,o,p,s in zip(repeat_elements(outs,repeats),raw_outputs,\
                                               repeat_elements(problems,repeats),repeat_elements(solutions,repeats)) 
                                for j in o.outputs])
    # Carry forward only the problems whose result is still a follow-up prompt
    # (a str). Plain lists are built instead of unpacking zip(*...), which
    # raises ValueError when nothing is left.
    pending = [idx for idx,o in enumerate(outs) if isinstance(o,str)] # invalid answers
    inputs = [outs[idx] for idx in pending]
    if LOCAL:
        solutions = [solutions[idx] for idx in pending]
    problems = [problems[idx] for idx in pending]
    
### code END ###

### pure reasoning ###
# Last resort for problems the code rounds could not resolve; skipped when
# none remain.
if problems:
    # prepare inputs
    inputs = process_inputs(problems,gen_prompt_pure)
    repeats = capacity//len(inputs)
    inputs = repeat_elements(inputs,repeats)

    # generation
    raw_outputs = llm.generate(inputs, sampling_params)
    outs = process_pure(raw_outputs,repeats,normalize)
    # Stage index = number of code rounds that ran; computed from
    # final_answers rather than from the leaked loop variable.
    pure_stage = len(final_answers)
    final_answers.append(outs)
    if LOCAL:
        monitors.extend([(pure_stage, p, j.text, s, a) \
                        for a,o,p,s in zip(repeat_elements(outs,repeats),raw_outputs,\
                                            repeat_elements(problems,repeats),repeat_elements(solutions,repeats)) 
                            for j in o.outputs])
### pure reasoning END###

# post-process
def post_process(final_answers,end_with):
    """Merge the per-round results into one answer per original problem.

    Each round only covers the problems the previous round left unresolved
    (marked by ``str`` entries), so rounds are folded from last to first:
    every ``str`` in an earlier round is replaced, in order, by the next entry
    of the already-merged later round.

    Args:
        final_answers: List of per-round result lists.
        end_with: Number of leading rounds to take into account.

    Returns:
        list: Merged answers; entries still unresolved after the last
        considered round are replaced by the fallback value 37.
    """
    rounds = final_answers[:end_with]
    merged = rounds[-1]
    for earlier in reversed(rounds[:-1]):
        fill = iter(merged)
        patched = []
        for item in earlier:
            if isinstance(item, str):
                patched.append(next(fill))
            else:
                patched.append(item)
        merged = patched
    return [37 if isinstance(value, str) else value for value in merged]

Processed prompts: 100%|██████████| 5850/5850 [46:16<00:00,  2.11it/s]  
Token indices sequence length is longer than the specified maximum sequence length for this model (70249 > 4096). Running this sequence through the model will result in indexing errors
Processed prompts:   9%|▊         | 482/5520 [04:09<1:05:36,  1.28it/s]



Processed prompts:  20%|█▉        | 1088/5520 [09:15<56:22,  1.31it/s]  



Processed prompts:  26%|██▌       | 1410/5520 [12:04<21:50,  3.14it/s]  



Processed prompts:  44%|████▎     | 2411/5520 [20:44<12:54,  4.01it/s]  



Processed prompts:  64%|██████▍   | 3557/5520 [30:17<13:01,  2.51it/s]  



Processed prompts: 100%|██████████| 5520/5520 [45:36<00:00,  2.02it/s]
Processed prompts: 100%|██████████| 5577/5577 [47:12<00:00,  1.97it/s]  
Processed prompts: 100%|██████████| 5652/5652 [49:50<00:00,  1.89it/s]  


In [6]:
# Final cell: score the merged answers and persist the results.
if LOCAL:
    # data.final_answer is like [['14'],['65'],...]
    # Report accuracy after merging only the first `iteration` rounds, which
    # shows how much each additional round contributes.
    for iteration in range(1,len(final_answers)+1):
        temp = post_process(final_answers,iteration)
        print(f"correct # for iteration {iteration}: {sum([yhat==int(y[0])for yhat, y in zip(temp,data.final_answer.tolist())])}")
    # create_next_model_folder comes from the star-imported helper module.
    out_path = create_next_model_folder('../llmOutputs')
    print(out_path) # ../llmOutputs/model1
    # Raw per-round results (ints and follow-up prompt strings).
    with open(out_path + '/final_answers.json', 'w') as f:
        json.dump(final_answers, f)
    # One row per generated sample collected in `monitors`.
    outs_df = pd.DataFrame(monitors,columns=['iteration','problem','output','y','yhat'])
    outs_df.to_csv(out_path+'/generations.csv', header=True, index=False)
else:
    # Kaggle run: merge all rounds and write the submission file.
    temp = post_process(final_answers,len(final_answers))
    if not PRIVATE:
        # Reference answers are read from the loaded frame before the
        # 'answer' column is overwritten below.
        answers = data.answer.tolist()
        correct = sum([y==yhat for y,yhat in zip(answers,temp)])
        print(f'{correct} correct answers')    
    data['answer'] = temp
    data[['id','answer']].to_csv("submission.csv", header=True, index=False)

correct # for iteration 1: 135
correct # for iteration 2: 142
correct # for iteration 3: 146
correct # for iteration 4: 181
../llmOutputs/model9
