In [1]:
try:
    from vllm import LLM, SamplingParams
    LOCAL = True
    MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"
    from functions import *
    dtype = 'auto'
    gpu_memory_utilization = 0.95

except:
    %pip uninstall -y torch -q
    %pip install --no-index --find-links=/kaggle/input/vllm-whl -U vllm -q
    from vllm import LLM, SamplingParams
    LOCAL = False
    MODEL_PATH = "/kaggle/input/deepseek-math"
    dtype = 'half'
    gpu_memory_utilization = 0.99
    from functions_math import *


import torch
import pandas as pd
import subprocess
import sys
import gc
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
iterations = 3
repeats = 1
timeout = 7
temperature = 0.7
max_tokens = 1600
llm = LLM(model=MODEL_PATH,
          dtype=dtype,
          enforce_eager=True,
          gpu_memory_utilization=gpu_memory_utilization,
          swap_space=8,
          max_model_len=2048,
          kv_cache_dtype="fp8_e5m2",
          tensor_parallel_size=1)
tokenizer = llm.get_tokenizer()
stop_words = [tokenizer.eos_token]
sampling_params = SamplingParams(n = 1, best_of= 1,
                                 temperature=temperature,
                                 max_tokens=max_tokens,
                                 stop=stop_words)

INFO 04-21 13:23:18 utils.py:253] CUDA_HOME is not found in the environment. Using /usr/local/cuda as CUDA_HOME.
INFO 04-21 13:23:18 config.py:381] Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO 04-21 13:23:18 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='deepseek-ai/deepseek-math-7b-rl', tokenizer='deepseek-ai/deepseek-math-7b-rl', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-21 13:23:18 selector.py:16] Using FlashAttention backend.
INFO 04-21 13:23:19 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 04-21 13:23:21 model_runner.py:104] Loading model weights took 12.8725 GB
INFO 04-21 13:23:21 gpu_executor.py:94] # GPU blocks: 2304, # CPU blocks: 2184


In [3]:
if LOCAL:
    import json
    with open('../Data/AMC/aime_normal.json', 'r') as file:
        data = json.load(file)
    # to have consistent format as in Kaggle
    data = pd.DataFrame(data)
    data.rename(columns={'question': 'problem'}, inplace=True)
else:
    data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
    if len(data) < 5:
        data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
        PRIVATE = False
    else:
        PRIVATE = True

In [4]:
def is_integer(num):
    if isinstance(num, float):
        return num.is_integer()
    elif isinstance(num, int):
        return True
    else:
        return False
    
def is_between_0_and_999(num):
    return 0 <= num <= 999

def gen_prompt_codeIn1(problem):
    return f"""
### Problem:\n{problem}\n
### Response: Let's first reason step by step. Then write Python code to solve the problem, using brute force enumeration if necessary. 
Limit the search space using logical deductions. The code should be enclosed between ```python\n actual code...``` and should only print the final answer.
The final answer is an integer between 0 and 999.
"""

def parse_err(input):
    return f""" \n{input}\n
    The code should be enclosed between ```python\n actual code...``` Please fix.
"""

def time_err(code):
    return f""" \n{code}\n
    code timed out. Try to use math reasoning to limit the search space and find ways to exit loop early where appropriate.
"""

def code_err(code, err):
    return f""" \n{code}\n
    The code encounters this error: {err}\n.
    Please fix.
"""

def eval_err(code):
    return f""" \n{code}\n
    code should only print the final number.
"""

def number_range_type_err(problem,input,answer):
    return f""" \nThis is the original problem: {problem}\n 
    \nThis is the first solution: {input}\n
    However, answer should be a integer between 0 and 999 but got {str(answer)}. Please revisit the logics and
    generate python code to answer the question.
    The code should be enclosed between ```python\n actual code...``` and should only print the final answer.
"""    
def process_inputs(inputs):
    # inputs is a list of str
    outs = []
    for problem in inputs:
        query_prompt = gen_prompt_codeIn1(problem)
        messages = [{"role": "user","content": query_prompt}]
        input = tokenizer.apply_chat_template(messages, tokenize=False)
        outs.append(input)
    return outs

from functools import partial
def get_value(output,normalize):
    # cumulative_logprob are negative. factor of 10 penalize it.
    if len(output.token_ids) == 0: return float('-inf')
    text = output.text
    if "```python" not in text and text.count("print(") != 1:
        factor = 10
    else:
        factor = 1
    if normalize:
        return factor*output.cumulative_logprob/len(output.token_ids)
    else:
        return factor*output.cumulative_logprob

def process_code(inputs,problems,normalize=True):
    # inputs,problem is a list of str
    outs = []
    for input,problem in zip(inputs,problems):
        # input is vllm.outputs.RequestOutput
        input = max(input.outputs,key=partial(get_value,normalize=normalize)).text
        # parse err
        try:
            code = input.split('```')[1][7:]
        except: 
            outs.append(parse_err(input))
            continue
        # execute code
        with open('code.py', 'w') as fout:
            fout.write(code)
        # timeout err
        try:
            process = subprocess.run([sys.executable, 'code.py'], stdout=subprocess.PIPE, stderr=subprocess.PIPE, timeout=timeout)
        except subprocess.TimeoutExpired:
            outs.append(time_err(code))
            continue
        if process.stderr:# code.py err
            stderr = process.stderr.decode('utf8')
            outs.append(code_err(code, stderr))
            continue
        else:
            stdout = process.stdout.decode('utf8')
            try:
                answer = eval(stdout)
                if is_integer(answer) and is_between_0_and_999(answer):
                    outs.append(int(answer))
                    continue
                else:
                    outs.append(number_range_type_err(problem,input,answer))
                    continue
            except:
                outs.append(eval_err(code))
                continue
    return outs

In [5]:
# prepare inputs
problems = data.problem.tolist()
inputs = process_inputs(problems)
# capacity = len(inputs) * sampling_params['best_of']

# generation
final_answers = []
if LOCAL: monitors = []
for _ in range(iterations):
    # sampling_params['n'] = sampling_params['best_of'] = capacity//len(inputs)
    raw_outputs = llm.generate(inputs, sampling_params)
    outs = process_code(raw_outputs,problems)
    inputs,problems = zip(*[(o,p) for o,p in zip(outs,problems) if isinstance(o,str)]) # invalid answers
    final_answers.append(outs)
    if LOCAL: monitors.append([[(j.text,j.cumulative_logprob,len(j.token_ids)) for j in o.outputs] for o in raw_outputs])

# post-process
temp = final_answers[-1]
for new in final_answers[::-1][1:]: # reverse order, starts with second to last
    temp = iter(temp)
    temp = [next(temp) if isinstance(x, str) else x for x in new] # replace str in new by temp
temp = [37 if isinstance(x, str) else x for x in temp]

Processed prompts: 100%|██████████| 975/975 [08:06<00:00,  2.00it/s]
Token indices sequence length is longer than the specified maximum sequence length for this model (48670 > 4096). Running this sequence through the model will result in indexing errors
Processed prompts:   0%|          | 0/660 [00:00<?, ?it/s]



Processed prompts:  12%|█▏        | 81/660 [00:35<05:23,  1.79it/s]



Processed prompts:  23%|██▎       | 153/660 [01:10<03:53,  2.17it/s]



Processed prompts:  32%|███▏      | 212/660 [01:31<01:36,  4.64it/s]



Processed prompts:  58%|█████▊    | 381/660 [02:54<02:17,  2.03it/s]



Processed prompts: 100%|██████████| 660/660 [05:26<00:00,  2.02it/s]
Processed prompts: 100%|██████████| 530/530 [04:34<00:00,  1.93it/s]


In [31]:
if LOCAL:
    # iteration 0
    # data.final_answer is like [['14'],['65'],...]
    print(f"correct # for iteration 0: {sum([yhat==int(y[0])for yhat, y in zip(final_answers[0],data.final_answer.tolist())])}")
    print(f"correct # for iteration {iterations}: {sum([yhat==int(y[0])for yhat, y in zip(temp,data.final_answer.tolist())])}")
    out_path = create_next_model_folder('../llmOutputs')
    print(out_path) # ../llmOutputs/model1
    with open(out_path + '/final_answers.json', 'w') as f:
        json.dump(final_answers, f)
    with open(out_path + '/monitors.json', 'w') as f:
        json.dump(monitors, f)        
else:
    if not PRIVATE:
        answers = data.answer.tolist()
        correct = sum([y==yhat for y,yhat in zip(answers,temp)])
        print(f'{correct} correct answers')    
    data['answer'] = temp
    data[['id','answer']].to_csv("submission.csv", header=True, index=False)

correct # for iteration 0: 95
correct # for iteration 3: 116
../llmOutputs/model8
