In [1]:
# Environment probe. The try body configures the "local" setup; if anything in
# it raises, the except body installs vllm from an offline wheel directory and
# configures the Kaggle setup instead.
try:
    from vllm import LLM, SamplingParams
    LOCAL = True
    MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"
    # Star import: brings in helpers that are not defined in this notebook.
    from functions import *
    dtype = 'auto'
    gpu_memory_utilization = 0.95

# NOTE(review): bare `except:` — *any* exception raised in the try body (not
# only a missing package) selects the fallback branch below. Confirm that is
# the intended trigger.
except:
    # %pip targets the running kernel's environment. --no-index together with
    # --find-links makes pip resolve only from the given directory.
    %pip uninstall -y torch -q
    %pip install --no-index --find-links=/kaggle/input/vllm-whl -U vllm -q
    from vllm import LLM, SamplingParams
    LOCAL = False
    MODEL_PATH = "/kaggle/input/deepseek-math"
    dtype = 'half'
    gpu_memory_utilization = 0.99
    # Star import from a differently named module than the local branch uses.
    from functions_math import *
    # NOTE(review): `gc` is imported only on this branch and is not used in the
    # visible cells.
    import gc

# torch is imported only after the branch above, which may have reinstalled it.
import torch
import pandas as pd
import subprocess
import sys
import os
# Set before any tokenizer is created further down the notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [2]:
# --- Decoding budget ---------------------------------------------------------
iterations = 3      # number of generate -> run -> retry rounds in the generation loop
repeats = 7         # initial value of both `n` and `best_of` below
temperature = 0.9
max_tokens = 1500

# --- Engine ------------------------------------------------------------------
# Keyword arguments are collected first so the engine settings can be read
# (and inspected) separately from the constructor call.
llm_kwargs = dict(
    model=MODEL_PATH,
    dtype=dtype,
    enforce_eager=True,
    gpu_memory_utilization=gpu_memory_utilization,
    swap_space=40,
    max_model_len=2048,
    # Per the engine's own startup log: reduces the KV-cache memory footprint
    # but may cause a slight accuracy drop.
    kv_cache_dtype="fp8_e5m2",
    tensor_parallel_size=1,
)
llm = LLM(**llm_kwargs)
tokenizer = llm.get_tokenizer()

# --- Sampling ----------------------------------------------------------------
# Kept as a plain dict (not a SamplingParams instance) because the generation
# loop rewrites `n` / `best_of` before building SamplingParams from it.
stop_words = [tokenizer.eos_token]
sampling_params = dict(
    n=repeats,
    best_of=repeats,
    temperature=temperature,
    max_tokens=max_tokens,
    stop=stop_words,
)

INFO 04-20 19:14:04 utils.py:253] CUDA_HOME is not found in the environment. Using /usr/local/cuda as CUDA_HOME.
INFO 04-20 19:14:04 config.py:381] Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO 04-20 19:14:04 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='deepseek-ai/deepseek-math-7b-rl', tokenizer='deepseek-ai/deepseek-math-7b-rl', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-20 19:14:04 selector.py:16] Using FlashAttention backend.
INFO 04-20 19:14:05 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 04-20 19:14:11 model_runner.py:104] Loading model weights took 12.8725 GB
INFO 04-20 19:14:11 gpu_executor.py:94] # GPU blocks: 2323, # CPU blocks: 10922


In [3]:
# Load the problem set into `data`, a DataFrame with a `problem` column.
if not LOCAL:
    data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
    # Fewer than 5 rows in test.csv is treated as "not the private run"; in
    # that case the training file is loaded in its place.
    PRIVATE = len(data) >= 5
    if not PRIVATE:
        data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
else:
    import json
    with open('../Data/AMC/aime_normal.json', 'r') as fh:
        records = json.load(fh)
    # to have consistent format as in Kaggle
    data = pd.DataFrame(records).rename(columns={'question': 'problem'})

In [4]:
def is_integer(num):
    """Return True when ``num`` is an integer-valued ``int`` or ``float``.

    ``bool`` is rejected explicitly: it is a subclass of ``int``, so without
    this guard a program that prints ``True``/``False`` would be accepted as
    the numeric answer 1/0.

    Args:
        num: any object (typically the result of evaluating a program's stdout).

    Returns:
        bool: True for ``int`` values and for floats with no fractional part
        (``inf``/``nan`` yield False); False for everything else.
    """
    if isinstance(num, bool):
        return False
    if isinstance(num, int):
        return True
    if isinstance(num, float):
        return num.is_integer()
    return False
    
def is_between_0_and_999(num):
    """Tell whether ``num`` lies in the closed interval [0, 999]."""
    lower, upper = 0, 999
    if num < lower:
        return False
    return num <= upper

def gen_prompt_codeIn1(problem):
    """Build the first-attempt prompt for one problem.

    The text places ``problem`` under a "### Problem:" heading and, under
    "### Response:", asks for step-by-step reasoning followed by Python code in
    a fenced python block that prints only the final answer, stated to be an
    integer between 0 and 999.

    Args:
        problem: the problem statement, interpolated verbatim.

    Returns:
        str: the prompt text.
    """
    # The template is an ordinary (non-raw) f-string, so each backslash-n
    # escape becomes a newline in addition to the literal line breaks.
    # Whitespace inside the template is part of the prompt; do not reformat.
    return f"""
### Problem:\n{problem}\n
### Response: Let's first reason step by step. Then write Python code to solve the problem, using brute force enumeration if necessary. 
Limit the search space using logical deductions. The code should be enclosed between ```python\n actual code...``` and should only print the final answer.
The final answer is an integer between 0 and 999.
"""

def parse_err(input):
    """Build a retry prompt for a completion from which no code block could be extracted.

    Args:
        input: the completion text, echoed back at the top of the prompt.
            (The parameter name shadows the ``input`` builtin inside this
            function only.)

    Returns:
        str: the completion followed by a reminder of the required code fence.
    """
    # Non-raw f-string: the backslash-n escapes render as newlines. The leading
    # space and the indentation of the second line are part of the prompt.
    return f""" \n{input}\n
    The code should be enclosed between ```python\n actual code...``` Please fix.
"""

def code_err(code, err):
    """Build a retry prompt for code whose execution reported an error.

    Args:
        code: the source that was executed, echoed back at the top.
        err: the error text to show (the caller passes the decoded stderr).

    Returns:
        str: the code, the error text, and a request to fix it.
    """
    # Non-raw f-string: the backslash-n escapes render as newlines, so the
    # period after {err} lands on its own line in the resulting prompt.
    return f""" \n{code}\n
    The code encounters this error: {err}\n.
    Please fix.
"""

def eval_err(code):
    """Build a retry prompt for code whose printed output could not be evaluated.

    Args:
        code: the source that was executed, echoed back at the top.

    Returns:
        str: the code followed by a reminder to print only the final number.
    """
    # Non-raw f-string: the backslash-n escapes render as newlines.
    return f""" \n{code}\n
    code should only print the final number.
"""

def number_range_type_err(problem,input,answer):
    """Build a retry prompt for an answer that is not an integer in 0..999.

    Args:
        problem: the original problem statement.
        input: the previous completion, quoted as "the first solution".
            (The parameter name shadows the ``input`` builtin inside this
            function only.)
        answer: the rejected value; it is rendered with ``str()``.

    Returns:
        str: problem, previous solution, the rejected value, and a request for
        new fenced python code that prints only the final answer.
    """
    # Non-raw f-string: the backslash-n escapes render as newlines. Leading and
    # trailing spaces inside the template are part of the prompt text.
    return f""" \nThis is the original problem: {problem}\n 
    \nThis is the first solution: {input}\n
    However, answer should be a integer between 0 and 999 but got {str(answer)}. Please revisit the logics and
    generate python code to answer the question.
    The code should be enclosed between ```python\n actual code...``` and should only print the final answer.
"""    
def process_inputs(inputs):
    """Turn problem statements into chat-formatted prompt strings.

    Args:
        inputs: iterable of problem statements (str).

    Returns:
        list[str]: for each problem, the first-attempt prompt wrapped as a
        single user message and rendered by the tokenizer's chat template
        (as text, not token ids).
    """
    def _render(problem_text):
        chat = [{"role": "user", "content": gen_prompt_codeIn1(problem_text)}]
        return tokenizer.apply_chat_template(chat, tokenize=False)

    return [_render(problem_text) for problem_text in inputs]

from functools import partial
def get_value(output,normalize):
    """Score one candidate completion; a higher score means a better candidate.

    The base score is the candidate's cumulative log-probability, which is
    negative. A candidate is considered well-formed only when it contains a
    python code fence AND exactly one ``print(`` call; any other candidate has
    its (negative) score multiplied by 10 so it ranks below well-formed ones.

    The original condition combined the two checks with ``and``, which let a
    candidate with no fence but a single ``print(``, or with a fence but
    several prints, escape the penalty.

    Args:
        output: object exposing ``text``, ``cumulative_logprob`` and
            ``token_ids``.
        normalize: when truthy, divide the score by the number of generated
            tokens.

    Returns:
        float: the (possibly penalized, possibly length-normalized) score.
    """
    text = output.text
    well_formed = "```python" in text and text.count("print(") == 1
    factor = 1 if well_formed else 10
    score = factor * output.cumulative_logprob
    if normalize:
        # Guard against a completion with no tokens (would divide by zero).
        score = score / max(len(output.token_ids), 1)
    return score

def process_code(inputs,problems,normalize=True):
    """Select the best completion per request, run its code, and collect results.

    For every (request, problem) pair the highest-scoring candidate (per
    ``get_value``) is chosen, the text of its first fenced block is written to
    ``code.py`` and run in a subprocess under ``timeout 21``; the printed
    output is then evaluated.

    Args:
        inputs: iterable of request results exposing ``.outputs`` (a sequence
            of candidates); the original comment identifies them as
            ``vllm.outputs.RequestOutput``.
        problems: problem statements paired positionally with ``inputs``.
        normalize: forwarded to ``get_value``.

    Returns:
        list: one entry per pair, in order. An ``int`` when the program printed
        an integer in [0, 999]; otherwise a ``str`` retry prompt produced by
        ``parse_err``, ``code_err``, ``eval_err`` or ``number_range_type_err``.
    """
    outs = []
    for request, problem in zip(inputs, problems):
        completion = max(
            request.outputs,
            key=lambda cand: get_value(cand, normalize=normalize),
        ).text

        # parse code: take the first fenced block and drop its first 7
        # characters (the length of "python" plus the newline after it).
        try:
            code = completion.split('```')[1][7:]
        except IndexError:  # no fence in the completion
            outs.append(parse_err(completion))
            continue

        # execute code
        with open('code.py', 'w') as fout:
            fout.write(code)
        # Argument list instead of a shell string, so an interpreter path
        # containing spaces or shell metacharacters cannot break the command.
        process = subprocess.run(
            ['timeout', '21', sys.executable, 'code.py'],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        # errors='replace': undecodable bytes from the generated program must
        # not abort the whole batch.
        stderr = process.stderr.decode('utf8', errors='replace')
        if stderr:  # code.py err (any stderr output counts, as before)
            outs.append(code_err(code, stderr))
            continue

        stdout = process.stdout.decode('utf8', errors='replace')
        # GNU `timeout` exits with status 124 when it had to kill the command.
        # Report that explicitly when nothing was printed, rather than sending
        # the misleading "only print the final number" feedback.
        if process.returncode == 124 and not stdout.strip():
            outs.append(code_err(code, 'TimeoutError: execution exceeded 21 seconds'))
            continue

        try:
            # NOTE(security): eval() runs model-produced text inside this
            # process. Consider ast.literal_eval if only numeric literals are
            # expected.
            answer = eval(stdout)
        except (Exception, SystemExit):
            # SystemExit is included because the evaluated text can trigger it;
            # KeyboardInterrupt is deliberately allowed to propagate.
            outs.append(eval_err(code))
            continue

        if is_integer(answer) and is_between_0_and_999(answer):
            outs.append(int(answer))
        else:
            outs.append(number_range_type_err(problem, completion, answer))

    return outs

In [5]:
# prepare inputs
problems = data.problem.tolist()
inputs = process_inputs(problems)
# Total completions sampled per round. Held constant across rounds, so the
# per-prompt sample count grows as fewer problems remain unsolved.
# `sampling_params` is no longer mutated below, so re-running this cell
# yields the same capacity.
capacity = len(inputs) * sampling_params['best_of']

# Placed in `outs` for problems that never produce a valid answer, so `outs`
# is always a list of ints.
FALLBACK_ANSWER = 0

# generation
final_answers = []                      # raw per-round results from process_code
if LOCAL: monitors = []                 # per-round retry prompts
pending = list(range(len(problems)))    # indices into `problems` still lacking a valid answer
latest = [None] * len(problems)         # most recent result per problem (int or retry prompt)
for _ in range(iterations):
    if not inputs:
        # Everything is solved (or there was nothing to solve); also avoids
        # dividing by zero below.
        break
    per_prompt = capacity // len(inputs)
    round_params = dict(sampling_params, n=per_prompt, best_of=per_prompt)
    raw_outputs = llm.generate(inputs, SamplingParams(**round_params))
    # Pair each output with the problem it belongs to, not with the first
    # len(inputs) entries of the full problem list.
    round_outs = process_code(raw_outputs, [problems[i] for i in pending])
    for idx, result in zip(pending, round_outs):
        latest[idx] = result
    pending = [idx for idx, result in zip(pending, round_outs) if isinstance(result, str)]
    # NOTE(review): retry prompts go to the model exactly as process_code
    # returned them, without the chat template that process_inputs applies to
    # first-round prompts — confirm this is intended.
    inputs = [result for result in round_outs if isinstance(result, str)]  # invalid answers
    final_answers.append(round_outs)
    if LOCAL: monitors.append(inputs)

# One integer per problem, in the same order as `data`.
outs = [r if isinstance(r, int) else FALLBACK_ANSWER for r in latest]

Processed prompts:   5%|▌         | 51/975 [02:12<33:57,  2.21s/it] 

KeyboardInterrupt: 

In [None]:
# Report results: locally, dump generations to a new output folder; otherwise
# score against known answers (when available) and write the submission file.
if LOCAL:
    # NOTE(review): four column names are given, so every element of `outs`
    # must be a 4-field row. process_code in this notebook returns ints or
    # retry-prompt strings, which would not fit this schema — confirm.
    outs_df = pd.DataFrame(outs,columns=['problem','output','yhat','y'])
    print(f"correct: {sum(outs_df.yhat == outs_df.y)}")
    # NOTE(review): none of the prompt builders visible in this notebook emits
    # the literal 'parsing error'; presumably left over from another version.
    print(f"parse error: {sum(outs_df.yhat =='parsing error')}")
    # create_next_model_folder is not defined in the visible cells; presumably
    # provided by the star import of `functions` — verify.
    out_path = create_next_model_folder('../llmOutputs')
    print(out_path) # ../llmOutputs/model1
    outs_df.to_csv(out_path+'/generations.csv', header=True, index=False)
else:
    # PRIVATE is assigned by the data-loading cell on this (non-LOCAL) path only.
    if not PRIVATE:
        answers = data.answer.tolist()
        # NOTE(review): zip pairs positionally and stops at the shorter list, so
        # `outs` must hold exactly one entry per row of `data`, in row order.
        correct = sum([y==yhat for y,yhat in zip(answers,outs)])
        print(f'{correct} correct answers')    
    # Overwrites any existing `answer` column; pandas requires a list of the
    # same length as the frame here.
    data['answer'] = outs
    data[['id','answer']].to_csv("submission.csv", header=True, index=False)