In [1]:
try:
    from vllm import LLM, SamplingParams
    LOCAL = True
    MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"
    from functions import *
    dtype = 'auto'
    gpu_memory_utilization = 0.9
except:
    %pip uninstall -y torch -q
    %pip install --no-index --find-links=/kaggle/input/vllm-whl -U vllm -q
    from vllm import LLM, SamplingParams
    LOCAL = False
    MODEL_PATH = "/kaggle/input/deepseek-math"
    dtype = 'half'
    gpu_memory_utilization = 0.99
    from functions_math import *
    import gc

import torch
import pandas as pd

In [2]:
# Build the vLLM engine. `dtype` and `gpu_memory_utilization` come from the
# setup cell ('auto' / 0.9 locally, 'half' / 0.99 on Kaggle); they were
# previously hard-coded here, so the Kaggle-specific values were never applied.
llm = LLM(model=MODEL_PATH,
          dtype=dtype,
          enforce_eager=True,
          gpu_memory_utilization=gpu_memory_utilization,
          swap_space=10,
          max_model_len=2048,
          # fp8 KV cache: smaller memory footprint, possibly a slight accuracy
          # drop (as reported by vLLM's own startup log).
          kv_cache_dtype="fp8_e5m2",
          tensor_parallel_size=1)
tokenizer = llm.get_tokenizer()

# Stop generation at the tokenizer's end-of-sequence token.
stop_words = [tokenizer.eos_token]

# One completion per prompt, temperature 0, at most 1500 generated tokens.
sampling_params = SamplingParams(n=1, best_of=1,
                                 temperature=0,
                                 max_tokens=1500,
                                 stop=stop_words)

INFO 04-17 14:55:55 utils.py:253] CUDA_HOME is not found in the environment. Using /usr/local/cuda as CUDA_HOME.
INFO 04-17 14:55:55 config.py:381] Using fp8_e5m2 data type to store kv cache. It reduces the GPU memory footprint and boosts the performance. But it may cause slight accuracy drop. Currently we only support fp8 without scaling factors and make e5m2 as a default format.
INFO 04-17 14:55:55 llm_engine.py:74] Initializing an LLM engine (v0.4.0.post1) with config: model='deepseek-ai/deepseek-math-7b-rl', tokenizer='deepseek-ai/deepseek-math-7b-rl', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=2048, download_dir=None, load_format=auto, tensor_parallel_size=1, disable_custom_all_reduce=True, quantization=None, enforce_eager=True, kv_cache_dtype=fp8_e5m2, device_config=cuda, seed=0)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


INFO 04-17 14:55:56 selector.py:51] Cannot use FlashAttention because the package is not found. Please install it for better performance.
INFO 04-17 14:55:56 selector.py:25] Using XFormers backend.
INFO 04-17 14:55:57 weight_utils.py:177] Using model weights format ['*.safetensors']
INFO 04-17 14:55:58 model_runner.py:104] Loading model weights took 12.8725 GB
INFO 04-17 14:55:59 gpu_executor.py:94] # GPU blocks: 1989, # CPU blocks: 2730


In [3]:
# Load the problem set into `data`, a DataFrame with a 'problem' column.
if not LOCAL:
    # Kaggle: start from test.csv. A test file with fewer than 5 rows is
    # treated as the non-private case and replaced by train.csv.
    data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
    PRIVATE = not (len(data) < 5)
    if not PRIVATE:
        data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
else:
    import json
    with open('../Data/AMC/aime_normal.json', 'r') as fh:
        records = json.load(fh)
    # Rename 'question' to 'problem' so the local frame uses the same
    # column name as the Kaggle CSVs.
    data = pd.DataFrame(records).rename(columns={'question': 'problem'})

In [4]:
# Answer recorded on Kaggle when the model output cannot be parsed.
FALLBACK_ANSWER = 37

# prepare inputs: one chat-formatted prompt string per problem
inputs = []
for problem in data['problem']:
    query_prompt = gen_prompt(problem)
    messages = [{"role": "user", "content": query_prompt}]
    # Named `prompt` rather than `input` so the builtin input() is not
    # shadowed for the rest of the kernel session.
    # NOTE(review): add_generation_prompt is not passed here; the Hugging Face
    # chat-template docs recommend add_generation_prompt=True when generating.
    # Left unchanged because it would alter model outputs — confirm.
    prompt = tokenizer.apply_chat_template(messages, tokenize=False)
    inputs.append(prompt)

# generation
raw_outputs = llm.generate(inputs, sampling_params)
assert len(raw_outputs) == len(data), "expected one generation per problem"

# parse outputs
# LOCAL: outs holds (problem, raw text, parsed answer, reference answer) tuples.
# Kaggle: outs holds one answer per row, with FALLBACK_ANSWER for parse failures.
outs = []
for (index, row), generation in zip(data.iterrows(), raw_outputs):
    problem = row['problem']
    decoded_output = generation.outputs[0].text
    try:
        # Strip trailing whitespace first: a generation ending in a newline
        # would otherwise yield an empty last line and a spurious parse error.
        answer = decoded_output.rstrip().split('\n')[-1]
        answer = naive_parse(answer) % 1000
    except Exception:
        # Best-effort parsing: any failure is recorded, not raised. Unlike a
        # bare `except:`, this lets KeyboardInterrupt/SystemExit propagate.
        answer = 'parsing error'

    if LOCAL:
        outs.append((problem, decoded_output, answer, int(row['final_answer'][0])))
    else:
        outs.append(FALLBACK_ANSWER if answer == 'parsing error' else answer)
        if not PRIVATE:
            print(decoded_output)
            print(f'\nanswer is {answer}')

In [14]:
# Score the predictions and persist them.
if LOCAL:
    outs_df = pd.DataFrame(outs, columns=['problem', 'output', 'yhat', 'y'])
    print(f"correct: {sum(outs_df.yhat == outs_df.y)}")
    print(f"parse error: {sum(outs_df.yhat == 'parsing error')}")
    out_path = create_next_model_folder('../llmOutputs')
    print(out_path)  # e.g. ../llmOutputs/model4
    outs_df.to_csv(out_path + '/generations.csv', header=True, index=False)
else:
    if not PRIVATE:
        answers = data.answer.tolist()
        correct = sum(y == yhat for y, yhat in zip(answers, outs))
        print(f'{correct} correct answers')
    # Build the submission in a separate frame instead of overwriting
    # data['answer']: mutating `data` replaced the reference answers with the
    # predictions, so re-running this cell reported every answer as correct.
    submission = data[['id']].assign(answer=outs)
    submission.to_csv("submission.csv", header=True, index=False)

correct: 51
parse error: 116
../llmOutputs/model4
