In [1]:
try:
    import peft
    LOCAL = True
    MODEL_PATH = "deepseek-ai/deepseek-math-7b-rl"
    from functions import create_next_model_folder, NoRepeatTokenLogitsProcessor,naive_parse,sentence_transformers
except:
    LOCAL = False
    MODEL_PATH = "/kaggle/input/deepseek-math"
    from functions_math import create_next_model_folder, NoRepeatTokenLogitsProcessor,naive_parse,sentence_transformers
    

import torch
import gc
if not LOCAL:torch.backends.cuda.enable_mem_efficient_sdp(False)
import pandas as pd

In [2]:
if LOCAL:
    import json
    with open('../Data/AMC/aime_normal.json', 'r') as file:
        data = json.load(file)
    # to have consistent format as in Kaggle
    data = pd.DataFrame(data)
    data.rename(columns={'question': 'problem'}, inplace=True)
else:
    data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/test.csv')
    if len(data) < 5:
        data = pd.read_csv('/kaggle/input/ai-mathematical-olympiad-prize/train.csv')
        PRIVATE = False
    else:
        PRIVATE = True

#### RAG

In [3]:
import json
import numpy as np
from transformers import AutoModel, AutoTokenizer,AutoModelForCausalLM
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
if LOCAL:
    model_paths = model_names = ["mixedbread-ai/mxbai-embed-large-v1"]
    data_paths = ['context_combine3.json']
    FILE_PATH = '../Data/Embeddings'
else:
    # local path to Kaggle model folder
    model_paths = ["/kaggle/input/westseverus-7b-dpo"]
    # model name as in from HF source, needs to be of the same order as model_paths
    model_names = ["mixedbread-ai/mxbai-embed-large-v1"]
    data_paths = ['context_combine4.json']
    FILE_PATH = '/kaggle/input/embedding-db'
get_name = lambda x: x.split('/')[-1]

In [5]:
# List of str (query) -> List of list of str (context)

# get embedding for query
query = data.problem.tolist()
contexts = []
query_list = []
for model_name in model_paths:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    sentence_model = sentence_transformers(model,tokenizer)
    query_list.append(sentence_model.encode(query,True))

# get contexts for each query
for data_path in data_paths:
    with open(FILE_PATH + '/' + get_name(data_path) + 'context.json', 'r') as f:
        output_context = json.load(f)
    for query,model_name in zip(query_list,model_names):
        embeddings = np.load(FILE_PATH + '/' + model_name.split('/')[-1] + "--" + data_path +'.npy')
        index = cosine_similarity(embeddings,query).argmax(0)
        context = [output_context[i] for i in index]
        contexts.append(context)

# contexts [[question1_model1, question1_model2,...],[question2_model1,...],...]
contexts = list(zip(*contexts))

del model,tokenizer
torch.cuda.empty_cache()
gc.collect()

0

#### Generation

In [6]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map = "auto",
    torch_dtype="auto",
    trust_remote_code = True,
    use_flash_attention_2=LOCAL,
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
def gen_prompt(problem):
    
    return f"""
### Instruction:\n{problem}\n\n
### Response: Let's think step by step. The final response should be a integer between 0 and 999 in the last line of your response.
"""
from collections import Counter
def aggregate(answers):
    pred = Counter(answers).most_common(2)
    if len(pred) == 1:
        if pred[0][0] == "parsing error":
            return 37
        else:
            return pred[0][0]
    else:
        return pred[1][0] if pred[0][0] == "parsing error" else pred[0][0]

In [8]:
outs = []
no_repeat_processor = [NoRepeatTokenLogitsProcessor()]
for context, (index, row) in zip(contexts,data.iterrows()):
    answers = []
    problem = row['problem']
    query_prompt = gen_prompt(problem)
    if LOCAL: monitor = [problem,int(row['final_answer'][0])] # LOCAL only
    for c in context:
        # problem, context -> answer #
        messages = [
            {
                "role": "user",
                "content": c + "\n" + query_prompt
            }
        ]

        inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
        with torch.no_grad():
            encoded_output = model.generate(inputs, max_new_tokens=1500, do_sample=False, pad_token_id=tokenizer.eos_token_id,\
                                            logits_processor=no_repeat_processor)

        decoded_output = tokenizer.decode(encoded_output[0], skip_special_tokens=True).replace(c + "\n" + query_prompt, '')
        try:
            answer = decoded_output.split('\n')[-1]
            answer = naive_parse(answer) % 1000
        except:
            answer = 'parsing error'
        # End problem, context -> answer #
        answers.append(answer)
        if LOCAL:
            monitor.append(c)
            monitor.append(decoded_output)
            monitor.append(answer)
        elif not PRIVATE:
            print(f"\nContext: {c}\n")
            print(decoded_output)

    final_answer = aggregate(answers)
    if LOCAL:
        monitor.append(final_answer)
        outs.append(monitor)
    else:
        outs.append(final_answer)
        torch.cuda.empty_cache()
        gc.collect()

Token indices sequence length is longer than the specified maximum sequence length for this model (5214 > 4096). Running this sequence through the model will result in indexing errors
This is a friendly reminder - the current text generation call will exceed the model's predefined maximum length (4096). Depending on the model, you may observe exceptions, performance degradation, or nothing at all.


In [17]:
if LOCAL:
    outs_df = pd.DataFrame(outs,columns=['problem','y'] + ['context','output','yhat_i']*len(context) + ['yhat'])
    print(sum(outs_df.yhat == outs_df.y))
    out_path = create_next_model_folder('../llmOutputs')
    print(out_path)
    outs_df.to_csv(out_path+'/outputs.csv', header=True, index=False)
    # 49
    # ../llmOutputs/model2
else:
    if not PRIVATE:
        answers = data.answer.tolist()
        correct = sum([y==yhat for y,yhat in zip(answers,outs)])
        print(f'{correct} correct answers')    
    data['answer'] = outs
    data[['id','answer']].to_csv("submission.csv", header=True, index=False)