In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoConfig

MODEL_PATH = "mistralai/Mixtral-8x7B-Instruct-v0.1"
# MODEL_PATH = "/kaggle/input/westseverus-7b-dpo"
quantization_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    llm_int8_enable_fp32_cpu_offload=True
)

# To prevent GPU memory overflow in Mixtral8x7b
config = AutoConfig.from_pretrained(MODEL_PATH)
config.gradient_checkpointing = True


tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    device_map = "auto",
    trust_remote_code = True,
    quantization_config=quantization_config,
    config=config,
    # load_in_8bit_fp32_cpu_offload=True
    # load_in_4bit=True,
    use_flash_attention_2=True,
)

The model was loaded with use_flash_attention_2=True, which is deprecated and may be removed in a future release. Please use `attn_implementation="flash_attention_2"` instead.


Loading checkpoint shards:   0%|          | 0/19 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

In [2]:
import json
with open('../Data/AMC/aime.json', 'r') as file:
    data = json.load(file)

In [3]:
def gen_prompt(problem):
    
    return f"""
Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n
### Instruction:\n{problem}\n\n
### Response: Let's think step by step. The final response should be a single number in the last line of your response.
"""

In [4]:
import gc
device = 'cuda'

In [5]:
def naive_parse(answer):
    out = []
    start = False
    end = False
    for l in reversed(list(answer)):
        if l in '0123456789' and not end:
            start = True
            out.append(l)
        else:
            if start:
                end = True
        
    out = reversed(out)
    return int(''.join(out))

In [6]:
answers_hat = []
answers = []
texts = []
for d in data:
    problem = d['problem']
    query_prompt = gen_prompt(problem)
    
    messages = [
        {
            "role": "user",
            "content": query_prompt
        }
    ]

    inputs = tokenizer.apply_chat_template(messages, return_tensors="pt").to("cuda")
    with torch.no_grad():
        encoded_output = model.generate(inputs, max_new_tokens=1500, do_sample=False, pad_token_id=tokenizer.eos_token_id)

    decoded_output = tokenizer.decode(encoded_output[0], skip_special_tokens=True).replace(query_prompt, '').replace("[INST]", "").replace("[/INST]", "").strip()
    answer = decoded_output.split('\n')[-1]
    answer = naive_parse(answer)
    answer = int(answer)
    answers.append(d['answers'])
    answers_hat.append(answer)
    texts.append(decoded_output)
    torch.cuda.empty_cache()
    gc.collect()

ValueError: Trying to set a tensor of shape torch.Size([4096, 4096]) in "weight" (which has shape torch.Size([8388608, 1])), this look incorrect.

In [None]:
sum = 0
for y,yhat in zip(answers,answers_hat):
    if y in [int(temp) for temp in yhat]:
        sum+=1
print(sum)