# Select grids with vLLM

## Goal

Can I use vLLM to select the correct grid answer, by comparing the likelihood the model assigns to each candidate?

## Configuration

In [None]:
class cfg:
    """Notebook-wide settings: model checkpoint, context length and data files."""

    # Checkpoint directory given to both vLLM and the Hugging Face tokenizer.
    model_path: str = "/home/gbarbadillo/data/Qwen2-0.5B-arc"
    # Maximum context length handed to vLLM (author's note: 61000 for phi-3).
    max_model_len: int = 8192
    # JSON file with the candidate solutions, keyed by task id.
    solutions_filepath: str = "/mnt/hdd0/MEGA/AI/22_Kaggle/arc24/scripts/submission_x512.json"
    # Challenges file that load_arc_data_with_solutions reads the tasks from.
    dataset_filepath: str = "/mnt/hdd0/Kaggle/arc24/data/arc-agi_evaluation_challenges.json"

## Imports

In [None]:
import sys
import os
import json
import time
import textwrap
from vllm import LLM, SamplingParams
from transformers import AutoTokenizer
import matplotlib.pyplot as plt
import matplotlib as mpl

sys.path.append(os.path.realpath('../scripts/'))

from evaluation import (
    load_arc_data_with_solutions,
    evaluate
)
from inference import (
    clear_vllm_gpu_memory,
    SimplePromptCreator,
    GridCodeBlockEncoder,
    MinimalGridEncoder
)
from arc24.prompting import (
    pretty_print_prompt,
    system_prompt,
    prompt_template,
    answer_template,
    remove_assistant_ending
)

# Create an empty plot and close every figure before touching rcParams.
# NOTE(review): presumably this forces the notebook's matplotlib backend to
# initialise so the settings below are not overwritten afterwards — confirm.
plt.plot()
plt.close('all')
# Global plot defaults: wide, short figures with thicker lines and a larger font.
plt.rcParams["figure.figsize"] = (25, 4)
mpl.rcParams['lines.linewidth'] = 3
mpl.rcParams['font.size'] = 16

## Load model

In [None]:
# Load the checkpoint at cfg.model_path into a vLLM engine.
llm = LLM(
    model=cfg.model_path,
    trust_remote_code=True,
    dtype='half',
    # to use 2 gpus
    tensor_parallel_size=2,
    max_model_len=cfg.max_model_len,
    # kv_cache_dtype='fp8_e5m2' is intentionally not passed:
    # I have disabled kv cache quantization because it is hurtful
    # without enforce_eager 13.9GB of memory is used on each GPU, with it 13.3GB
    enforce_eager=True,
    disable_log_stats=True,
)
# The tokenizer is loaded from the same checkpoint; it is used later to apply
# the chat template and to count prompt tokens.
tokenizer = AutoTokenizer.from_pretrained(cfg.model_path)

## Prepare prompts

In [None]:
# Candidate solutions, keyed by task id. The file is read explicitly as UTF-8
# so that parsing does not depend on the platform's locale encoding.
with open(cfg.solutions_filepath, 'r', encoding='utf-8') as f:
    solutions = json.load(f)
# Tasks from the dataset file, loaded through the project helper; they are
# indexed below by task id and carry the test outputs.
ground_truth = load_arc_data_with_solutions(cfg.dataset_filepath)

In [None]:
# Encoder used to turn grids into prompt text: a MinimalGridEncoder wrapped in
# a GridCodeBlockEncoder.
# NOTE(review): presumably the wrapper places the minimal text encoding inside
# a code block — confirm against the project's inference module.
grid_encoder = GridCodeBlockEncoder(MinimalGridEncoder())

In [None]:
def create_prompt(task, test_idx, grid):
    """Build the chat prompt for one test input with `grid` as the assistant's answer.

    Args:
        task: task dict with a 'train' list of samples (each a dict of grids)
            and a 'test' list whose entries have an 'input' grid.
        test_idx: index into task['test'] selecting the test input to show.
        grid: candidate output grid, rendered as the assistant reply.

    Returns:
        The prompt string produced by the tokenizer's chat template, after
        remove_assistant_ending has been applied for cfg.model_path.
    """
    encode = grid_encoder.to_text

    # Encode every grid of every training sample, keeping the sample's keys.
    train_samples = []
    for sample in task['train']:
        encoded_sample = {name: encode(sample_grid) for name, sample_grid in sample.items()}
        train_samples.append(encoded_sample)

    test_input_text = encode(task['test'][test_idx]['input'])
    user_message = prompt_template.render(train_samples=train_samples,
                                          test_input=test_input_text)
    assistant_message = answer_template.render(test_output=encode(grid))

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_message},
        {"role": "assistant", "content": assistant_message},
    ]
    # TODO: add start of assistant reply
    chat_text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
    return remove_assistant_ending(chat_text, cfg.model_path)

In [None]:
# Smoke test: build the prompt for the first task in the solutions file, using
# the ground-truth output of its first test sample as the candidate grid.
task_id = list(solutions)[0]
test_idx = 0
prompt = create_prompt(
    ground_truth[task_id],
    test_idx=test_idx,
    grid=ground_truth[task_id]['test'][test_idx]['output'],
)
# Prompt length in tokens, followed by a colourised view of the prompt.
print(len(tokenizer.tokenize(prompt)))
pretty_print_prompt(prompt, default_color='white')

## Compute likelihood of each prompt

In [None]:
# Baseline request: one completion at temperature 0, a single new token,
# no log-probabilities requested.
sampling_params = SamplingParams(
    n=1,
    temperature=0.0,
    max_tokens=1,
)
ret = llm.generate(prompt, sampling_params, use_tqdm=False)

In [None]:
# Display the first entry of the result returned by llm.generate for the prompt.
ret[0]

In [None]:
# One completion at temperature 0 with a single new token, this time with
# logprobs=1 to request log-probabilities for the generated token.
sampling_params = SamplingParams(
    n=1,
    temperature=0.0,
    max_tokens=1,
    logprobs=1,
)
ret = llm.generate(prompt, sampling_params, use_tqdm=False)

In [None]:
# Display the first entry of the result returned by llm.generate for the prompt.
ret[0]

In [None]:
# One completion at temperature 0 with a single new token, requesting
# log-probabilities for the generated token (logprobs=1) and for the prompt
# tokens (prompt_logprobs=1).
sampling_params = SamplingParams(
    n=1,
    temperature=0.0,
    max_tokens=1,
    logprobs=1,
    prompt_logprobs=1,
)
ret = llm.generate(prompt, sampling_params, use_tqdm=False)

In [None]:
# A bare `raise` with no active exception fails with a RuntimeError, so running
# the notebook top to bottom halts at this cell.
# NOTE(review): presumably a deliberate stop marker so the cells below are only
# run by hand — confirm.
raise

In [None]:
# One completion at temperature 0 with a single new token, with
# prompt_logprobs=0 and no logprobs for the generated token.
# NOTE(review): in vLLM a value of 0 is expected to return only the
# log-probability of each actual prompt token, without extra top-k
# alternatives — confirm against the installed version.
sampling_params = SamplingParams(
    n=1,
    temperature=0.0,
    max_tokens=1,
    prompt_logprobs=0,
)
ret = llm.generate(prompt, sampling_params, use_tqdm=False)

## Clean

In [None]:
# Tear down the vLLM engine: drop the engine's model executor first, then the
# LLM object itself, and finally call the project's GPU-memory helper.
# NOTE(review): presumably the helper frees the GPU memory once no references
# to the engine remain — confirm in the project's inference module.
del llm.llm_engine.model_executor
del llm
clear_vllm_gpu_memory()

## TODO

- [ ] Print all the elements in the response.