In [1]:
!pip install -qqq Levenshtein

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m161.7/161.7 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m49.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import pandas as pd
import time
from tqdm import tqdm
from Levenshtein import distance as levenshtein_distance
from os import environ
from typing import Optional
from openai import OpenAI
from peft import PeftModel
from kaggle_secrets import UserSecretsClient
import os
from functools import lru_cache
from pathlib import Path
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    AutoConfig,
    BitsAndBytesConfig,
)

user_secrets = UserSecretsClient()
os.environ["ACCELERATE_OFFLOAD_DIR"] = "/kaggle/working/offload"

2025-06-01 22:55:38.467395: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748818538.685524      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748818538.748262      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [3]:
class LLMClient:
    def __init__(
        self,
        model_name: str = "gpt-3.5-turbo",
        *,
        temperature: float | None = None,    # leave None ⇒ follow server default
        top_p: float | None = None,
        max_tokens: int = 30,
        deterministic: bool = False,         # ← NEW convenience flag
        api_key: Optional[str] = None,
        base_url: str | None = None,
        prompt_template: str = "{description}",
    ):
        # ---------- defaults that every call will inherit ----------
        if deterministic:
            self._defaults: Dict[str, Any] = dict(temperature=0.0, top_p=1.0)
        else:
            self._defaults = {}
            if temperature is not None:
                self._defaults["temperature"] = temperature
            if top_p is not None:
                self._defaults["top_p"] = top_p

        self._defaults["max_tokens"] = max_tokens
        self.model_name = model_name
        self.prompt_template = prompt_template

        # ---------- client initialisation ----------
        self.client = OpenAI(
            api_key=api_key,
            base_url=base_url or "https://api.openai.com/v1",
        )

    # -----------------------------------------------------------------
    def generate(self, question: str, **override_kw) -> str:
        """
        Parameters
        ----------
        description : str
            The user prompt (will be interpolated into `prompt_template`).
        override_kw  : Any extra sampling args for *this* call only
            E.g. temperature=0.7, top_p=0.8, presence_penalty=0.5 …

        Returns
        -------
        str : the assistant’s reply, stripped of whitespace.
        """
        prompt = self.prompt_template.format(description=question)

        # merge: instance defaults  <  per-call overrides
        gen_args = {**self._defaults, **override_kw}

        response = self.client.chat.completions.create(
            model=self.model_name,
            messages=[{"role": "user", "content": prompt}],
            **gen_args,
            n=1,
            stop=None,
        )
        return response.choices[0].message.content.strip()


In [4]:
# ── 1 · Shared backbone loader (identical patch for the quantization stub) ──
@lru_cache(maxsize=None)
def _load_base(
    model_name: str,
    torch_dtype: torch.dtype,
    device_map,
    quantize_4bit: bool,
    hf_token: Optional[str],
    trust_remote_code: bool,
):
    cfg = AutoConfig.from_pretrained(model_name, trust_remote_code=trust_remote_code)
    if not quantize_4bit and getattr(cfg, "quantization_config", None):
        cfg.quantization_config = None  # remove empty stub that breaks FP16 load

    q_cfg = (
        BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch_dtype,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
        )
        if quantize_4bit
        else None
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        config=cfg,
        device_map=device_map,
        torch_dtype=torch_dtype,
        quantization_config=q_cfg,
        token=hf_token,
        trust_remote_code=trust_remote_code,
    ).eval()

    tok = AutoTokenizer.from_pretrained(model_name, use_fast=True, token=hf_token)
    return model, tok


# ── 2 · Inference-only wrapper with optional PEFT adapter & chat support ────
class HuggingFaceLLM:
    """
    * If the tokenizer has a `chat_template`, `.generate()` uses it and returns
      only the assistant’s reply (no prompt, no stop tokens).
    * Otherwise it behaves like a classic completion model, using
      `prompt_template.format(description=question)`.
    """

    _ADAPTER_CACHE: dict[tuple[str, str], PeftModel] = {}

    def __init__(
        self,
        model_name: str,
        *,
        prompt_template: str = "{description}",   # used as *system* prompt in chat mode
        peft_adapter_dir: str | None = None,
        max_new_tokens: int = 512,
        device_map="auto",
        torch_dtype=torch.float16,
        quantize_4bit: bool = False,
        hf_token: Optional[str] = None,
        trust_remote_code: bool = True,
        offload_dir: str = "/kaggle/working/offload",
        deterministic: bool = True,
    ):
        self.prompt_template = prompt_template
        self.max_new_tokens = max_new_tokens

        # shared backbone & tokenizer
        self._base, self.tokenizer = _load_base(
            model_name,
            torch_dtype,
            device_map,
            quantize_4bit,
            hf_token,
            trust_remote_code,
        )

        self._gen_defaults = (
            {"do_sample": False} if deterministic else {}
        )

        # optional LoRA adapter (shared cache)
        if peft_adapter_dir:
            abs_adapter = os.path.abspath(peft_adapter_dir)
            key = (model_name, abs_adapter)
            if key not in HuggingFaceLLM._ADAPTER_CACHE:
                Path(offload_dir).mkdir(parents=True, exist_ok=True)
                HuggingFaceLLM._ADAPTER_CACHE[key] = PeftModel.from_pretrained(
                    self._base,
                    abs_adapter,
                    is_trainable=False,
                    device_map="auto",
                    offload_dir=offload_dir,
                ).eval()
            self.model = HuggingFaceLLM._ADAPTER_CACHE[key]
        else:
            self.model = self._base

        if deterministic:
            gc = self.model.generation_config
            gc.top_k = None
            gc.top_p = None
            gc.temperature = None

    # ── 3 · Build prompt either via chat template or classic string ----------

    def _build_prompt(self, question: str) -> str:
        if getattr(self.tokenizer, "chat_template", None):
            # treat prompt_template as system message (can be empty string)
            messages = []
            if self.prompt_template:
                messages.append({"role": "system", "content": self.prompt_template})
            messages.append({"role": "user", "content": question})

            return self.tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True,   # marks where assistant should start
            )
        # fallback: legacy completion mode
        return self.prompt_template.format(description=question)

    # ── 4 · Single-turn question → answer (only assistant text returned) -----
    @torch.inference_mode()
    def generate(self, question: str, **gen_kw) -> str:
        prompt = self._build_prompt(question)

        enc = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)
        out = self.model.generate(
            **enc,
            max_new_tokens=self.max_new_tokens,
            **self._gen_defaults,   # ← merge defaults
            **gen_kw,               #   with per-call overrides
        )[0]

        # Strip the prompt tokens so we keep only newly generated answer
        answer_ids = out[enc["input_ids"].shape[-1] :]
        return self.tokenizer.decode(answer_ids, skip_special_tokens=True).strip()


In [5]:
def benchmark_llm(model, test_df: pd.DataFrame, incorrect_log_path: str):
    total_samples = len(test_df)
    exact_matches = 0
    total_lev_distance = 0
    timings = []
    incorrect_predictions = []

    for _, row in tqdm(test_df.iterrows(), total=total_samples, desc="Benchmarking"):
        # Use the row's description from the DataFrame
        description = row['description']
        key_no_spaces = row['key'].replace(" ", "")

        start_time = time.perf_counter()
        # Pass the description to the generate function
        prediction = model.generate(question=description).strip()
        elapsed = time.perf_counter() - start_time
        timings.append(elapsed)

        if prediction == key_no_spaces:
            exact_matches += 1
        else:
            incorrect_predictions.append({
                "description": description,
                "expected": key_no_spaces,
                "predicted": prediction
            })

        total_lev_distance += levenshtein_distance(prediction, key_no_spaces)

    accuracy = exact_matches / total_samples * 100
    avg_lev_distance = total_lev_distance / total_samples
    mean_time = sum(timings) / total_samples
    median_time = sorted(timings)[total_samples // 2]

    # Logging incorrect predictions to a CSV file
    pd.DataFrame(incorrect_predictions).to_csv(incorrect_log_path, index=False)

    return pd.DataFrame([{
        "Model": model.__class__.__name__,
        "Accuracy": accuracy,
        "Avg_Levenshtein": avg_lev_distance,
        "Mean_Time": mean_time,
        "Median_Time": median_time
    }])

In [6]:
def benchmark_multiple_llms(models: dict, test_df: pd.DataFrame):
    results_df = pd.DataFrame()

    for model_name, model in models.items():
        print(f"\nRunning benchmark for {model_name}")
        incorrect_log_path = f"incorrect_predictions_{model_name}.csv"
        summary_df = benchmark_llm(model, test_df, incorrect_log_path)
        summary_df["Model"] = model_name
        results_df = pd.concat([results_df, summary_df], ignore_index=True)

    return results_df

In [7]:
prompt_template_default = (
    "How to {description} using vim motions? "
    "Write only the symbol sequence representing the vim motion. Don't write anything else"
    "Don't use spaces in your answer. "
    "Combination of control and <symbol> should be written as 'Ctrl+<symbol>'."
)

prompt_template_extended = ""
with open("/kaggle/input/ext-prompt/extended_prompt.txt", "r") as file:
    prompt_template_extended = file.read()

In [8]:
OPENAI_API_KEY = user_secrets.get_secret("openai_api")
DEEPSEEK_API_KEY =  user_secrets.get_secret("deepseek_api")

qwen_05_default_prompt = HuggingFaceLLM('Qwen/Qwen2.5-0.5B-Instruct', prompt_template=prompt_template_default)
qwen_05_extended_prompt = HuggingFaceLLM('Qwen/Qwen2.5-0.5B-Instruct', prompt_template=prompt_template_extended)

qwen_7_default_prompt = HuggingFaceLLM('Qwen/Qwen2.5-7B-Instruct', prompt_template=prompt_template_default)
qwen_7_extended_prompt = HuggingFaceLLM('Qwen/Qwen2.5-7B-Instruct', prompt_template=prompt_template_extended)

dir_05 = '/kaggle/input/0.5b/transformers/default/1'
qwen_05_finutined_default_prompt = HuggingFaceLLM('Qwen/Qwen2.5-0.5B-Instruct', peft_adapter_dir=dir_05, prompt_template=prompt_template_default)
qwen_05_finetuned_extended_prompt = HuggingFaceLLM('Qwen/Qwen2.5-0.5B-Instruct', prompt_template=prompt_template_extended)

dir_7 = '/kaggle/input/7b/transformers/default/1'
qwen_7_finetuned_default_prompt = HuggingFaceLLM('Qwen/Qwen2.5-7B-Instruct', peft_adapter_dir=dir_7, prompt_template=prompt_template_default)
qwen_7_finetuned_extended_prompt = HuggingFaceLLM('Qwen/Qwen2.5-7B-Instruct', prompt_template=prompt_template_extended)

gpt41_default_prompt = LLMClient("gpt-4.1", api_key=OPENAI_API_KEY, prompt_template=prompt_template_default)
gpt41mini_default_prompt = LLMClient("gpt-4.1-mini", api_key=OPENAI_API_KEY, prompt_template=prompt_template_default)
gpt41nano_default_prompt = LLMClient("gpt-4.1-nano", api_key=OPENAI_API_KEY, prompt_template=prompt_template_default)

gpt41_extended_prompt = LLMClient("gpt-4.1", api_key=OPENAI_API_KEY, prompt_template=prompt_template_extended)
gpt41mini_extended_prompt = LLMClient("gpt-4.1-mini", api_key=OPENAI_API_KEY, prompt_template=prompt_template_extended)
gpt41nano_extended_prompt = LLMClient("gpt-4.1-nano", api_key=OPENAI_API_KEY, prompt_template=prompt_template_extended)

deepseek_default_prompt = LLMClient("deepseek-chat", base_url="https://api.deepseek.com", api_key=DEEPSEEK_API_KEY, prompt_template=prompt_template_default)
deepseek_extended_prompt = LLMClient("deepseek-chat", base_url="https://api.deepseek.com", api_key=DEEPSEEK_API_KEY, prompt_template=prompt_template_extended)

llm_instances = {
    "deepseek_v3_default_prompt": deepseek_default_prompt,
    "deepseek_v3_extended_prompt": deepseek_extended_prompt,
    "gpt_4.1_default_prompt": gpt41_default_prompt,
    "gpt_4.1_mini_default_prompt": gpt41mini_default_prompt,
    "gpt_4.1_nano_default_prompt": gpt41nano_default_prompt,
    "gpt_4.1_extended_prompt": gpt41_extended_prompt,
    "gpt_4.1_mini_extended_prompt": gpt41mini_extended_prompt,
    "gpt_4.1_nano_extended_prompt": gpt41nano_extended_prompt,
    "qwen_0.5B_default_prompt": qwen_05_default_prompt,
    "qwen_0.5B_extended_prompt": qwen_05_extended_prompt,
    "qwen_7B_default_prompt": qwen_7_default_prompt,
    "qwen_7B_extended_prompt": qwen_7_extended_prompt,
    "qwen_05_finutined_default_prompt": qwen_05_finutined_default_prompt,
    "qwen_05_finetuned_extended_prompt": qwen_05_finetuned_extended_prompt,
    "qwen_7_finetuned_default_prompt": qwen_7_finetuned_default_prompt,
    "qwen_7_finetuned_extended_prompt": qwen_7_finetuned_extended_prompt,
}

config.json:   0%|          | 0.00/659 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/988M [00:00<?, ?B/s]

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


generation_config.json:   0%|          | 0.00/242 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/663 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/3.95G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/3.56G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/3.86G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/243 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/7.30k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [9]:
for llm in llm_instances:
    print('-' * 50)
    print(llm)
    print(llm_instances[llm].generate('Delete current line'))

--------------------------------------------------
deepseek_v3_default_prompt
dd
--------------------------------------------------
deepseek_v3_extended_prompt
dd
--------------------------------------------------
gpt_4.1_default_prompt
dd
--------------------------------------------------
gpt_4.1_mini_default_prompt
dd
--------------------------------------------------
gpt_4.1_nano_default_prompt
dd
--------------------------------------------------
gpt_4.1_extended_prompt
dd
--------------------------------------------------
gpt_4.1_mini_extended_prompt
dd
--------------------------------------------------
gpt_4.1_nano_extended_prompt
dd
--------------------------------------------------
qwen_0.5B_default_prompt
d1q
--------------------------------------------------
qwen_0.5B_extended_prompt
dl`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`1`

In [10]:
from sklearn.model_selection import train_test_split
import random
import numpy as np

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

DATA_PATH = "/kaggle/input/vim-data/final.csv"
df = pd.read_csv(DATA_PATH).dropna(subset=["description", "key"])
_, val_df = train_test_split(df, test_size=0.1, random_state=SEED)

results = benchmark_multiple_llms(models=llm_instances, test_df=val_df)
results.to_csv(index=False)


Running benchmark for deepseek_v3_default_prompt


Benchmarking: 100%|██████████| 264/264 [18:41<00:00,  4.25s/it]



Running benchmark for deepseek_v3_extended_prompt


Benchmarking: 100%|██████████| 264/264 [21:42<00:00,  4.93s/it]



Running benchmark for gpt_4.1_default_prompt


Benchmarking: 100%|██████████| 264/264 [02:58<00:00,  1.48it/s]



Running benchmark for gpt_4.1_mini_default_prompt


Benchmarking: 100%|██████████| 264/264 [02:42<00:00,  1.63it/s]



Running benchmark for gpt_4.1_nano_default_prompt


Benchmarking: 100%|██████████| 264/264 [01:45<00:00,  2.49it/s]



Running benchmark for gpt_4.1_extended_prompt


Benchmarking: 100%|██████████| 264/264 [12:27<00:00,  2.83s/it]



Running benchmark for gpt_4.1_mini_extended_prompt


Benchmarking: 100%|██████████| 264/264 [02:40<00:00,  1.64it/s]



Running benchmark for gpt_4.1_nano_extended_prompt


Benchmarking: 100%|██████████| 264/264 [02:29<00:00,  1.77it/s]



Running benchmark for qwen_0.5B_default_prompt


Benchmarking: 100%|██████████| 264/264 [34:12<00:00,  7.78s/it]



Running benchmark for qwen_0.5B_extended_prompt


Benchmarking: 100%|██████████| 264/264 [51:42<00:00, 11.75s/it]



Running benchmark for qwen_7B_default_prompt


Benchmarking: 100%|██████████| 264/264 [30:20<00:00,  6.90s/it]



Running benchmark for qwen_7B_extended_prompt


Benchmarking: 100%|██████████| 264/264 [10:09<00:00,  2.31s/it]



Running benchmark for qwen_05_finutined_default_prompt


Benchmarking: 100%|██████████| 264/264 [34:07<00:00,  7.76s/it]



Running benchmark for qwen_05_finetuned_extended_prompt


Benchmarking: 100%|██████████| 264/264 [51:45<00:00, 11.76s/it]



Running benchmark for qwen_7_finetuned_default_prompt


Benchmarking: 100%|██████████| 264/264 [30:20<00:00,  6.90s/it]



Running benchmark for qwen_7_finetuned_extended_prompt


Benchmarking: 100%|██████████| 264/264 [10:07<00:00,  2.30s/it]


'Model,Accuracy,Avg_Levenshtein,Mean_Time,Median_Time\ndeepseek_v3_default_prompt,20.075757575757574,9.325757575757576,4.247133955606052,4.089783183000009\ndeepseek_v3_extended_prompt,37.878787878787875,19.234848484848484,4.932908440916654,4.49480037100011\ngpt_4.1_default_prompt,25.757575757575758,3.1893939393939394,0.6767796670340975,0.5692824739999196\ngpt_4.1_mini_default_prompt,23.484848484848484,3.8333333333333335,0.6139181527537989,0.5389309619999949\ngpt_4.1_nano_default_prompt,12.878787878787879,5.84469696969697,0.3999423015113614,0.3238598909997563\ngpt_4.1_extended_prompt,42.04545454545455,1.7878787878787878,2.8289928292083304,3.0402434660004474\ngpt_4.1_mini_extended_prompt,31.818181818181817,2.446969696969697,0.6082877695795539,0.49711389999993116\ngpt_4.1_nano_extended_prompt,17.803030303030305,6.026515151515151,0.5638452176969353,0.47139139599948976\nqwen_0.5B_default_prompt,0.7575757575757576,237.54545454545453,7.7752845473105925,0.5078549789996032\nqwen_0.5B_extended_p