In [None]:
%%writefile install_packages.py

def install_packages():
    """Install vLLM and its companion wheels from the attached local datasets.

    The pip commands run one after another in a fixed order. A command that
    exits with a non-zero status is reported on stderr, and the remaining
    commands are still attempted, so the routine stays best-effort.
    """
    import os
    import sys

    commands = [
        # Remove the preinstalled torch packages before installing vllm.
        'pip uninstall -y torch',
        'pip uninstall -y torchvision',
        'pip install -q --no-index --find-links=/kaggle/input/0-6-3-post1-wheels-vllm vllm',
        'pip install -q --no-index -U --upgrade /kaggle/input/vllm-t4-fix/grpcio-1.62.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl',
        'pip install -q --no-index -U --upgrade /kaggle/input/vllm-t4-fix/ray-2.11.0-cp310-cp310-manylinux2014_x86_64.whl',
        # Swap pynvml for the nvidia-ml-py wheel shipped with the vllm wheels.
        'pip uninstall -y pynvml',
        'pip install --no-deps --no-index /kaggle/input/0-6-3-post1-wheels-vllm/nvidia_ml_py-12.560.30-py3-none-any.whl',
        'pip install --no-deps --no-index /kaggle/input/logits-processor-zoo/logits_processor_zoo-0.1.0-py3-none-any.whl',
    ]

    for command in commands:
        status = os.system(command)
        if status != 0:
            # Surface the failure here; otherwise it only shows up later as an
            # ImportError far away from its cause.
            print(f'WARNING: command failed (status {status}): {command}', file=sys.stderr)

install_packages()

In [None]:
try:
    import vllm
    print('vllm version=',vllm.__version__)
except ImportError:
    print('Installing packages')
    !python install_packages.py
    import vllm
    print('----'*10,'\nvllm version=',vllm.__version__)

In [None]:
import pandas as pd
import numpy as np
import os

# Environment variables set here are inherited by the scripts launched with
# `!python` later on; run_vllm.py reads `model_path` back with os.getenv.
os.environ.update({
    "TOKENIZERS_PARALLELISM": "false",
    "model_path": "/kaggle/input/qwen2.5-14b-instruct-gptq-int8/transformers/default/1",
})

# True whenever KAGGLE_IS_COMPETITION_RERUN is set to a non-empty value.
IS_SUBMISSION = os.getenv("KAGGLE_IS_COMPETITION_RERUN") not in (None, "")

# When True, inference also runs outside a submission so the accuracy cell can
# score the 100 sampled training rows.
run_eval = True

In [None]:
%%writefile run_vllm.py

import sys
import re
import gc
import vllm
import pandas as pd
import os
from transformers import AutoTokenizer, AutoModel
from logits_processor_zoo.vllm import MultipleChoiceLogitsProcessor


# Any non-empty KAGGLE_IS_COMPETITION_RERUN value selects the test split.
IS_SUBMISSION = bool(os.environ.get("KAGGLE_IS_COMPETITION_RERUN"))
# Two device ids, matching tensor_parallel_size=2 used for the engine below.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

if not IS_SUBMISSION:
    # Local run: score a fixed 100-row sample of the training data and keep
    # the ground-truth label in a separate column, since 'winner' is
    # overwritten with predictions later.
    df = (
        pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/train.parquet')
        .sample(n=100, random_state=42)
        .reset_index(drop=True)
        .copy()
    )
    df['winner_GT'] = df['winner']
    print('Length of df=',len(df))
else:
    df = pd.read_parquet('/kaggle/input/wsdm-cup-multilingual-chatbot-arena/test.parquet')

def apply_template(row, tokenizer):
    """Build the judge prompt for one comparison row.

    Args:
        row: Mapping-like object with 'prompt', 'response_a' and 'response_b'.
        tokenizer: Object exposing ``apply_chat_template``.

    Returns:
        The chat-templated text (not tokenized, generation prompt added)
        followed by the literal suffix ' Choice: '.
    """
    judge_instructions = "\n".join([
        "You are a highly skilled judge tasked with selecting the best response to a given query.",
        "",
        "Input Format:",
        "<Query>",
        "[The question that needs to be answered.]",
        "</Query>",
        "",
        "<Response_A> ",  # the trailing space is part of the prompt text
        "[The response from the first candidate.]",
        "</Response_A>",
        "",
        "<Response_B>",
        "[The response from the second candidate.]",
        "</Response_B>",
        "",
        "Your Task:",
        "Carefully analyze both <Response_A> and <Response_B> in relation to the Query.",
        "Determine which response provides a better answer based on:",
        "- Helpfulness",
        "- Accuracy",
        "- Relevance",
        "",
        "Output:",
        "Respond with only a single letter:",
        "- A if <Response_A> is better.",
        "- B if <Response_B> is better.",
        "",
        "Important Notes:",
        "- Provide only the letter A or B as your response.",
        "- No explanations are needed.",
    ])

    divider = '---' * 10
    # The final empty entry makes the joined text end with a newline.
    user_request = "\n".join([
        "Here is your input to process now-",
        "",
        "<Query>",
        f"{row['prompt']}",
        "</Query>",
        divider,
        "<Response_A>",
        f"{row['response_a']}",
        "</Response_A>",
        divider,
        "<Response_B>",
        f"{row['response_b']}",
        "</Response_B>",
        "",
    ])

    messages = [
        {"role": "system", "content": judge_instructions},
        {"role": "user", "content": user_request},
    ]
    templated = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    return templated + ' Choice: '

def winner(x, default='model_a'):
    """Map a generated choice letter to its submission label.

    Args:
        x: Raw text generated by the model, expected to be "A" or "B".
            Surrounding whitespace and letter case are ignored.
        default: Label returned when ``x`` is not a recognisable choice,
            for example an empty generation. This keeps every row of the
            submission a valid label instead of something like 'model_'.

    Returns:
        'model_a' or 'model_b' for a valid choice, otherwise ``default``.
    """
    choice = (x or '').strip().lower()
    if choice in ('a', 'b'):
        return f'model_{choice}'
    return default


# Checkpoints that get the CPU-offload configuration below.
_OFFLOAD_MODEL_PATHS = (
    '/kaggle/input/meta-llama-3.3-70b/transformers/ibnzterrell-instruct-awq-int4/1',
    '/kaggle/input/qwen-72b-gptq-int4/transformers/qwen2.5-72b-instruct-gptq-int4/1',
)

# The checkpoint location is handed over by the notebook through the environment.
model_path = os.getenv('model_path')

if model_path not in _OFFLOAD_MODEL_PATHS:
    # Default settings: no CPU offload, longer context, larger batch.
    offload, swap, max_len, num_seqs = 0, 5, 10000, 256
else:
    print("Offload needed")
    # Offload settings: shorter context and fewer concurrent sequences.
    offload, swap, max_len, num_seqs = 8.5, 1, 4000, 35


# Engine options collected in one mapping; the model-dependent values
# (context length, offload, swap, batch size) come from the branch above.
engine_options = dict(
    model=model_path,
    tensor_parallel_size=2,
    gpu_memory_utilization=0.99,
    trust_remote_code=True,
    dtype="half",
    enforce_eager=True,
    max_model_len=max_len,
    disable_log_stats=True,
    cpu_offload_gb=offload,
    swap_space=swap,
    device='cuda',
    max_num_seqs=num_seqs,
    enable_prefix_caching=True,
)
llm = vllm.LLM(**engine_options)

tokenizer = AutoTokenizer.from_pretrained(model_path)

# Render each row into its full chat-formatted prompt string.
df["text"] = df.apply(apply_template, axis=1, args=(tokenizer,))

def tok_len(txt):
    """Return how many token ids the tokenizer produces for ``txt``."""
    token_ids = tokenizer.encode(txt)
    return len(token_ids)

df['token_length'] = df['text'].map(tok_len)

# Print the prompt-length distribution and one rendered prompt for inspection.
length_stats = df['token_length'].describe()
print("Stats of df",length_stats)
first_prompt = df["text"][0]
print('Example input-\n',first_prompt)

# One generated token per prompt, temperature 0 / top_k 1, with a
# MultipleChoiceLogitsProcessor configured for the choices "A" and "B".
choice_processor = MultipleChoiceLogitsProcessor(tokenizer, choices=["A","B"])
sampling_params = vllm.SamplingParams(
    n=1,
    top_k=1,
    temperature=0,
    seed=777,
    skip_special_tokens=False,
    max_tokens=1,
    logits_processors=[choice_processor],
)
responses = llm.generate(df["text"].values, sampling_params, use_tqdm=True)

# Take the text of the first (only) completion of every request.
raw_choices = [response.outputs[0].text for response in responses]
print('Raw responses: ',raw_choices)
df["winner"] = [winner(choice) for choice in raw_choices]

# Local runs also write the ground-truth column so accuracy can be computed.
output_columns = ["id", "winner"] if IS_SUBMISSION else ["id", "winner", "winner_GT"]
df.to_csv("submission.csv", columns=output_columns, index=False)

In [None]:
if IS_SUBMISSION or run_eval:
    !python run_vllm.py

In [None]:
# Local evaluation only: compare predictions with the ground-truth column
# that run_vllm.py wrote into submission.csv.
if run_eval and not IS_SUBMISSION:
    df = pd.read_csv('submission.csv')
    is_correct = df['winner'].eq(df['winner_GT'])
    correct_preds = is_correct.sum()
    total_preds = df.shape[0]
    acc = correct_preds / total_preds

    print(f"Accuracy: {acc}")