# LMSYS ZeroShot Prediction

This notebook uses Llama 3 to make zero-shot predictions.

Instead of training a classification head, it uses a carefully crafted prompt and reads the model's next-token logits to decide which of the tokens **A**, **B**, or **tie** is most likely to follow the `### Answer` line.









In [None]:
import json
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import pickle
import random
import os
import sys

import transformers
from transformers import AdamW
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset

import sklearn.metrics
from sklearn.metrics import accuracy_score

# Opt in to parallelism inside the Hugging Face `tokenizers` library.
os.environ["TOKENIZERS_PARALLELISM"] = "true"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
class CFG:
    """Notebook-wide settings, read as plain class attributes (never instantiated)."""

    # Reproducibility: seed used when sampling the validation subset.
    SEED = 42
    # Index of the conversation turn inspected when deciding whether to skip a row.
    USE_TURN = 0

    # Local checkpoint directory handed to `from_pretrained`.
    MODEL_ID = "/kaggle/input/llama-3/transformers/8b-hf/1"
    # Directory that contains the competition's train.csv.
    INPUT_DIR = "/kaggle/input/lmsys-chatbot-arena/"
    # Kaggle working directory.
    OUTPUT = "/kaggle/working"

In [None]:
# Read the competition training set; the bare expression below renders the
# frame as the cell output.
train_df = pd.read_csv(filepath_or_buffer=f"{CFG.INPUT_DIR}/train.csv")
train_df

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the causal LM in half precision and let `device_map="auto"` decide
# where the weights are placed.
model_kwargs = {
    "torch_dtype": torch.float16,
    "device_map": "auto",
}
model = AutoModelForCausalLM.from_pretrained(CFG.MODEL_ID, **model_kwargs)

In [None]:
# Load the tokenizer stored next to the model checkpoint. Left padding means
# that, should sequences ever be padded into a batch, the last position of
# every row still holds the final prompt token whose logits are read.
tokenizer = AutoTokenizer.from_pretrained(
    CFG.MODEL_ID,
    use_fast=False,
    trust_remote_code=True,
    padding_side="left",
)

# Reuse the end-of-sequence token for padding. "<|endoftext|>" is not one of
# Llama 3's special tokens (its end marker is "<|end_of_text|>"), so passing it
# as `pad_token` registers a brand-new id that the model has no embedding row
# for and that disagrees with the pad id configured on the model.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.pad_token_id

In [None]:
# Evaluate on a reproducible random fifth of the training rows.
n_valid = train_df.shape[0] // 5
sample_df = train_df.sample(n=n_valid, random_state=CFG.SEED)
sample_df.head()

In [None]:
def _head_tail(text, n_chars):
    """Shorten `text` to its first and last `n_chars` characters.

    Texts of at most ``2 * n_chars`` characters are returned unchanged:
    slicing head and tail unconditionally would make the two slices overlap
    and repeat the same content twice in the prompt.
    """
    if len(text) <= 2 * n_chars:
        return text
    return text[:n_chars] + " ... " + text[-n_chars:]


def _join_turns(turns):
    """Join every turn of a conversation into one string; null turns become "none"."""
    return " ... ".join("none" if turn is None else turn for turn in turns)


def _build_text(prompt_turns, response_a_turns, response_b_turns):
    """Render the zero-shot prompt that ends right before the expected answer token."""
    p = _head_tail(_join_turns(prompt_turns), 128)
    a = _head_tail(_join_turns(response_a_turns), 256)
    b = _head_tail(_join_turns(response_b_turns), 256)
    return f"""### Instruction
Which model's answer is appropriate for the prompt?　If both are appropriate, answer `tie`.

### Prompt
{p}

### A
{a}

### B
{b}

### Answer
"""


# Vocabulary ids of the three candidate answers, in the same order as the
# target columns (winner_model_a, winner_model_b, winner_tie).
# `add_special_tokens=False` keeps BOS/EOS ids out of the list, so exactly one
# id per label is selected; if a label were split into several sub-tokens, the
# first one is what the model would have to emit next.
# Computed once because it does not depend on the row.
pred_token_id = [
    tokenizer.encode(label, add_special_tokens=False)[0]
    for label in ("A", "B", "tie")
]

results = []
for _, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    prompt = json.loads(row["prompt"])
    response_a = json.loads(row["response_a"])
    response_b = json.loads(row["response_b"])

    p = prompt[CFG.USE_TURN]
    a = response_a[CFG.USE_TURN]
    b = response_b[CFG.USE_TURN]

    # Skip rows whose inspected turn is missing or whose prompt has fewer
    # than three whitespace-separated words.
    if p is None or a is None or b is None or len(p.split()) < 3:
        continue

    text = _build_text(prompt, response_a, response_b)

    # Shape (1, seq_len), placed on the model's device rather than a
    # hard-coded CUDA device.
    input_ids = tokenizer(text, return_tensors="pt")["input_ids"].to(model.device)

    with torch.no_grad():
        out = model(input_ids)

    # Logits at the last position score the token that would follow
    # "### Answer\n". Softmax is taken in float32: float16 softmax is imprecise
    # and not implemented on CPU in some PyTorch versions.
    pred = out.logits[0, -1, pred_token_id].float().softmax(dim=0).cpu().numpy()

    d = row.to_dict()
    d["predict"] = pred
    results.append(d)

In [None]:
# One row per scored sample: the original columns plus the "predict" vector.
results_df = pd.DataFrame(data=results)
results_df

In [None]:
# Persist the scored rows (without the index column) in the current directory.
results_df.to_csv(path_or_buf="result.csv", index=None)

In [None]:
# IPython shell escape: list the current directory, e.g. to confirm that
# result.csv was written.
!ls

In [None]:
# Ground-truth winner indicator columns, ordered A / B / tie.
targets = results_df.loc[:, ["winner_model_a", "winner_model_b", "winner_tie"]].to_numpy()

# Combine the per-row prediction vectors into one array.
predicts = np.array(list(results_df["predict"]))

In [None]:
# Log loss of the zero-shot predictions against the winner labels.
logloss = sklearn.metrics.log_loss(y_true=targets, y_pred=predicts)
print(logloss)

In [None]:
# IPython shell escape: list the contents of the current directory.
!ls

### Next steps:

- Try a larger model (with better benchmark accuracy)
- Refine the prompt.
- SFT (Supervised Fine-Tuning) on competition data
- Use an ensemble of models.