## LMSYS COMPETITION NOTEBOOK

Forked from https://www.kaggle.com/code/nabojyotipandey/dual-model-inference-gemma2-9b-llama3-8b, then tuned further.

DATASETS LOADED IN ENVIRONMENT:
- LMSYS
- Llama3
- Gemma2
- bitsandbytes

This project predicts which response users prefer in head-to-head battles between large language models (LLMs). Using data from the LMSYS Chatbot Arena Kaggle competition, it ensembles two pre-trained models, Gemma2 and Llama3 (each loaded with LoRA adapter weights), by taking a weighted average of their predicted class probabilities.


In [None]:
# Install/upgrade the inference dependencies offline: --no-index disables the
# package index and --find-links resolves packages from the given wheel directory.
!pip install transformers peft accelerate bitsandbytes -U --no-index --find-links /kaggle/input/lmsys-wheel-files

In [None]:
# IMPORTS
import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import torch
import sklearn
import numpy as np
import pandas as pd
from transformers import (
    Gemma2ForSequenceClassification, GemmaTokenizerFast,
    AutoTokenizer, LlamaForSequenceClassification, BitsAndBytesConfig
)
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel, get_peft_model, LoraConfig, TaskType

# Turn on PyTorch's memory-efficient and flash scaled-dot-product-attention backends.
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cuda.enable_flash_sdp(True)

## CONFIG

In [None]:
# CONFIGURATION CLASS
@dataclass
class Config:
    """Model locations and inference settings used throughout this notebook."""

    # Directory passed to `from_pretrained` for both the Gemma tokenizer and the Gemma model.
    gemma_dir: str = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    # LoRA checkpoint attached to the Gemma model via `PeftModel.from_pretrained`.
    gemma_lora_dir: str = '/kaggle/input/73zap2gx/checkpoint-5748'
    # Directory passed to `LlamaForSequenceClassification.from_pretrained`.
    llama_model_name: str = '/kaggle/input/llama-3/transformers/8b-chat-hf/1'
    # File read with `torch.load` and applied to the Llama PEFT model via `load_state_dict`.
    llama_weights_path: str = '/kaggle/input/lmsys-model/model'
    # Default token budget for `tokenize` (also the default of `inference`'s `max_length`).
    max_length: int = 2048
    # Default number of rows per forward pass in `inference`.
    batch_size: int = 4
    # NOTE(review): not referenced anywhere in the visible code.
    tta: bool = False
    # Default for `tokenize`: when True, prompt and both responses each get max_length // 3 tokens.
    spread_max_length: bool = False

# Instantiate the configuration
cfg = Config()

## LOAD DATA

In [None]:
# LOAD AND PROCESS TEST DATA
# Read the competition test set into a DataFrame.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

def process_text(text: str) -> str:
    """Decode a serialized list of strings and join its items with single spaces.

    ``text`` is expected to hold a list literal such as ``'["Hi", null, "Bye"]'``.
    ``null`` entries are mapped to empty strings before joining.

    Args:
        text: String containing a list literal of strings (and possibly ``null``).

    Returns:
        The list items joined by ``" "``.

    Raises:
        NameError: If ``text`` references any name other than ``null``.
        SyntaxError: If ``text`` is not a valid Python expression.
    """
    # SECURITY NOTE(review): eval() executes whatever expression the CSV cell
    # holds. Emptying __builtins__ keeps the expression from reaching
    # __import__, open, etc., but this is still eval -- only run it on trusted
    # data. A different parser (json.loads / ast.literal_eval) may decode
    # escape sequences differently, so check it against the preprocessing used
    # at training time before swapping it in.
    return " ".join(eval(text, {"__builtins__": {}, "null": ""}))

# Decode each serialized conversation column into plain text, in place
for text_column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, text_column] = test[text_column].apply(process_text)

## TOKENIZE

In [None]:
# TOKENIZE FUNCTION
def tokenize(tokenizer, prompt, response_a, response_b,
             max_length=cfg.max_length, spread_max_length=cfg.spread_max_length):
    """Turn prompt/response triples into token ids and attention masks.

    Each text is prefixed with a tokenizer-specific tag (one template for
    ``GemmaTokenizerFast``, another for every other tokenizer) before encoding.

    Args:
        tokenizer: Callable tokenizer returning an object with ``input_ids``
            (and ``attention_mask``) attributes.
        prompt: Iterable of prompt strings.
        response_a: Iterable of first-response strings.
        response_b: Iterable of second-response strings.
        max_length: Token budget per example.
        spread_max_length: If True, prompt and both responses are encoded
            separately with ``max_length // 3`` tokens each and concatenated;
            otherwise the concatenated text is encoded once with ``max_length``.

    Returns:
        Tuple ``(input_ids, attention_mask)``, one entry per example, unpadded.
    """
    # Pick the text template that matches the tokenizer family
    if isinstance(tokenizer, GemmaTokenizerFast):
        prompt_tag, a_tag, b_tag = "<prompt>: ", "\n\n<response_a>: ", "\n\n<response_b>: "
    else:
        prompt_tag, a_tag, b_tag = "User prompt: ", "\n\nModel A :\n", "\n\n--------\n\nModel B:\n"

    prompts = [prompt_tag + p for p in prompt]
    answers_a = [a_tag + a for a in response_a]
    answers_b = [b_tag + b for b in response_b]

    if not spread_max_length:
        # Single pass: encode the concatenated text with the full budget
        joined = [p + a + b for p, a, b in zip(prompts, answers_a, answers_b)]
        encoded = tokenizer(joined, max_length=max_length, truncation=True, padding=False)
        return encoded.input_ids, encoded.attention_mask

    # Spread mode: every segment is truncated to a third of the budget
    segment_budget = max_length // 3

    def encode_segment(texts):
        return tokenizer(texts, max_length=segment_budget, truncation=True, padding=False).input_ids

    prompt_ids = encode_segment(prompts)
    a_ids = encode_segment(answers_a)
    b_ids = encode_segment(answers_b)

    input_ids = [p + a + b for p, a, b in zip(prompt_ids, a_ids, b_ids)]
    # No padding has been applied, so every position is attended to
    attention_mask = [[1] * len(ids) for ids in input_ids]
    return input_ids, attention_mask

In [None]:
# Gemma tokenizer (add_eos_token switched on, padding on the right)
gemma_tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
gemma_tokenizer.add_eos_token = True
gemma_tokenizer.padding_side = "right"

# Llama tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained('/kaggle/input/lmsys-model/tokenizer')

# Gemma inputs: ids, masks and token counts keyed by the test ids
gemma_ids, gemma_mask = tokenize(
    gemma_tokenizer, test["prompt"], test["response_a"], test["response_b"]
)
gemma_data = pd.DataFrame()
gemma_data["id"] = test["id"]
gemma_data["input_ids"] = gemma_ids
gemma_data["attention_mask"] = gemma_mask
gemma_data["length"] = gemma_data["input_ids"].map(len)

# Llama inputs: same layout, built with the Llama tokenizer
llama_ids, llama_mask = tokenize(
    llama_tokenizer, test["prompt"], test["response_a"], test["response_b"]
)
llama_data = pd.DataFrame()
llama_data["id"] = test["id"]
llama_data["input_ids"] = llama_ids
llama_data["attention_mask"] = llama_mask
llama_data["length"] = llama_data["input_ids"].map(len)

## LOAD MODELS

In [None]:
# Gemma runs on the first GPU
device_0 = torch.device('cuda:0')

# Base sequence-classification model, placed on GPU 0 with use_cache=False
gemma_base_model = Gemma2ForSequenceClassification.from_pretrained(
    cfg.gemma_dir, device_map=device_0, use_cache=False
)

# Wrap the base model with the LoRA adapter stored in the checkpoint directory
gemma_model = PeftModel.from_pretrained(gemma_base_model, cfg.gemma_lora_dir)

In [None]:
# Load Llama model on GPU 1
device_1 = torch.device('cuda:1')

# Configure BitsAndBytes for 8-bit loading.
# BitsAndBytesConfig exposes compute-dtype / double-quant options only for
# 4-bit loading (`bnb_4bit_*`), so no further options are set here.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

# Load the Llama model for sequence classification (3 labels) onto GPU 1
llama_base_model = LlamaForSequenceClassification.from_pretrained(
    cfg.llama_model_name,
    num_labels=3,
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map=device_1,
)
# The classification head needs the pad token id to locate the last real token of padded rows
llama_base_model.config.pad_token_id = llama_tokenizer.pad_token_id

# Recreate the LoRA structure so the checkpoint weights have somewhere to load into
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.10,
    bias='none',
    inference_mode=True,
    task_type=TaskType.SEQ_CLS,
    target_modules=['o_proj', 'v_proj'],
)
llama_model = get_peft_model(llama_base_model, peft_config).to(device_1)

# Deserialize onto the CPU so the checkpoint never lands on whichever GPU it was
# saved from; load_state_dict copies the values into the parameters on GPU 1.
llama_state_dict = torch.load(cfg.llama_weights_path, map_location='cpu')

# strict=False: presumably the checkpoint holds only the fine-tuned weights, so
# missing base-model keys are expected -- TODO confirm. Keys the model does not
# recognise are NOT expected and would mean those weights were silently skipped.
llama_load_result = llama_model.load_state_dict(llama_state_dict, strict=False)
if llama_load_result.unexpected_keys:
    print(
        f"WARNING: {len(llama_load_result.unexpected_keys)} checkpoint keys were not "
        f"loaded into the Llama model, e.g. {llama_load_result.unexpected_keys[:5]}"
    )
del llama_state_dict

llama_model.eval()

## INFERENCE

In [None]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, tokenizer, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    """Score every row of ``df`` with ``model`` and attach class probabilities.

    Runs with gradients disabled and under CUDA autocast (see the decorators).

    Args:
        df: DataFrame with ``input_ids`` and ``attention_mask`` columns holding
            unpadded token lists.
        model: Model called as ``model(**inputs)`` whose output has ``logits``.
        tokenizer: Tokenizer used to pad each batch to its longest sequence.
        device: Device the padded batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.
        max_length: Accepted for interface compatibility; not used in the body.

    Returns:
        The same ``df`` object, modified in place with ``winner_model_a``,
        ``winner_model_b`` and ``winner_tie`` columns taken from softmax
        columns 0, 1 and 2 respectively.
    """
    probability_rows = []  # one list of class probabilities per example, in df order

    for offset in range(0, len(df), batch_size):
        # iloc clamps at the end of the frame, so the final batch may be shorter
        batch = df.iloc[offset:offset + batch_size]
        features = {
            "input_ids": batch["input_ids"].to_list(),
            "attention_mask": batch["attention_mask"].to_list(),
        }
        # Pad to the longest sequence in the batch without the fast-tokenizer warning
        padded = pad_without_fast_tokenizer_warning(
            tokenizer,
            features,
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        logits = model(**padded.to(device)).logits
        probability_rows.extend(logits.softmax(-1).cpu().tolist())

    # Split the per-example rows into the three output columns
    df["winner_model_a"] = [row[0] for row in probability_rows]
    df["winner_model_b"] = [row[1] for row in probability_rows]
    df["winner_tie"] = [row[2] for row in probability_rows]

    return df

## VALIDATION TESTING AND WEIGHTING

In [None]:
# VALIDATION TESTING FOR MODEL WEIGHTING
#
# The triple-quoted string below is disabled code: it is a bare string literal
# and is never executed. It describes how the ensemble weights were derived:
# run both models on a 20% hold-out of train.csv, compute each model's log loss,
# and give each model the *other* model's share of the total loss (so the model
# with the lower loss receives the larger weight).
#
# NOTE(review): inside the disabled code the validation frames are sorted by
# token length before inference, but log_loss is computed against
# val_data['labels'] in its unsorted order, so predictions and labels appear to
# be misaligned row-wise -- confirm before trusting the recorded weights.
# NOTE(review): train_data_gemma / train_data_llama are tokenized in the
# disabled code but not used afterwards.

"""
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

# Load the training data
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
train.loc[:, 'prompt'] = train['prompt'].apply(process_text)
train.loc[:, 'response_a'] = train['response_a'].apply(process_text)
train.loc[:, 'response_b'] = train['response_b'].apply(process_text)

# Prepare the labels for evaluation later
train["labels"] = train.apply(lambda row: [row['winner_model_a'], row['winner_model_b'], row['winner_tie']], axis=1)

# Split the dataset into training and validation sets
train_data, val_data = train_test_split(train, test_size=0.2, random_state=42)

# Tokenize the training data for each model
train_data_gemma = train_data.copy()
train_data_llama = train_data.copy()

train_data_gemma["input_ids"], train_data_gemma["attention_mask"] = tokenize(
    gemma_tokenizer, train_data_gemma["prompt"], train_data_gemma["response_a"], train_data_gemma["response_b"]
)

train_data_llama["input_ids"], train_data_llama["attention_mask"] = tokenize(
    llama_tokenizer, train_data_llama["prompt"], train_data_llama["response_a"], train_data_llama["response_b"]
)

# Tokenize the validation data for each model
val_data_gemma = val_data.copy()
val_data_llama = val_data.copy()

val_data_gemma["input_ids"], val_data_gemma["attention_mask"] = tokenize(
    gemma_tokenizer, val_data_gemma["prompt"], val_data_gemma["response_a"], val_data_gemma["response_b"]
)

val_data_llama["input_ids"], val_data_llama["attention_mask"] = tokenize(
    llama_tokenizer, val_data_llama["prompt"], val_data_llama["response_a"], val_data_llama["response_b"]
)

# Calculate the length of each input sequence for sorting
val_data_gemma["length"] = val_data_gemma["input_ids"].apply(len)
val_data_llama["length"] = val_data_llama["input_ids"].apply(len)

# Sort by input length for efficient batching
val_data_gemma = val_data_gemma.sort_values("length", ascending=False)
val_data_llama = val_data_llama.sort_values("length", ascending=False)

# Perform inference for both models using the correct devices
val_gemma_result_df = inference(val_data_gemma, gemma_model, gemma_tokenizer, device_0)
val_llama_result_df = inference(val_data_llama, llama_model, llama_tokenizer, device_1)

# Calculate validation metrics (using log loss)
gemma_log_loss = log_loss(val_data['labels'].tolist(), val_gemma_result_df[['winner_model_a', 'winner_model_b', 'winner_tie']].values)
llama_log_loss = log_loss(val_data['labels'].tolist(), val_llama_result_df[['winner_model_a', 'winner_model_b', 'winner_tie']].values)

# Print the log loss for both models
print(f"Gemma Model Log Loss: {gemma_log_loss}")
print(f"Llama Model Log Loss: {llama_log_loss}")

# Calculate weights based on validation log loss
total_loss = gemma_log_loss + llama_log_loss
gemma_weight = llama_log_loss / total_loss
llama_weight = gemma_log_loss / total_loss

print(f"Calculated Weights - Gemma: {gemma_weight}, Llama: {llama_weight}")
"""
# Recorded results of the disabled computation above:
# Gemma Weight was calculated as: 0.4257644179
# Llama Weight was calculated as: 0.5742355821
# NOTE(review): these values are recorded as comments only; nothing in this
# cell assigns `gemma_weight` / `llama_weight` at runtime.

## GENERATE PREDICTIONS

In [None]:
st = time.time()

# Ensemble weights. These are the values recorded from the (disabled)
# validation run; they are assigned here because nothing else defines them at
# runtime. Update them here if the validation is re-run.
gemma_weight = 0.4257644179
llama_weight = 0.5742355821

# Sort each frame by its own token count (longest first) so that batches hold
# similarly sized sequences and padding is kept small.
gemma_data = gemma_data.sort_values("length", ascending=False)
llama_data = llama_data.sort_values("length", ascending=False)

# Run both models concurrently, one worker thread per model/GPU.
# executor.map yields results in submission order: Gemma first, then Llama.
with ThreadPoolExecutor(max_workers=2) as executor:
    gemma_result_df, llama_result_df = executor.map(
        inference,
        (gemma_data, llama_data),
        (gemma_model, llama_model),
        (gemma_tokenizer, llama_tokenizer),
        (device_0, device_1),
    )

print(f"elapsed time: {time.time() - st:.1f}s")

# Apply the weighted average to combine results.
# The two frames are sorted differently, but pandas arithmetic and column
# assignment align on the index labels inherited from `test`, so each row is
# combined with its own counterpart.
combined_result_df = gemma_result_df.copy()
for proba_column in ("winner_model_a", "winner_model_b", "winner_tie"):
    combined_result_df[proba_column] = (
        gemma_weight * gemma_result_df[proba_column]
        + llama_weight * llama_result_df[proba_column]
    )

In [None]:
# Keep the id plus the three probability columns, write them out and preview the result
submission_columns = ["id", 'winner_model_a', 'winner_model_b', 'winner_tie']
submission_df = combined_result_df[submission_columns]
submission_df.to_csv('submission.csv', index=False)
display(submission_df.head())