# Note
- [Training script](https://www.kaggle.com/code/shelterw/training-llama3-8b-4-bit-qlora-sft)

# Import

In [None]:
!pip install transformers -U --no-index --find-links /kaggle/input/lmsys-transformers/lmsys_transformers

In [None]:
!pip install peft accelerate bitsandbytes -U --no-index --find-links /kaggle/input/lmsys-wheel-files

In [None]:
import json
import time
import sklearn
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.cuda.amp import autocast
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
from threading import Thread
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig, DataCollatorForSeq2Seq
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, AutoModelForSequenceClassification
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType 
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers import BitsAndBytesConfig, LlamaForCausalLM, LlamaModel, LlamaPreTrainedModel
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from transformers import set_seed
# Allow PyTorch to pick the memory-efficient and flash kernels for
# scaled-dot-product attention.
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cuda.enable_flash_sdp(True)
# The cells below pin one model replica to cuda:0 and another to cuda:1.
# Raise explicitly instead of using `assert`, which is skipped under `python -O`.
if torch.cuda.device_count() != 2:
    raise RuntimeError(
        "Sorry - multi-GPU required! Expected exactly 2 CUDA devices, "
        f"found {torch.cuda.device_count()}."
    )
import warnings
# Silence all Python warnings for the rest of the notebook.
warnings.filterwarnings('ignore')

In [None]:
# NOTE(review): the path below is not used anywhere in the visible cells;
# presumably it is an alternative pre-quantized 4-bit base checkpoint.
# /kaggle/input/llama-3-8b-instruct-bnb-4bit/llama-3-8b-Instruct-bnb-4bit/config.json
# Base model directory handed to AutoModelForSequenceClassification.from_pretrained.
MODEL_NAME = '/kaggle/input/meta-llama-3-8b/LLM-Research/Meta-Llama-3-8B'
# Adapter checkpoint handed to PeftModel.from_pretrained.
# NOTE(review): the adapter/tokenizer directories are named "llama31" while
# MODEL_NAME points at Meta-Llama-3-8B — confirm the base model matches training.
WEIGHTS_PATH = '/kaggle/input/llama31-sample5500-cls/llama31-sample5500-cls/checkpoint-550'
# Tokenizer directory handed to AutoTokenizer.from_pretrained.
TOKENIZER_PATH = '/kaggle/input/llama31-sample5500-cls/llama31-sample5500-cls/tokenizer'
MAX_LENGTH = 2400  # token limit used for truncation in `tokenize`
BATCH_SIZE = 4  # default number of rows per forward pass in `inference`
DEVICE = torch.device("cuda")  # NOTE(review): not referenced in the visible cells

# Prepare Data 

In [None]:
def process_text(text: str) -> list:
    """Decode a JSON-encoded sequence and replace missing entries.

    Args:
        text: JSON document whose top-level value is iterated element by
            element (a JSON array in the data used here).

    Returns:
        A list with every element kept as decoded, except that elements for
        which ``pd.isna`` is true (e.g. JSON ``null``) become the string
        ``'none'``.
    """
    return ['none' if pd.isna(item) else item for item in json.loads(text)]


def merge_text(x):
    """Flatten a multi-turn conversation into one prompt string.

    Args:
        x: Mapping-like row with ``'prompt'``, ``'response_a'`` and
            ``'response_b'`` entries, each an indexable sequence of turns.

    Returns:
        One ``<prompt>: ...``, ``<response_a>: ...``, ``<response_b>: ...``
        section per prompt turn, with sections and fields separated by a
        blank line. An empty prompt list yields ``''``.

    Raises:
        IndexError: If a response sequence has fewer turns than ``prompt``.
    """
    prompt = x['prompt']
    response_a = x['response_a']
    response_b = x['response_b']
    # Responses are indexed by turn (rather than zipped) so that a response
    # list shorter than the prompt list still fails loudly.
    return '\n\n'.join(
        f'<prompt>: {turn}'
        f'\n\n<response_a>: {response_a[i]}'
        f'\n\n<response_b>: {response_b[i]}'
        for i, turn in enumerate(prompt)
    )

In [None]:
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

# When the test file has exactly 3 rows, run on the first 100 training rows
# instead (presumably the 3-row file is a placeholder — confirm).
# `.copy()` gives an independent frame so the column assignments below do not
# write into a slice of `train`.
if test.shape[0] == 3:
    test = train.head(100).copy()


# Each of these columns holds a JSON-encoded list of turns; decode it in place.
for column in ('prompt', 'response_a', 'response_b'):
    test[column] = test[column].map(process_text)

# One flattened conversation string per row, used as the model input.
test['text'] = test.apply(merge_text, axis=1)
test.head()

# Tokenize

In [None]:
def tokenize(tokenizer, x):
    """Tokenize ``x`` with truncation at ``MAX_LENGTH`` tokens.

    Args:
        tokenizer: Callable tokenizer accepting ``max_length`` and
            ``truncation`` keyword arguments.
        x: Text input(s) forwarded unchanged to ``tokenizer``.

    Returns:
        A ``(input_ids, attention_mask)`` tuple taken from the tokenizer
        output; no padding is requested here.
    """
    encoded = tokenizer(x, truncation=True, max_length=MAX_LENGTH)
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

In [None]:
llama31_tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Tokenize every merged conversation once, up front; padding is applied later,
# per batch, inside `inference`.
token_ids, token_masks = tokenize(llama31_tokenizer, test['text'].tolist())

# Feature frame: one row per test example, plus the token count.
llama31_data = pd.DataFrame()
llama31_data["id"] = test["id"]
llama31_data["input_ids"] = token_ids
llama31_data["attention_mask"] = token_masks
llama31_data["length"] = llama31_data["input_ids"].map(len)

# Load model 
We load one copy of the model on each GPU.

In [None]:
# 8-bit bitsandbytes quantization. BitsAndBytesConfig defines compute-dtype and
# double-quant options only for 4-bit loading (`bnb_4bit_*`); the `bnb_8bit_*`
# keyword arguments previously passed here match no parameter and are
# discarded as unused kwargs, so they are omitted.
bnb_config = BitsAndBytesConfig(load_in_8bit=True)


def _load_base_classifier(device):
    """Load the quantized 3-label sequence classifier onto ``device``.

    Args:
        device: ``torch.device`` the whole model is placed on (``device_map``).

    Returns:
        The model loaded from ``MODEL_NAME`` with its ``pad_token_id`` set to
        the tokenizer's pad token id.
    """
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=3,
        torch_dtype=torch.float16,
        quantization_config=bnb_config,
        use_cache=False,
        device_map=device,
    )
    # The classification head uses pad_token_id to find each sequence's last
    # non-padding token in a padded batch.
    model.config.pad_token_id = llama31_tokenizer.pad_token_id
    return model


device_0 = torch.device('cuda:0')
llama31_model_0 = _load_base_classifier(device_0)

device_1 = torch.device('cuda:1')
llama31_model_1 = _load_base_classifier(device_1)


# NOTE: this config is not applied to the models in this cell; the lines that
# would apply it are commented out below.
lora_config = LoraConfig(
    r=4,
    lora_alpha=8,
    # targets the attention projections (q/k/v/o) and the MLP projections
    # (gate/up/down)
    target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'],
    # layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
    lora_dropout=0,
    bias='none',
    task_type=TaskType.SEQ_CLS,
)

# llama31_model_0 = prepare_model_for_kbit_training(llama31_model_0)
# llama31_model_0 = get_peft_model(llama31_model_0, lora_config)
# llama31_model_0.print_trainable_parameters()

# llama31_model_1 = prepare_model_for_kbit_training(llama31_model_1)
# llama31_model_1 = get_peft_model(llama31_model_1, lora_config)
# llama31_model_1.print_trainable_parameters()

# Load weights 

In [None]:
# Wrap each base model with the adapter stored at WEIGHTS_PATH, keep it on its
# own GPU, and switch it to evaluation mode.
llama31_model_0 = PeftModel.from_pretrained(llama31_model_0, model_id=WEIGHTS_PATH)
llama31_model_0 = llama31_model_0.to(device_0)
llama31_model_0.eval()

llama31_model_1 = PeftModel.from_pretrained(llama31_model_1, model_id=WEIGHTS_PATH)
llama31_model_1 = llama31_model_1.to(device_1)
# Trailing semicolon keeps the notebook from printing the model repr.
llama31_model_1.eval();

# Inference


In [None]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, tokenizer, device, batch_size=BATCH_SIZE, max_length=MAX_LENGTH):
    """Score every row of ``df`` with ``model`` and attach class probabilities.

    Rows are processed in order in chunks of ``batch_size``; each chunk is
    padded to its longest sequence, moved to ``device`` and run through the
    model under ``no_grad`` and CUDA autocast.

    Args:
        df: DataFrame with ``"input_ids"`` and ``"attention_mask"`` columns
            holding per-row token lists. Modified in place.
        model: Callable returning an object with a ``logits`` attribute whose
            last axis has at least three entries.
        tokenizer: Tokenizer used for padding.
        device: Device the padded batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.
        max_length: Unused by this function; kept for interface compatibility.

    Returns:
        The same ``df`` with ``"winner_model_a"``, ``"winner_model_b"`` and
        ``"winner_tie"`` columns set to the softmax probabilities of logit
        indices 0, 1 and 2 respectively.
    """
    columns = ("winner_model_a", "winner_model_b", "winner_tie")
    scores = {name: [] for name in columns}

    for lo in range(0, len(df), batch_size):
        # iloc slicing clamps at the end of the frame, so the last chunk may
        # simply be shorter.
        batch = df.iloc[lo:lo + batch_size]
        features = {
            "input_ids": batch["input_ids"].to_list(),
            "attention_mask": batch["attention_mask"].to_list(),
        }
        padded = pad_without_fast_tokenizer_warning(
            tokenizer,
            features,
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        logits = model(**padded.to(device)).logits
        probs = logits.softmax(-1).cpu()

        for col_idx, name in enumerate(columns):
            scores[name].extend(probs[:, col_idx].tolist())

    for name in columns:
        df[name] = scores[name]

    return df

In [None]:
st = time.time()

# Order rows from longest to shortest, then deal them out alternately so both
# halves get a similar mix of sequence lengths.
llama31_data = llama31_data.sort_values("length", ascending=False).reset_index(drop=True)
sub_1 = llama31_data.iloc[0::2].copy()
sub_2 = llama31_data.iloc[1::2].copy()

# One job per GPU: (rows, model, tokenizer, device).
jobs = [
    (sub_1, llama31_model_0, llama31_tokenizer, device_0),
    (sub_2, llama31_model_1, llama31_tokenizer, device_1),
]

# Leaving the `with` block waits for both worker threads to finish.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(lambda job: inference(*job), jobs)

result_df = pd.concat(list(results), axis=0)
proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].to_numpy()

print(f"elapsed time: {time.time() - st}")

In [None]:
# Write the probability matrix back into the result frame column by column,
# then export the id plus the three probabilities.
submission_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
for col_idx, col_name in enumerate(submission_columns):
    result_df.loc[:, col_name] = proba[:, col_idx]

submission_df = result_df[["id"] + submission_columns]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)