In [None]:
!pip install -U /kaggle/input/bitsandbytes-0-42-0-py3-none-any-whl/bitsandbytes-0.42.0-py3-none-any.whl -qq
!pip install -U /kaggle/input/peft-wheel/pytorch/version1/1/peft-0.10.0-py3-none-any.whl -qq

In [None]:
import pandas as pd
import numpy as np
from scipy.special import softmax
from transformers import AutoTokenizer, AutoModelForSequenceClassification,LlamaForSequenceClassification, BitsAndBytesConfig
from peft import PeftModel, PeftConfig
import torch
from torch.cuda.amp import autocast
from datasets import Dataset
import torch.nn.functional as F
from threading import Thread
import gc

In [None]:
# Allow PyTorch's memory-efficient and flash scaled-dot-product-attention backends.
torch.backends.cuda.enable_mem_efficient_sdp(True)
torch.backends.cuda.enable_flash_sdp(True)

# Directory of the base model checkpoint; used for both the tokenizer and the models.
MODEL_NAME = "/kaggle/input/llama-3/transformers/8b-hf/1"
# Token length every text is padded/truncated to in tokenize_function.
MAX_LENGTH = 1284
# Default number of rows per forward pass in inference().
BATCH_SIZE = 4

In [None]:
# Load the test set; later cells read its `prompt`, `response_a` and
# `response_b` columns and add derived columns to this same frame.
df = pd.read_csv("/kaggle/input/lmsys-chatbot-arena/test.csv")
df.head()

In [None]:
def transform(row):
    """Return ``row`` with any leading/trailing ``[`` and ``]`` characters removed.

    Only the two ends of the string are trimmed; brackets that appear in the
    middle of the text are kept as they are.
    """
    bracket_chars = '[]'
    trimmed = row.strip(bracket_chars)
    return trimmed

In [None]:
# Trim the surrounding square brackets from each of the three text columns.
for column in ('prompt', 'response_a', 'response_b'):
    df[column] = df[column].apply(transform)

In [None]:
# Assemble the single string fed to the classifier: the prompt, then response A,
# a dashed separator, then response B.
# NOTE(review): 'Model A :' has a space before the colon while 'Model B:' does
# not; presumably this mirrors the template used at training time — confirm
# before normalizing, since the wording is part of the model input.
df['text'] = 'User prompt: ' + df['prompt'] +  '\n\nModel A :\n' + df['response_a'] +'\n\n----------\n\nModel B:\n'  + df['response_b']

In [None]:
# (disabled) reading the adapter config is not needed for tokenization:
# peft_model_id = "/kaggle/input/lmsys-llama-lora/pytorch/version1/1"
# peft_config = PeftConfig.from_pretrained(peft_model_id)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Reuse the EOS token as the padding token (both the id and the token string)
# so that padding="max_length" can be used in tokenize_function.
# Presumably the checkpoint's tokenizer ships without a pad token — confirm.
tokenizer.pad_token_id = tokenizer.eos_token_id
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(df):
    """Tokenize ``df`` to a fixed length and return ``(input_ids, attention_mask)``.

    Despite its name, ``df`` is whatever is handed to the tokenizer; the caller
    applies this function element-wise to the ``text`` column, so it receives
    one string at a time. The output is padded and truncated to ``MAX_LENGTH``.
    """
    encoded = tokenizer(
        df,
        max_length=MAX_LENGTH,
        truncation=True,
        padding="max_length",
    )
    input_ids = encoded['input_ids']
    attention_mask = encoded['attention_mask']
    return input_ids, attention_mask

In [None]:
# Tokenize each text once; every element of `temp` is the
# (input_ids, attention_mask) tuple returned by tokenize_function.
temp = df['text'].apply(tokenize_function)
df['input_ids'] = temp.apply(lambda x: x[0])
# Element 1 is the attention mask. Taking element 0 here would copy the token
# ids into the mask column, so the model would be given token ids as its mask.
df['attention_mask'] = temp.apply(lambda x: x[1])

In [None]:
# Preview only the first rows: the frame now carries two MAX_LENGTH-long token
# list columns, and the earlier load cell already previews with .head().
df.head()

In [None]:
# data = Dataset.from_pandas(df[['text']])
# data = data.map(tokenize_function, batched=True)
# data.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [None]:
# data

In [None]:
# One device handle per GPU; each one gets its own copy of the model.
device0 = torch.device('cuda:0')
device1 = torch.device('cuda:1')

# Quantization settings shared by both model copies: load weights in 8-bit.
# NOTE(review): `bnb_8bit_compute_dtype` and `bnb_8bit_use_double_quant` do not
# appear among the documented BitsAndBytesConfig parameters (the documented
# ones are the `bnb_4bit_*` variants); presumably they are ignored — confirm.
bnb_config =  BitsAndBytesConfig(
    load_in_8bit=True,
    bnb_8bit_compute_dtype=torch.float16,
    bnb_8bit_use_double_quant=False)

In [None]:
# Directory of the LoRA adapter that is loaded on top of each base model copy.
peft_model_id = "/kaggle/input/lmsys-llama-lora/pytorch/version1/1"

In [None]:
# Model copy for GPU 0: the base checkpoint with a 3-label classification head,
# loaded with the shared quantization config and placed on cuda:0.
model_0 = LlamaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,  # logits 0/1/2 are read as winner_model_a / winner_model_b / winner_tie in inference()
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map='cuda:0')
# Keep the model config's pad id in sync with the tokenizer's pad token.
model_0.config.pad_token_id = tokenizer.pad_token_id

# Attach the LoRA adapter, fold it into the base weights, and switch to eval mode.
model_0 = PeftModel.from_pretrained(model_0, peft_model_id).to(device0)
model_0 = model_0.merge_and_unload()
model_0.eval()

In [None]:
# Model copy for GPU 1: same checkpoint, head size and quantization config as
# the cuda:0 copy, but placed on cuda:1 so both GPUs can run in parallel.
model_1 = LlamaForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=3,  # logits 0/1/2 are read as winner_model_a / winner_model_b / winner_tie in inference()
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
    device_map='cuda:1')
# Keep the model config's pad id in sync with the tokenizer's pad token.
model_1.config.pad_token_id = tokenizer.pad_token_id

# Attach the LoRA adapter, fold it into the base weights, and switch to eval mode.
model_1 = PeftModel.from_pretrained(model_1, peft_model_id).to(device1)
model_1 = model_1.merge_and_unload()
model_1.eval()

In [None]:
# Run a garbage-collection pass before inference starts.
gc.collect()

In [None]:
def inference(df,model,device,batch_size=BATCH_SIZE):
    """Score every row of ``df`` with ``model`` and attach class probabilities.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``input_ids`` and ``attention_mask`` columns whose cells
        are equal-length lists of ints. The frame is modified in place.
    model : callable
        Called as ``model(input_ids=..., attention_mask=...)``; its result must
        expose ``.logits`` with at least three entries on the last dimension.
    device : torch.device
        Device the batch tensors are moved to before the forward pass.
    batch_size : int
        Number of rows per forward pass.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns holding softmax columns 0, 1 and 2.
    """
    # An empty frame has nothing to score. Return it with empty probability
    # columns instead of failing on the (never produced) batch results.
    if len(df) == 0:
        for column in ('winner_model_a', 'winner_model_b', 'winner_tie'):
            df[column] = np.empty(0, dtype=np.float32)
        return df

    batch_probabilities = []
    for start_idx in range(0, len(df), batch_size):
        end_idx = min(start_idx + batch_size, len(df))

        # Positional slicing: the frame may carry a non-zero-based index.
        batch_input_ids = torch.tensor(df['input_ids'].iloc[start_idx:end_idx].tolist()).to(device)
        batch_attention_mask = torch.tensor(df['attention_mask'].iloc[start_idx:end_idx].tolist()).to(device)

        with torch.no_grad():
            with autocast():
                outputs = model(
                    input_ids=batch_input_ids,
                    attention_mask=batch_attention_mask)
        # Upcast before the softmax so the probabilities are computed and
        # stored in float32 rather than in the model's half precision.
        probabilities = F.softmax(outputs.logits.float(), dim=-1)
        batch_probabilities.append(probabilities.cpu().numpy())

        # Drop the per-batch device tensors before the next iteration.
        del batch_input_ids, batch_attention_mask, outputs, probabilities

    gc.collect()
    torch.cuda.empty_cache()

    all_probabilities = np.concatenate(batch_probabilities, axis=0)

    df['winner_model_a'] = all_probabilities[:, 0]
    df['winner_model_b'] = all_probabilities[:, 1]
    df['winner_tie'] = all_probabilities[:, 2]
    return df

In [None]:
# Split the rows into two contiguous parts, one per GPU. Each part is a copy,
# so columns added to it later do not touch `df`.
N_SAMPLES = len(df)
half = round(N_SAMPLES / 2)

sub1, sub2 = df.iloc[:half].copy(), df.iloc[half:].copy()

In [None]:
# Install a global filter that suppresses all Python warnings from here on.
import warnings
warnings.filterwarnings("ignore")

In [None]:
def run_inference(df, model, device, results, index):
    """Thread target: run ``inference`` and store its result in ``results[index]``.

    A thread cannot hand back a return value, so the scored frame is written
    into the shared ``results`` mapping under the caller-chosen ``index``.
    """
    scored = inference(df, model, device)
    results[index] = scored

In [None]:
# Run both halves concurrently, one thread per GPU; each worker stores its
# scored frame in `results` under its own key.
results = {}
t0 = Thread(target=run_inference, args=(sub1, model_0, device0,results, 0))
t1 = Thread(target=run_inference, args=(sub2, model_1, device1,results, 1))

t0.start()
t1.start()

# Wait for all threads to finish
t0.join()
t1.join()

# An exception raised inside a worker is not re-raised here; the worker just
# never stores its result. Fail with a clear message instead of a bare KeyError.
missing = [key for key in (0, 1) if key not in results]
if missing:
    raise RuntimeError(
        f"Inference did not produce a result for worker(s) {missing}; "
        "see the worker traceback above.")

data = pd.concat([results[0], results[1]], axis=0)

In [None]:
# Remove the raw text and token columns, leaving the remaining columns
# (including the three probability columns) for the submission file.
data = data.drop(
    columns=['prompt', 'response_a', 'response_b', 'text', 'input_ids', 'attention_mask'],
)

In [None]:
# Preview the first rows of the submission frame instead of dumping all of it.
data.head()

In [None]:
# Write the predictions to submission.csv, leaving out the DataFrame index.
data.to_csv("submission.csv",index=False)