# Apply SMOTE to the imbalanced data in train.csv

In [None]:
%%time

import pandas as pd
import numpy as np

# Load the competition's train, test and sample-submission CSVs from the Kaggle input directory.
train = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/train.csv')
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')
sample_submission = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/sample_submission.csv')

The training and inference methods are adapted from https://www.kaggle.com/code/emiz6413/training-gemma-2-9b-4-bit-qlora-fine-tuning?scriptVersionId=187770530, but I changed them to use my own class-balanced data with the Gemma model.

In [None]:
%%time

# Preview the first 51 training rows.
train.head(51)

In [None]:
%%time

# Preview the first 2 test rows.
test.head(2)

In [None]:
%%time

# Build the working frame from the labelled training rows only.
# This used to be pd.concat([train, test]). test.csv carries no winner_* labels
# (they are what the submission predicts), so its rows entered the frame with
# NaN targets, were filled with 0 by the next cell, and were then saved into
# the fine-tuning CSV, where the tokenizer labels an all-zero row as a tie.
# A copy is taken so the later in-place edits do not touch `train`.
combined_df = train.copy()

In [None]:
%%time

from imblearn.over_sampling import SMOTE

# The three indicator columns that one-hot encode the outcome of a comparison.
target_cols = ['winner_model_a', 'winner_model_b', 'winner_tie']

# A missing indicator is treated as "no win" (0).
for col in target_cols:
    combined_df[col] = combined_df[col].fillna(0)

# Collapse the indicators into a single integer class label for SMOTE:
# idxmax yields the name of the column holding the row maximum, which is then
# mapped to its class id. The helper column stays on combined_df.
winner_mapping = {'winner_model_a': 0, 'winner_model_b': 1, 'winner_tie': 2}
combined_df['winner'] = combined_df[target_cols].idxmax(axis=1).map(winner_mapping)

# Features are every column except the label and its one-hot source columns.
smote = SMOTE()
X = combined_df.drop(columns=['winner'] + target_cols)
y = combined_df['winner']


def _to_category_codes(col):
    """Return the integer category codes of an object-dtype column; pass other columns through."""
    if col.dtype == 'object':
        return col.astype('category').cat.codes
    return col


# NOTE(review): the text columns are reduced to arbitrary integer codes here,
# and SMOTE synthesises rows by interpolating between feature vectors --
# confirm that the synthetic rows are meaningful for this data.
X_encoded = X.apply(_to_category_codes)

# Oversample so that every class is equally represented.
X_resampled, y_resampled = smote.fit_resample(X_encoded, y)

# Rebuild a frame with the original feature column names.
resampled_df = pd.DataFrame(X_resampled, columns=X.columns)
resampled_df['winner'] = y_resampled

# Turn the class ids back into column names, then into the three indicators.
inverse_winner_mapping = {code: name for name, code in winner_mapping.items()}
winner_names = resampled_df['winner'].map(inverse_winner_mapping)
for col in target_cols:
    resampled_df[col] = (winner_names == col).astype(int)

# The helper label column is not part of the original schema.
# NOTE(review): in the visible cells resampled_df is only previewed; the frame
# written to disk later is combined_df -- confirm which one should be trained on.
resampled_df = resampled_df.drop(columns=['winner'])

In [None]:
%%time

# Preview the first 10 rows of the SMOTE-resampled frame.
resampled_df.head(10)

In [None]:
%%time

# Fill the remaining gaps with each column's most frequent value.
# The result is assigned back to the frame instead of calling
# fillna(inplace=True) on the combined_df[col] selection: the in-place form
# operates on an intermediate object (chained assignment), which pandas warns
# about and which leaves combined_df unchanged under Copy-on-Write.
for col in ['model_a', 'model_b', 'winner_model_a', 'winner_model_b', 'winner_tie']:
    combined_df[col] = combined_df[col].fillna(combined_df[col].mode()[0])

# Count the values that are still missing, per column.
missing_values = combined_df.isnull().sum()

# Display the per-column counts.
missing_values

In [None]:
# Write combined_df to combined_df.csv in the working directory (without the
# index column); the training section later reads this file back.
# At this point the frame still carries the helper 'winner' column added above.
combined_df.to_csv('combined_df.csv', index=False)

# Why did I apply SMOTE to handle the class imbalance?

The dataset needs to address class imbalance because an imbalanced dataset can lead to biased model performance. When a machine learning model is trained on an imbalanced dataset, it tends to become biased towards the majority class, resulting in poor performance on the minority classes. This can significantly impact the model's ability to generalize well to new, unseen data.

In this specific dataset, the columns related to the winners (winner_model_a, winner_model_b, winner_tie) are imbalanced. Here's a breakdown of the issue:

winner_model_a: This column indicates whether model A won. If the count of 1s (indicating a win for model A) is much lower or higher compared to the other winner columns, it creates an imbalance.

winner_model_b: This column indicates whether model B won. Similarly, if the count of 1s in this column is disproportionate to the others, it contributes to imbalance.

winner_tie: This column indicates whether the result was a tie. A significantly lower count of 1s here compared to the others further highlights the imbalance.

From the initial analysis, we observed that the winner_tie column had fewer instances compared to the other two columns, suggesting an imbalance in how often ties occur relative to wins by either model A or model B. This imbalance can skew the model's learning process, making it less effective at predicting ties. Addressing this imbalance using techniques like SMOTE (Synthetic Minority Over-sampling Technique) helps to ensure that the model has a more balanced view of all possible outcomes, leading to better overall performance.

# Use Gemma Model

In [None]:
%%time

# gemma-2 is available from transformers>=4.42.3
#!pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft

In [None]:
%%time

import os
import copy
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

# Configurations

In [None]:
%%time

@dataclass
class Config:
    """Settings for the LoRA fine-tuning run on the 4-bit Gemma-2 checkpoint."""
    output_dir: str = "output"
    checkpoint: str = "unsloth/gemma-2-9b-it-bnb-4bit"  # 4-bit quantized gemma-2-9b-instruct
    max_length: int = 1024  # truncation length handed to CustomTokenizer
    n_splits: int = 5  # number of folds built from the dataset indices
    fold_idx: int = 0  # which fold is held out for evaluation
    optim_type: str = "adamw_8bit"
    per_device_train_batch_size: int = 2
    # 2 samples x 2 accumulation steps = 4 samples per optimizer step per device.
    # NOTE(review): the previous comment claimed a global batch size of 8 -- confirm the device count.
    gradient_accumulation_steps: int = 2
    per_device_eval_batch_size: int = 8
    n_epochs: int = 1
    freeze_layers: int = 16  # there're 42 layers in total, we don't add adapters to the first 16 layers
    lr: float = 2e-4
    warmup_steps: int = 20
    lora_r: int = 16
    # Evaluated once when the class body runs, so overriding lora_r on an
    # instance does not change this default.
    lora_alpha: float = lora_r * 2
    lora_dropout: float = 0.05
    lora_bias: str = "none"

# Single shared instance read by the training cells below.
config = Config()

# Training Arguments

In [None]:
%%time

# Trainer settings. Every tunable value comes from `config`, including
# output_dir, which used to be a second hard-coded copy of "output".
training_args = TrainingArguments(
    output_dir=config.output_dir,
    overwrite_output_dir=True,
    report_to="none",  # no external experiment tracker
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    logging_steps=10,
    eval_strategy="epoch",  # evaluate once per epoch
    save_strategy="steps",  # checkpoint every `save_steps` optimizer steps
    save_steps=200,
    optim=config.optim_type,
    fp16=True,
    learning_rate=config.lr,
    warmup_steps=config.warmup_steps,
)

# LoRA config

In [None]:
%%time

# Layer indices that receive adapters: everything from config.freeze_layers up
# to (but excluding) 42, i.e. the first `freeze_layers` layers are left alone.
adapted_layers = list(range(max(config.freeze_layers, 0), 42))

lora_config = LoraConfig(
    task_type=TaskType.SEQ_CLS,
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    # only target self-attention
    target_modules=["q_proj", "k_proj", "v_proj"],
    layers_to_transform=adapted_layers,
)

# Instantiate the tokenizer & model

In [None]:
%%time

# Load the tokenizer that ships with the training checkpoint.
tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
tokenizer.padding_side = "right"  # pad on the right-hand side of each sequence

In [None]:
%%time

# Load the checkpoint as a 3-class sequence classifier (one class per outcome:
# model A wins, model B wins, tie).
model = Gemma2ForSequenceClassification.from_pretrained(
    config.checkpoint,
    num_labels=3,
    torch_dtype=torch.float16,
    device_map="auto",
)
model.config.use_cache = False
# Prepare the quantized model for training, then attach the LoRA adapters
# described by lora_config.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Last expression of the cell: shows the wrapped model.
model

In [None]:
%%time

# Report how many parameters are trainable after wrapping the model with LoRA.
model.print_trainable_parameters()

# Load Train File

In [None]:
%%time

# Read back the CSV written by the data-preparation section.
ds = Dataset.from_csv("/kaggle/working/combined_df.csv")
# Demo only: keep at most the first 100 rows. The bound is clamped to the
# dataset size so a file with fewer than 100 rows does not make select() fail,
# and a plain range is passed instead of a torch tensor of indices.
ds = ds.select(range(min(100, len(ds))))

In [None]:
%%time

class CustomTokenizer:
    """Callable that turns a batch of raw rows into tokenized inputs plus labels.

    Each row's prompt and the two responses are decoded from their JSON list
    form, joined into a single text of the shape
    ``<prompt>: ...\\n\\n<response_a>: ...\\n\\n<response_b>: ...`` and tokenized
    with truncation to ``max_length``.
    """

    def __init__(
        self,
        # Quoted so the class can be defined without the name being resolved
        # at definition time.
        tokenizer: "PreTrainedTokenizerBase",
        max_length: int
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __call__(self, batch: dict) -> dict:
        """Tokenize one batch.

        Args:
            batch: mapping with the columns ``prompt``, ``response_a``,
                ``response_b``, ``winner_model_a`` and ``winner_model_b``,
                each a sequence with one entry per row.

        Returns:
            The tokenizer output extended with a ``labels`` list:
            0 if model A won, 1 if model B won, 2 otherwise.
        """
        prompt = ["<prompt>: " + self.process_text(t) for t in batch["prompt"]]
        response_a = ["\n\n<response_a>: " + self.process_text(t) for t in batch["response_a"]]
        response_b = ["\n\n<response_b>: " + self.process_text(t) for t in batch["response_b"]]
        texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
        tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)
        labels = []
        for a_win, b_win in zip(batch["winner_model_a"], batch["winner_model_b"]):
            if a_win:
                label = 0
            elif b_win:
                label = 1
            else:
                # Neither indicator set: treated as a tie.
                label = 2
            labels.append(label)
        return {**tokenized, "labels": labels}

    @staticmethod
    def process_text(text: str) -> str:
        """Decode a JSON-encoded list of turns and join the turns with spaces.

        ``null`` entries become empty strings. The text is parsed with
        ``json.loads`` rather than ``eval``: it never executes dataset content,
        and it decodes JSON escapes such as surrogate pairs into real
        characters instead of leaving lone surrogates behind.

        Raises:
            json.JSONDecodeError: if ``text`` is not valid JSON.
        """
        import json  # local import keeps this block self-contained

        return " ".join("" if turn is None else turn for turn in json.loads(text))

In [None]:
%%time

# Tokenize the whole dataset batch by batch; the callable receives a dict of
# columns and returns the tokenized fields plus integer labels.
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
ds = ds.map(encode, batched=True)

# Compute metric logloss

In [None]:
%%time

def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute accuracy and log loss from the evaluation logits.

    Args:
        eval_preds: evaluation output; ``predictions`` holds the raw logits
            (one column per class) and ``label_ids`` the integer targets.

    Returns:
        ``{"acc": accuracy, "log_loss": multi-class log loss}``.
    """
    logits = eval_preds.predictions
    labels = eval_preds.label_ids
    probs = torch.from_numpy(logits).float().softmax(-1).numpy()
    # Name every class explicitly. Without `labels`, log_loss infers the class
    # set from y_true and raises when a small evaluation fold happens to
    # contain fewer classes than the model has outputs.
    class_ids = list(range(probs.shape[-1]))
    loss = log_loss(y_true=labels, y_pred=probs, labels=class_ids)
    acc = accuracy_score(y_true=labels, y_pred=logits.argmax(-1))
    return {"acc": acc, "log_loss": loss}

In [None]:
%%time

# Round-robin folds: row i belongs to fold (i % n_splits). Each entry of
# `folds` is (training indices, evaluation indices) for one held-out fold.
n_rows = len(ds)
folds = []
for held_out in range(config.n_splits):
    eval_rows = list(range(held_out, n_rows, config.n_splits))
    train_rows = [i for i in range(n_rows) if i % config.n_splits != held_out]
    folds.append((train_rows, eval_rows))

In [None]:
%%time

# Pick the train/eval index lists of the configured fold.
train_idx, eval_idx = folds[config.fold_idx]

# The collator pads each batch with the tokenizer; compute_metrics supplies
# accuracy and log loss at evaluation time.
trainer = Trainer(
    args=training_args, 
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds.select(train_idx),
    eval_dataset=ds.select(eval_idx),
    compute_metrics=compute_metrics,
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
trainer.train()

# Inference

In [None]:
%%time

# The inference cells below place one model copy on cuda:0 and one on cuda:1,
# so stop early unless exactly two GPUs are visible.
assert torch.cuda.device_count() == 2

In [None]:
%%time

@dataclass
class Config:
    """Inference-time settings.

    Note that this definition replaces the training ``Config`` class declared
    earlier in the notebook. Every attribute is annotated so that ``@dataclass``
    actually turns it into a field: values can be overridden through the
    constructor and show up in the repr. Without annotations the decorator
    generates no fields at all.
    """
    # Directory of the 4-bit base model.
    gemma_dir: str = '/kaggle/input/gemma-2/transformers/gemma-2-9b-it-4bit/1/gemma-2-9b-it-4bit'
    # Directory of the LoRA adapter checkpoint written by the training run.
    lora_dir: str = '/kaggle/working/output/checkpoint-20'
    max_length: int = 2048
    batch_size: int = 4
    device: torch.device = torch.device("cuda")
    tta: bool = False  # test time augmentation. <prompt>-<model-b's response>-<model-a's response>
    spread_max_length: bool = False  # whether to apply max_length//3 on each input or max_length on the concatenated input

cfg = Config()

In [None]:
%%time

# Re-read the test CSV for the inference section.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

In [None]:
%%time

def process_text(text: str) -> str:
    """Decode a JSON-encoded list of turns and join the turns with spaces.

    ``null`` entries become empty strings. The text is parsed with
    ``json.loads`` rather than ``eval``: it never executes the content of the
    CSV, and it decodes JSON escapes such as surrogate pairs into real
    characters instead of leaving lone surrogates behind.

    Raises:
        json.JSONDecodeError: if ``text`` is not valid JSON.
    """
    import json  # local import keeps this block self-contained

    return " ".join("" if turn is None else turn for turn in json.loads(text))

# Replace the three text columns with their decoded, space-joined form.
test.loc[:, 'prompt'] = test['prompt'].apply(process_text)
test.loc[:, 'response_a'] = test['response_a'].apply(process_text)
test.loc[:, 'response_b'] = test['response_b'].apply(process_text)

# Show the first 5 decoded rows.
display(test.head(5))

In [None]:
%%time

def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    """Tokenize prompt/response triples without padding.

    Args:
        tokenizer: callable tokenizer whose output exposes ``input_ids`` and
            ``attention_mask``.
        prompt, response_a, response_b: equally long iterables of strings.
        max_length: token budget used for truncation.
        spread_max_length: if true, each of the three parts is tokenized on its
            own with a budget of ``max_length // 3`` and the id lists are
            concatenated; otherwise the joined text is tokenized once with the
            full budget.

    Returns:
        ``(input_ids, attention_mask)``, one entry per triple.
    """
    prompt_texts = ["<prompt>: " + text for text in prompt]
    a_texts = ["\n\n<response_a>: " + text for text in response_a]
    b_texts = ["\n\n<response_b>: " + text for text in response_b]

    if not spread_max_length:
        joined = [p + a + b for p, a, b in zip(prompt_texts, a_texts, b_texts)]
        encoded = tokenizer(joined, max_length=max_length, truncation=True, padding=False)
        return encoded.input_ids, encoded.attention_mask

    # Tokenize each part separately so no single part can use up the budget.
    per_part = [
        tokenizer(part, max_length=max_length//3, truncation=True, padding=False).input_ids
        for part in (prompt_texts, a_texts, b_texts)
    ]
    input_ids = [p + a + b for p, a, b in zip(*per_part)]
    # Nothing is padded, so every position is attended to.
    attention_mask = [[1] * len(ids) for ids in input_ids]
    return input_ids, attention_mask

In [None]:
%%time

# Tokenizer of the inference base model.
tokenizer = GemmaTokenizerFast.from_pretrained(cfg.gemma_dir)
tokenizer.padding_side = "right"
tokenizer.add_eos_token = True


def _tokenized_frame(first_response, second_response):
    """Return a frame with id, input_ids, attention_mask and token length per test row."""
    frame = pd.DataFrame()
    frame["id"] = test["id"]
    frame["input_ids"], frame["attention_mask"] = tokenize(
        tokenizer, test["prompt"], first_response, second_response
    )
    frame["length"] = frame["input_ids"].apply(len)
    return frame


# Regular order: response_a first, response_b second.
data = _tokenized_frame(test["response_a"], test["response_b"])
# Augmented order: swap response_a & response_b
aug_data = _tokenized_frame(test["response_b"], test["response_a"])

In [None]:
%%time

# Decode the first tokenized input back to text as a sanity check.
print(tokenizer.decode(data["input_ids"][0]))

In [None]:
%%time

# Same check for the input with the two responses swapped.
print(tokenizer.decode(aug_data["input_ids"][0]))

In [None]:
%%time

# One device handle per GPU.
device_0 = torch.device('cuda:0')
device_1 = torch.device('cuda:1')


def _load_base_model(device):
    """Load the base classifier from cfg.gemma_dir onto `device` with the KV cache disabled."""
    return Gemma2ForSequenceClassification.from_pretrained(
        cfg.gemma_dir,
        device_map=device,
        use_cache=False,
    )


# One independent copy of the base model on each GPU.
model_0 = _load_base_model(device_0)
model_1 = _load_base_model(device_1)

In [None]:
%%time

from peft import PeftModel

# Attach the adapter weights stored in cfg.lora_dir to both base-model copies.
model_0 = PeftModel.from_pretrained(model_0, cfg.lora_dir)
model_1 = PeftModel.from_pretrained(model_1, cfg.lora_dir)

In [None]:
%%time

@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    """Run `model` over `df` in batches and store the class probabilities on it.

    Args:
        df: frame with ``input_ids`` and ``attention_mask`` columns holding
            unpadded token lists; it is modified in place.
        model: classifier whose output exposes ``logits``.
        device: device the padded batch is moved to before the forward pass.
        batch_size: number of rows per forward pass.
        max_length: accepted for signature compatibility; not read here.

    Returns:
        The same ``df`` with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` columns added (softmax over the logits).
    """
    batch_probs = []

    for offset in range(0, len(df), batch_size):
        # iloc clips at the end of the frame, so the last batch may be shorter.
        chunk = df.iloc[offset:offset + batch_size]
        features = {
            "input_ids": chunk["input_ids"].to_list(),
            "attention_mask": chunk["attention_mask"].to_list(),
        }
        # Pad only up to the longest sequence of this batch.
        padded = pad_without_fast_tokenizer_warning(
            tokenizer,
            features,
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        logits = model(**padded.to(device)).logits
        batch_probs.append(logits.softmax(-1).cpu())

    # Column k of every batch holds the probability of class k.
    for class_idx, column in enumerate(("winner_model_a", "winner_model_b", "winner_tie")):
        df[column] = [p for probs in batch_probs for p in probs[:, class_idx].tolist()]

    return df

In [None]:
%%time

import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor
from transformers.data.data_collator import pad_without_fast_tokenizer_warning

# Wall-clock start of the inference pass.
st = time.time()

# sort by input length to fully leverage dynamic padding
data = data.sort_values("length", ascending=False)
# the total #tokens in sub_1 and sub_2 should be more or less the same
# (alternate rows of the sorted frame go to each GPU)
sub_1 = data.iloc[0::2].copy()
sub_2 = data.iloc[1::2].copy()

# One worker thread per GPU: sub_1 runs on model_0/cuda:0, sub_2 on model_1/cuda:1.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

# Rows are now ordered sub_1 first, then sub_2 -- not in the original test order.
result_df = pd.concat(list(results), axis=0)
# Probability matrix in result_df row order, columns: model A, model B, tie.
proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

In [None]:
%%time

# Wall-clock start of the optional test-time-augmentation pass.
st = time.time()

if cfg.tta:
    data = aug_data.sort_values("length", ascending=False)  # sort by input length to boost speed
    sub_1 = data.iloc[0::2].copy()
    sub_2 = data.iloc[1::2].copy()

    # One worker thread per GPU, as in the regular pass.
    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    tta_result_df = pd.concat(list(results), axis=0)
    # The augmented inputs are sorted by their own token lengths, so their row
    # order is not guaranteed to match result_df. Reorder the TTA rows by id to
    # follow result_df before averaging; otherwise predictions of different
    # samples could be mixed. Assumes ids are unique.
    tta_aligned = tta_result_df.set_index("id").loc[result_df["id"].to_numpy()]
    # recall TTA's order is flipped
    tta_proba = tta_aligned[["winner_model_b", "winner_model_a", "winner_tie"]].values
    # average original result and TTA result.
    proba = (proba + tta_proba) / 2

print(f"elapsed time: {time.time() - st}")

In [None]:
%%time

# Align the probabilities with the order of test['id'].
# `proba` is a positional array in result_df row order (sorted by length and
# split across two GPUs), while result_df.index still carries the original
# row labels. The previous lookup indexed `proba` with those labels, which
# picks the predictions of a different sample whenever the two orders differ.
# Attaching the ids to the probabilities and merging on id is unambiguous.
pred_cols = ["winner_model_a", "winner_model_b", "winner_tie"]
pred_df = pd.DataFrame(proba, columns=pred_cols)
pred_df.insert(0, "id", result_df["id"].to_numpy())

# A left merge keeps the row order of `test`; validate makes duplicated ids
# fail loudly instead of silently multiplying rows.
submission = test[["id"]].merge(pred_df, on="id", how="left", validate="one_to_one")

# Extract the predictions (kept for any later cell that reads them).
aligned_proba = submission[pred_cols].to_numpy()
test_pred_a = aligned_proba[:, 0]
test_pred_b = aligned_proba[:, 1]
test_pred_tie = aligned_proba[:, 2]

# Save the submission file
submission_path = '/kaggle/working/submission.csv'
submission.to_csv(submission_path, index=False)

In [None]:
# Preview the first rows of the submission frame.
submission.head()

The lower the log loss, the better the model's performance.