# Introduction

This notebook is forked from [this Gemma-2 9b 4-bit QLoRA inference notebook](https://www.kaggle.com/code/emiz6413/inference-gemma-2-9b-4-bit-qlora) and adapted to Qwen2.

Qwen2 is a language model series including decoder language models of different model sizes. For each size, a base language model and an aligned chat model are released. It is based on the Transformer architecture with SwiGLU activation, attention QKV bias, group query attention, a mixture of sliding window attention and full attention, etc. Additionally, it has an improved tokenizer adapted to multiple natural languages and code.

This notebook runs inference with a Qwen2 1.5B model that was fine-tuned with a batch size of 4 for 1 epoch; training takes around 1 hour on an A100. I expect that Qwen2 7B would perform better on this task. The training code (for Colab) is attached at the end of the notebook.

In [None]:
# Offline install: --no-index stops pip from contacting PyPI and --find-links
# points it at the local directory of wheel files instead.
!pip install transformers peft accelerate bitsandbytes \
    -U --no-index --find-links /kaggle/input/lmsys-wheel-files

In [None]:
import time
from dataclasses import dataclass
from concurrent.futures import ThreadPoolExecutor

import torch
import sklearn

import numpy as np
import pandas as pd
from transformers import Qwen2ForSequenceClassification, AutoTokenizer, BitsAndBytesConfig
from transformers.data.data_collator import pad_without_fast_tokenizer_warning
from peft import PeftModel

In [None]:
# The model-loading cells place one replica on cuda:0 and one on cuda:1, so
# stop early unless exactly two GPUs are visible.
assert torch.cuda.device_count() == 2

## Configurations

In [None]:
@dataclass
class Config:
    """Inference settings for the Qwen2 + LoRA pipeline.

    Every attribute carries a type annotation so that ``@dataclass`` actually
    registers it as a field. Without annotations the decorator generates an
    ``__init__`` that accepts no arguments, so a setting could not be
    overridden with e.g. ``Config(max_length=1024)``. ``Config()`` with no
    arguments yields the same values as before.
    """

    # Directory containing the base Qwen2 checkpoint.
    model_dir: str = '/kaggle/input/qwen2/transformers/qwen2-1.5b-instruct/1'
    # Directory containing the fine-tuned LoRA adapter checkpoint.
    lora_dir: str = '/kaggle/input/lmsys-qwen2-1-5b-checkpoint/checkpoint-5748'
    # Token budget used when tokenizing (see spread_max_length).
    max_length: int = 2048
    # Number of rows per forward pass.
    batch_size: int = 4
    device: torch.device = torch.device("cuda")
    # Test time augmentation: <prompt>-<model-b's response>-<model-a's response>
    tta: bool = False
    # Whether to apply max_length//3 on each input or max_length on the
    # concatenated input.
    spread_max_length: bool = False

cfg = Config()

# Load & pre-process Data 

In [None]:
# Load the competition test set; its prompt / response_a / response_b columns
# are decoded into plain text in the next cell.
test = pd.read_csv('/kaggle/input/lmsys-chatbot-arena/test.csv')

In [None]:
def process_text(text: str) -> str:
    """Decode a serialized list of strings and join its items with spaces.

    ``text`` is expected to be the string form of a list (for example
    ``'["hi", null]'``). The bare name ``null`` is resolved to an empty
    string, so such entries contribute nothing but their separator.
    """
    # NOTE(security): eval() executes arbitrary Python, so this is only
    # acceptable for trusted input such as the competition CSV. It is left
    # as-is to keep the decoding identical to the training preprocessing.
    turns = eval(text, {"null": ""})
    return " ".join(turns)

# Decode each serialized text column in place, in the same order as before.
for column in ('prompt', 'response_a', 'response_b'):
    test.loc[:, column] = test[column].apply(process_text)

display(test.head(5))

# Tokenize

In [None]:
def tokenize(
    tokenizer, prompt, response_a, response_b, max_length=cfg.max_length, spread_max_length=cfg.spread_max_length
):
    """Build model inputs from parallel prompt / response sequences.

    Each sample is laid out as
    ``<User prompt>: ...\\n\\n<response_a>: ...\\n\\n<response_b>: ...``.

    Args:
        tokenizer: Callable tokenizer returning ``input_ids`` (and
            ``attention_mask``) for a list of strings.
        prompt, response_a, response_b: Iterables of strings, aligned by
            position.
        max_length: Token budget per sample.
        spread_max_length: If true, each of the three segments is tokenized
            and truncated separately to ``max_length // 3`` and the token
            lists are concatenated. Otherwise the concatenated text is
            tokenized once and truncated to ``max_length``.

    Returns:
        ``(input_ids, attention_mask)``, one unpadded entry per sample.
    """
    # NOTE(review): the commented-out training code at the end of this file
    # uses the prefix "<prompt>: " instead; confirm which prefix the loaded
    # checkpoint was actually trained with.
    prompts = ["<User prompt>: " + text for text in prompt]
    answers_a = ["\n\n<response_a>: " + text for text in response_a]
    answers_b = ["\n\n<response_b>: " + text for text in response_b]

    if not spread_max_length:
        joined = [p + a + b for p, a, b in zip(prompts, answers_a, answers_b)]
        encoded = tokenizer(joined, max_length=max_length, truncation=True, padding=False)
        return encoded.input_ids, encoded.attention_mask

    def encode_segment(segments):
        # Every segment gets an equal third of the overall budget.
        return tokenizer(segments, max_length=max_length//3, truncation=True, padding=False).input_ids

    prompt_ids = encode_segment(prompts)
    a_ids = encode_segment(answers_a)
    b_ids = encode_segment(answers_b)
    input_ids = [p + a + b for p, a, b in zip(prompt_ids, a_ids, b_ids)]
    # Nothing is padded here, so every position is a real token.
    attention_mask = [[1] * len(ids) for ids in input_ids]
    return input_ids, attention_mask

In [None]:
%%time

tokenizer = AutoTokenizer.from_pretrained(cfg.model_dir)
# NOTE(review): presumably meant to append an EOS token to every sequence;
# confirm that this tokenizer class honours the attribute.
tokenizer.add_eos_token = True
tokenizer.padding_side = "right"


def _build_token_frame(first_response, second_response):
    """Tokenize the test set with the two responses in the given order."""
    frame = pd.DataFrame()
    frame["id"] = test["id"]
    frame["input_ids"], frame["attention_mask"] = tokenize(
        tokenizer, test["prompt"], first_response, second_response
    )
    frame["length"] = frame["input_ids"].apply(len)
    return frame


data = _build_token_frame(test["response_a"], test["response_b"])
# Swap response_a & response_b for test time augmentation.
aug_data = _build_token_frame(test["response_b"], test["response_a"])

In [None]:
# Sanity check: decode the first row to inspect the assembled input text.
print(tokenizer.decode(data["input_ids"][0]))

In [None]:
# Same check for the augmented frame, where the two responses are swapped.
print(tokenizer.decode(aug_data["input_ids"][0]))

# Load model

In [None]:
def _load_base_model(device):
    """Load the 3-class Qwen2 sequence classifier onto ``device``."""
    return Qwen2ForSequenceClassification.from_pretrained(
        cfg.model_dir,
        num_labels=3,
        device_map=device,
        use_cache=False,
    )


# One full replica per GPU so the two halves of the data can run in parallel.
device_0 = torch.device('cuda:0')
model_0 = _load_base_model(device_0)

device_1 = torch.device('cuda:1')
model_1 = _load_base_model(device_1)

#### Load LoRA adapter

In [None]:
def _attach_adapter(base_model):
    """Wrap ``base_model`` with the LoRA adapter and set its pad token id."""
    adapted = PeftModel.from_pretrained(base_model, cfg.lora_dir)
    # NOTE(review): batches are padded by the tokenizer at inference time;
    # confirm tokenizer.pad_token_id agrees with the id set here.
    adapted.config.pad_token_id = adapted.config.eos_token_id
    return adapted


model_0 = _attach_adapter(model_0)
model_1 = _attach_adapter(model_1)

# Inference


In [None]:
@torch.no_grad()
@torch.cuda.amp.autocast()
def inference(df, model, device, batch_size=cfg.batch_size, max_length=cfg.max_length):
    """Score every row of ``df`` with ``model`` and attach class probabilities.

    Rows are consumed in their existing order, ``batch_size`` at a time. Each
    batch is padded to its longest sequence with the module-level
    ``tokenizer``, moved to ``device`` and passed through ``model`` with
    gradients disabled and CUDA autocast enabled (see the decorators).

    Args:
        df: DataFrame with ``input_ids`` and ``attention_mask`` columns. It is
            modified in place: three probability columns are added.
        model: Callable returning an object with a ``logits`` attribute.
        device: Device the padded batch is moved to before the forward pass.
        batch_size: Number of rows per forward pass.
        max_length: Not used by this function.

    Returns:
        The same ``df`` with ``winner_model_a``, ``winner_model_b`` and
        ``winner_tie`` holding softmax outputs 0, 1 and 2 respectively.
    """
    columns = ("winner_model_a", "winner_model_b", "winner_tie")
    scores = {name: [] for name in columns}

    for offset in range(0, len(df), batch_size):
        # iloc slicing clamps at the end, so the last batch may be shorter.
        chunk = df.iloc[offset:offset + batch_size]
        features = {
            "input_ids": chunk["input_ids"].to_list(),
            "attention_mask": chunk["attention_mask"].to_list(),
        }
        batch = pad_without_fast_tokenizer_warning(
            tokenizer,
            features,
            padding="longest",
            pad_to_multiple_of=None,
            return_tensors="pt",
        )
        logits = model(**batch.to(device)).logits
        probs = logits.softmax(-1).cpu()

        for position, name in enumerate(columns):
            scores[name].extend(probs[:, position].tolist())

    for name in columns:
        df[name] = scores[name]

    return df

In [None]:
st = time.time()

# Longest sequences first: rows of similar length end up in the same batch,
# which makes the most of dynamic ("longest") padding.
data = data.sort_values("length", ascending=False)
# Deal the sorted rows out alternately so that the total number of tokens in
# sub_1 and sub_2 is more or less the same.
sub_1, sub_2 = (data.iloc[offset::2].copy() for offset in range(2))

# One worker thread per GPU; each runs `inference` on its own half.
with ThreadPoolExecutor(max_workers=2) as executor:
    results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

result_df = pd.concat(list(results), axis=0)
proba = result_df[["winner_model_a", "winner_model_b", "winner_tie"]].values

print(f"elapsed time: {time.time() - st}")

In [None]:
st = time.time()

if cfg.tta:
    # Sort by input length to boost speed. A separate name is used so the
    # un-swapped `data` frame is not replaced by the augmented one, which
    # would silently change the result if the previous cell were re-run.
    tta_data = aug_data.sort_values("length", ascending=False)
    sub_1 = tta_data.iloc[0::2].copy()
    sub_2 = tta_data.iloc[1::2].copy()

    with ThreadPoolExecutor(max_workers=2) as executor:
        results = executor.map(inference, (sub_1, sub_2), (model_0, model_1), (device_0, device_1))

    tta_result_df = pd.concat(list(results), axis=0)
    # The swapped texts are tokenized independently, so their lengths (and
    # therefore the sorted row order) are not guaranteed to match the first
    # run. Re-align on the index both frames share before averaging, otherwise
    # probabilities from different samples could be mixed.
    # Assumes the index is unique (the default RangeIndex from read_csv).
    tta_result_df = tta_result_df.loc[result_df.index]
    # Recall TTA's order is flipped: its "a" column refers to the original "b".
    tta_proba = tta_result_df[["winner_model_b", "winner_model_a", "winner_tie"]].values
    # Average original result and TTA result.
    proba = (proba + tta_proba) / 2

print(f"elapsed time: {time.time() - st}")

In [None]:
# Write the final (possibly TTA-averaged) probabilities back, column by column.
label_columns = ["winner_model_a", "winner_model_b", "winner_tie"]
for position, column in enumerate(label_columns):
    result_df.loc[:, column] = proba[:, position]

submission_df = result_df[["id", *label_columns]]
submission_df.to_csv('submission.csv', index=False)
display(submission_df)

# Training Code

This training code is forked from and inspired by https://www.kaggle.com/code/emiz6413/training-gemma-2-9b-4-bit-qlora-fine-tuning


## Transfer Kaggle Data

In [None]:
# # IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# # TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# # THEN FEEL FREE TO DELETE THIS CELL.
# # NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# # ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# # NOTEBOOK.

# import os
# import sys
# from tempfile import NamedTemporaryFile
# from urllib.request import urlopen
# from urllib.parse import unquote, urlparse
# from urllib.error import HTTPError
# from zipfile import ZipFile
# import tarfile
# import shutil

# CHUNK_SIZE = 40960
# DATA_SOURCE_MAPPING = 'lmsys-chatbot-arena:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F66631%2F8346466%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240716%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240716T010448Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D4ae2cd10c5a4750deca45551519904c5858980d5cf8cd8ade09b2299d926c86c895b50ba333acf0db5210d0dd29197c9a9c5a525c8afd0186b88f17d3ca756f0562ad5acfa2e856e8b159f554e61f72102865abd60add751dd59bed5126536d977d6fe54d2e85f8e5baa8d3d75337d0a222a89f0f30fa6dd7c360e4a192363dc417e9e4a9c9c23368991db65b4994c2200bee494d8d5e2684f754ab1b1a511f7db3652e01ab658b04d26cc1321e783fa5509f67d4c438808adc7932a0e79a21849375023b36e90cbe288cf68a6185b2ce950464b71c9b6133d49769c67e77a5298809fb63da23c0655165e80661623bd9bb908bc9a486dbc9e09caebf2a01392,qwen2/transformers/qwen2-1.5b-instruct/1:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-models-data%2F52038%2F62308%2Fbundle%2Farchive.tar.gz%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240716%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240716T010448Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D80f6a1f964073d960129f611693afca0666128c1a319b7ab93769a1993d5910e4995cf3c6f2f87323452999b069eafb9f7b01be97dbb80a3441cd0b4871d9d35379750a3b5614397253624b097961c886c48df889ac7b1100231715e2b60bf4f3ffccd5e7fc68b3d7d0668f350a8fefbddb90275770e75aaa7f74fae68b2f5314f610ec2f1abf0436156e9426e6173e229172ca0c4ee91eb2d768de3190c9f07e6c28b73bc8c5553e2dac6320842103524591b663021a41801bb2274b5fd91dd62f174ba8976c74995012ad3ed34ecf554a9e2cb08f91813e9cacc9b8d554c6a7a037414635f30e506ea39f63fb4db01f5cf3322dca02097f9550b1a5454ae99'

# KAGGLE_INPUT_PATH='/kaggle/input'
# KAGGLE_WORKING_PATH='/kaggle/working'
# KAGGLE_SYMLINK='kaggle'

# !umount /kaggle/input/ 2> /dev/null
# shutil.rmtree('/kaggle/input', ignore_errors=True)
# os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
# os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

# try:
#   os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
# except FileExistsError:
#   pass
# try:
#   os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
# except FileExistsError:
#   pass

# for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
#     directory, download_url_encoded = data_source_mapping.split(':')
#     download_url = unquote(download_url_encoded)
#     filename = urlparse(download_url).path
#     destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
#     try:
#         with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
#             total_length = fileres.headers['content-length']
#             print(f'Downloading {directory}, {total_length} bytes compressed')
#             dl = 0
#             data = fileres.read(CHUNK_SIZE)
#             while len(data) > 0:
#                 dl += len(data)
#                 tfile.write(data)
#                 done = int(50 * dl / int(total_length))
#                 sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
#                 sys.stdout.flush()
#                 data = fileres.read(CHUNK_SIZE)
#             if filename.endswith('.zip'):
#               with ZipFile(tfile) as zfile:
#                 zfile.extractall(destination_path)
#             else:
#               with tarfile.open(tfile.name) as tarfile:
#                 tarfile.extractall(destination_path)
#             print(f'\nDownloaded and uncompressed: {directory}')
#     except HTTPError as e:
#         print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
#         continue
#     except OSError as e:
#         print(f'Failed to load {download_url} to path {destination_path}')
#         continue

# print('Data source import complete.')


## Install and Load libraries

In [None]:
## NOTE: this version pin is carried over from the forked Gemma-2 notebook (gemma-2 is available from transformers>=4.42.3)
# !pip install -U "transformers>=4.42.3" bitsandbytes accelerate peft datasets

In [None]:
# import os
# import copy
# from dataclasses import dataclass

# import numpy as np
# import torch
# from datasets import Dataset
# from transformers import (
#     BitsAndBytesConfig,
#     AutoTokenizer,
#     Qwen2ForSequenceClassification,
#     PreTrainedTokenizerBase,
#     EvalPrediction,
#     Trainer,
#     TrainingArguments,
#     DataCollatorWithPadding,
# )
# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
# from sklearn.metrics import log_loss, accuracy_score

## Training Configuration

In [None]:
# @dataclass
# class Config:
#     output_dir: str = "output"
#     model_name: str = "/kaggle/input/qwen2/transformers/qwen2-1.5b-instruct/1"
#     checkpoint: str = "/kaggle/input/qwen2/transformers/qwen2-1.5b-instruct/1" # local directory of qwen model
#     max_length: int = 2048
#     n_splits: int = 5
#     fold_idx: int = 0
#     optim_type: str = "adamw_8bit"
#     per_device_train_batch_size: int = 4
#     gradient_accumulation_steps: int = 2  # global batch size is 8
#     per_device_eval_batch_size: int = 8
#     n_epochs: int = 1
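#     freeze_layers: int = 16  # we don't add adapters to the first 16 layers; the "42 layers in total" figure appears to be carried over from the forked Gemma-2 9b notebook — check num_hidden_layers for this model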
#     lr: float = 2e-4
#     warmup_steps: int = 20
#     lora_r: int = 16
#     lora_alpha: float = lora_r * 2
#     lora_dropout: float = 0.05
#     lora_bias: str = "none"

# config = Config()

## Training Arguments

In [None]:
# training_args = TrainingArguments(
#     output_dir="output",
#     overwrite_output_dir=True,
#     report_to="none",
#     num_train_epochs=config.n_epochs,
#     per_device_train_batch_size=config.per_device_train_batch_size,
#     gradient_accumulation_steps=config.gradient_accumulation_steps,
#     per_device_eval_batch_size=config.per_device_eval_batch_size,
#     logging_steps=10,
#     eval_strategy="epoch",
#     save_strategy="steps",
#     save_steps=200,
#     optim=config.optim_type,
#     fp16=True,
#     learning_rate=config.lr,
#     warmup_steps=config.warmup_steps,
# )

## LoRA Config

In [None]:
# lora_config = LoraConfig(
#     r=config.lora_r,
#     lora_alpha=config.lora_alpha,
#     # only target self-attention
#     target_modules=["q_proj", "k_proj", "v_proj"],
#     layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
#     lora_dropout=config.lora_dropout,
#     bias=config.lora_bias,
#     task_type=TaskType.SEQ_CLS,
# )

## Initialize Tokenizer and Model

In [None]:
# tokenizer = AutoTokenizer.from_pretrained(config.model_name)
# tokenizer.add_eos_token = True
# tokenizer.padding_side = "right"
# model = Qwen2ForSequenceClassification.from_pretrained(
#     config.model_name,
#     num_labels=3,
#     torch_dtype=torch.float16,
#     device_map="auto",
# )
# model.config.use_cache = False
# model = prepare_model_for_kbit_training(model)
# model = get_peft_model(model, lora_config)
# model.config.pad_token_id = model.config.eos_token_id
# model

In [None]:
# model.print_trainable_parameters()

# Load Training Data

In [None]:
# ds = Dataset.from_csv("/kaggle/input/lmsys-chatbot-arena/train.csv")
# class CustomTokenizer:
#     def __init__(
#         self,
#         tokenizer: PreTrainedTokenizerBase,
#         max_length: int
#     ) -> None:
#         self.tokenizer = tokenizer
#         self.max_length = max_length

#     def __call__(self, batch: dict) -> dict:
#         prompt = ["<prompt>: " + self.process_text(t) for t in batch["prompt"]]
#         response_a = ["\n\n<response_a>: " + self.process_text(t) for t in batch["response_a"]]
#         response_b = ["\n\n<response_b>: " + self.process_text(t) for t in batch["response_b"]]
#         texts = [p + r_a + r_b for p, r_a, r_b in zip(prompt, response_a, response_b)]
#         tokenized = self.tokenizer(texts, max_length=self.max_length, truncation=True)
#         labels=[]
#         for a_win, b_win in zip(batch["winner_model_a"], batch["winner_model_b"]):
#             if a_win:
#                 label = 0
#             elif b_win:
#                 label = 1
#             else:
#                 label = 2
#             labels.append(label)
#         return {**tokenized, "labels": labels}

#     @staticmethod
#     def process_text(text: str) -> str:
#         return " ".join(eval(text, {"null": ""}))
# encode = CustomTokenizer(tokenizer, max_length=config.max_length)
# ds = ds.map(encode, batched=True)

## Run Training

In [None]:
# def compute_metrics(eval_preds: EvalPrediction) -> dict:
#     preds = eval_preds.predictions
#     labels = eval_preds.label_ids
#     probs = torch.from_numpy(preds).float().softmax(-1).numpy()
#     loss = log_loss(y_true=labels, y_pred=probs)
#     acc = accuracy_score(y_true=labels, y_pred=preds.argmax(-1))
#     return {"acc": acc, "log_loss": loss}

# folds = [
#     (
#         [i for i in range(len(ds)) if i % config.n_splits != fold_idx],
#         [i for i in range(len(ds)) if i % config.n_splits == fold_idx]
#     )
#     for fold_idx in range(config.n_splits)
# ]

In [None]:
# train_idx, eval_idx = folds[config.fold_idx]

# trainer = Trainer(
#     args=training_args,
#     model=model,
#     tokenizer=tokenizer,
#     train_dataset=ds.select(train_idx),
#     eval_dataset=ds.select(eval_idx),
#     compute_metrics=compute_metrics,
#     data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
# )
# trainer.train()