In [1]:
import os
import copy
from dataclasses import dataclass

import numpy as np
import torch
from datasets import Dataset
from transformers import (
    BitsAndBytesConfig,
    Gemma2ForSequenceClassification,
    GemmaTokenizerFast,
    Gemma2Config,
    PreTrainedTokenizerBase, 
    EvalPrediction,
    Trainer,
    TrainingArguments,
    DataCollatorWithPadding,
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training, TaskType
from sklearn.metrics import log_loss, accuracy_score

In [2]:
# Notebook-level run switches.
device = 'cuda:0'  # device the re-initialised `score` head is moved to
aug = False  # NOTE(review): not referenced in the visible cells
combined = False  # NOTE(review): not referenced in the visible cells
use_former = False  # when True, the lmsys_39k parquet is concatenated onto the training frame
QLoRA = True  # when True, a 4-bit BitsAndBytesConfig is built for model loading

### Configurations

In [3]:
@dataclass
class Config:
    """Hyper-parameters and paths for the Gemma-2 LoRA fine-tuning run."""
    # NOTE(review): the visible TrainingArguments cell hard-codes "output31" and does not read this field.
    output_dir: str = "output29"
    # Checkpoint to load. Previously tried alternatives:
    # "autodl-tmp/gemma-2-9b-it-bnb-4bit", "unsloth/gemma-2-9b-it-bnb-4bit" (4-bit quantized gemma-2-9b-instruct).
    checkpoint: str = "gemma2-9b-postpretrained-lmsys"
    # NOTE(review): not referenced in the visible cells.
    lora_dir: str = "output4/checkpoint-4844"
    # Maximum token length passed to the tokenizer (longer inputs are truncated).
    max_length: int = 3072
    # Number of folds; with 100 folds each eval split holds every 100th row.
    n_splits: int = 100
    # Which fold is held out for evaluation.
    fold_idx: int = 0
    optim_type: str = "adamw_8bit"
    per_device_train_batch_size: int = 8
    # Gradient accumulation is currently disabled (kept for reference):
    #gradient_accumulation_steps: int = 4
    per_device_eval_batch_size: int = 8
    n_epochs: int = 1
    # LoRA adapters are added to layers with index >= freeze_layers (42 layers in total);
    # 0 means every layer gets adapters.
    freeze_layers: int = 0
    lr: float = 1e-5
    warmup_steps: int = 100
    lora_r: int = 64
    lora_alpha: float = 128
    lora_dropout: float = 0.05
    lora_bias: str = "none"
    
config = Config()

#### Training Arguments

In [4]:
# Trainer settings; batch sizes, epochs, optimizer, learning rate and warmup come from `config`.
# NOTE(review): output_dir is hard-coded to "output31" while Config.output_dir is "output29" — confirm which is intended.
training_args = TrainingArguments(
    output_dir="output31",
    overwrite_output_dir=True,
    report_to="wandb",
    num_train_epochs=config.n_epochs,
    per_device_train_batch_size=config.per_device_train_batch_size,
    # gradient accumulation is currently disabled:
    #gradient_accumulation_steps=config.gradient_accumulation_steps,
    per_device_eval_batch_size=config.per_device_eval_batch_size,
    logging_strategy="steps", 
    logging_steps=10,
    # evaluate once per epoch, but checkpoint every 2000 steps
    eval_strategy="epoch",
    save_strategy="steps",
    save_steps=2000,
    optim=config.optim_type,
    fp16=True,
    learning_rate=config.lr,
    warmup_steps=config.warmup_steps
)

#### LoRA config

In [5]:
# LoRA adapter configuration for sequence classification.
lora_config = LoraConfig(
    r=config.lora_r,
    lora_alpha=config.lora_alpha,
    # target the attention projections (q/k/v/o) and the MLP projections (gate/up/down)
    target_modules=["q_proj", "k_proj", "v_proj", "down_proj","up_proj","o_proj","gate_proj"],
    # adapt layers config.freeze_layers..41 (the layer count 42 is hard-coded here)
    layers_to_transform=[i for i in range(42) if i >= config.freeze_layers],
    lora_dropout=config.lora_dropout,
    bias=config.lora_bias,
    task_type=TaskType.SEQ_CLS,
    # the `score` classification head is registered as a module to save alongside the adapters
    modules_to_save=['score']
)

### Instantiate the tokenizer & model

In [6]:
# Load the tokenizer from the checkpoint path using local files only.
tokenizer = GemmaTokenizerFast.from_pretrained(config.checkpoint, local_files_only=True)
tokenizer.add_eos_token = True  # We'll add <eos> at the end
tokenizer.padding_side = "right"  # pad on the right-hand side of each sequence

In [7]:
# Build the 4-bit quantization config only when the `QLoRA` switch (set in the
# run-switches cell) is on. `bnb_config` always exists afterwards: the model
# loading cell passes `quantization_config=bnb_config` unconditionally, so it
# must be defined (as None) even when quantization is disabled.
bnb_config = None
if QLoRA:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",  # nf4 or fp4
        bnb_4bit_use_double_quant=False,
        bnb_4bit_compute_dtype=torch.float16,
        llm_int8_skip_modules=["score"],  # leave the classification head out of quantization
    )

In [8]:

# Load the checkpoint with a 2-label classification head, in float16, using `bnb_config` for quantization.
model = Gemma2ForSequenceClassification.from_pretrained(
    config.checkpoint,
    num_labels=2,
    torch_dtype=torch.float16,
    device_map="auto",
    quantization_config = bnb_config,
)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [9]:
# Set the label count on the model config explicitly (same value as passed at load time).
model.config.num_labels = 2

In [10]:
# Inspect the classification head as loaded from the checkpoint.
print(model.score)

Linear(in_features=3584, out_features=2, bias=False)


In [11]:
# Discard the loaded classification head before a new one is assigned.
model.score = None

In [12]:
# Fresh, randomly initialised classification head. Its dimensions are read
# from the model config instead of being hard-coded (3584 -> 2), so the cell
# stays correct if a checkpoint with a different hidden size or label count
# is loaded.
model.score = torch.nn.Linear(
    in_features=model.config.hidden_size,
    out_features=model.config.num_labels,
    bias=False,
).to(device)

In [13]:
# Inspect the classification head currently attached to the model.
print(model.score)

Linear(in_features=3584, out_features=2, bias=False)


In [14]:
from peft import PeftModel  # NOTE(review): not used in the visible cells

# Turn off `use_cache` on the model config before training.
model.config.use_cache = False

# Prepare the (quantized) base model for training, then attach the LoRA adapters.
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, lora_config)
# Display the wrapped module tree.
model


PeftModelForSequenceClassification(
  (base_model): LoraModel(
    (model): Gemma2ForSequenceClassification(
      (model): Gemma2Model(
        (embed_tokens): Embedding(256000, 3584, padding_idx=0)
        (layers): ModuleList(
          (0-41): 42 x Gemma2DecoderLayer(
            (self_attn): Gemma2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=3584, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=3584, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDic

In [15]:

# Make sure every parameter that lives under a `modules_to_save` wrapper is
# trainable, printing each affected parameter name as it is switched on.
for param_name, parameter in model.named_parameters():
    if '.modules_to_save.' not in param_name:
        continue
    print(param_name)
    parameter.requires_grad = True

base_model.model.score.modules_to_save.default.weight


In [16]:
# Report trainable vs. total parameter counts.
model.print_trainable_parameters()

trainable params: 216,079,360 || all params: 9,457,792,512 || trainable%: 2.2847


In [17]:
# List every parameter together with whether it will receive gradients.
for param_name, parameter in model.named_parameters():
    status_line = (
        f"Training parameter: {param_name}"
        if parameter.requires_grad
        else f"Frozen parameter: {param_name}"
    )
    print(status_line)

Frozen parameter: base_model.model.model.embed_tokens.weight
Frozen parameter: base_model.model.model.layers.0.self_attn.q_proj.base_layer.weight
Training parameter: base_model.model.model.layers.0.self_attn.q_proj.lora_A.default.weight
Training parameter: base_model.model.model.layers.0.self_attn.q_proj.lora_B.default.weight
Frozen parameter: base_model.model.model.layers.0.self_attn.k_proj.base_layer.weight
Training parameter: base_model.model.model.layers.0.self_attn.k_proj.lora_A.default.weight
Training parameter: base_model.model.model.layers.0.self_attn.k_proj.lora_B.default.weight
Frozen parameter: base_model.model.model.layers.0.self_attn.v_proj.base_layer.weight
Training parameter: base_model.model.model.layers.0.self_attn.v_proj.lora_A.default.weight
Training parameter: base_model.model.model.layers.0.self_attn.v_proj.lora_B.default.weight
Frozen parameter: base_model.model.model.layers.0.self_attn.o_proj.base_layer.weight
Training parameter: base_model.model.model.layers.0.s

In [18]:
# Inspect the `score` head after the model has been wrapped by PEFT.
print(model.score)

ModulesToSaveWrapper(
  (original_module): Linear(in_features=3584, out_features=2, bias=False)
  (modules_to_save): ModuleDict(
    (default): Linear(in_features=3584, out_features=2, bias=False)
  )
)


In [19]:
import pandas as pd

# Load the competition training set.
df = pd.read_parquet("wsdm-cup-multilingual-chatbot-arena/train.parquet") 
# Disabled: append the extra lmsys_39k rows and drop duplicated (prompt, response_a, response_b) triples.
#df_add = pd.read_parquet("lmsys_39k.parquet") 
#df = pd.concat([df, df_add]).reset_index(drop=True)
#df = df.drop_duplicates(subset=['prompt', 'response_a', 'response_b',], keep='last').reset_index(drop=True)
# Store ids as strings.
df["id"] = df["id"].astype("str")
print('Competition data has shape', df.shape )
LN = len(df)  # NOTE(review): LN is not referenced in the visible cells
df.head(1)

Competition data has shape (48439, 8)


Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,o1-preview,reka-core-20240904,Slovak


In [20]:
# Optionally load the additional lmsys_39k data set.
if use_former:
    former_df = pd.read_parquet("wsdm-cup-multilingual-chatbot-arena/lmsys_39k.parquet") 
    # NOTE(review): with default IPython settings a bare expression nested in an `if` is not displayed.
    former_df.head()

In [21]:
# Optionally append the rows of `former_df` whose `turn` is at most 1 to the training frame.
if use_former:
    filtered_former_df = former_df[former_df['turn'] <= 1]
    df = pd.concat([df, filtered_former_df], axis=0).reset_index(drop=True)
    # NOTE(review): with default IPython settings a bare expression nested in an `if` is not displayed.
    df

In [22]:
# The `turn` column is only needed for the filter on the extra data; remove it afterwards.
if use_former:
    df = df.drop(columns=['turn'])

In [23]:
import numpy as np

# Build a name -> integer id mapping over every model that appears on either
# side of a comparison (ids follow the sorted order of the names), then encode
# both model columns with it as int32.
# An equivalent mapping for the `language` column is currently disabled.
model_names = sorted(np.union1d(df["model_a"].unique(), df["model_b"].unique()))
print(f"There are {len(model_names)} unique models:")

MAP_model = {model_name: model_id for model_id, model_name in enumerate(model_names)}
print(MAP_model)

for side in ("model_a", "model_b"):
    df[side] = df[side].map(MAP_model).astype('int32')
df.head(1)

There are 60 unique models:
{'athene-70b-0725': 0, 'c4ai-aya-expanse-32b': 1, 'chatgpt-4o-latest-20240808': 2, 'chatgpt-4o-latest-20240903': 3, 'claude-3-5-sonnet-20240620': 4, 'claude-3-5-sonnet-20241022': 5, 'claude-3-haiku-20240307': 6, 'claude-3-opus-20240229': 7, 'command-r-08-2024': 8, 'command-r-plus-08-2024': 9, 'deepseek-coder-v2-0724': 10, 'deepseek-v2-api-0628': 11, 'deepseek-v2.5': 12, 'gemini-1.5-flash-001': 13, 'gemini-1.5-flash-002': 14, 'gemini-1.5-flash-8b-001': 15, 'gemini-1.5-flash-8b-exp-0827': 16, 'gemini-1.5-flash-exp-0827': 17, 'gemini-1.5-pro-001': 18, 'gemini-1.5-pro-002': 19, 'gemini-1.5-pro-exp-0827': 20, 'gemma-2-27b-it': 21, 'gemma-2-2b-it': 22, 'gemma-2-9b-it': 23, 'gemma-2-9b-it-simpo': 24, 'glm-4-plus': 25, 'gpt-4-0125-preview': 26, 'gpt-4-1106-preview': 27, 'gpt-4-turbo-2024-04-09': 28, 'gpt-4o-2024-05-13': 29, 'gpt-4o-2024-08-06': 30, 'gpt-4o-mini-2024-07-18': 31, 'grok-2-2024-08-13': 32, 'grok-2-mini-2024-08-13': 33, 'internlm2_5-20b-chat': 34, 'jamba

Unnamed: 0,id,prompt,response_a,response_b,winner,model_a,model_b,language
0,00007cff95d7f7974642a785aca248b0f26e60d3312fac...,vieš po Slovensky?,"Áno, hovorím po slovensky. Ako vám môžem pomôcť?","Áno, veď som tu! Môžem ti pomôcť s otázkami al...",model_a,48,55,Slovak


In [24]:
from tqdm import tqdm

# Per-column limits as (max token ids, head cut index, tail cut index).
# The indices address entries of the tokenizer's `offset_mapping`.
# NOTE(review): the tail index is one further from the end than the head index
# is from the start — presumably to skip the trailing <eos> entry; confirm.
_PROMPT_LIMITS = (500, 250, -251)
_RESPONSE_LIMITS = (700, 350, -351)


def _snip_middle(text, max_ids, head_idx, tail_idx):
    """Return `text` unchanged if it tokenizes to at most `max_ids` ids.

    Otherwise keep the characters up to the end of token `head_idx` and the
    characters from the start of token `tail_idx`, joined by "\\n(snip)\\n".
    NOTE(review): for inputs only one or two ids over the limit the kept head
    and tail cover the whole text, so the marker is inserted without removing
    anything — confirm this is acceptable.
    """
    encoded = tokenizer(text, return_offsets_mapping=True)
    if len(encoded['input_ids']) <= max_ids:
        return text
    offsets = encoded['offset_mapping']
    head_end = offsets[head_idx][1]
    tail_start = offsets[tail_idx][0]
    return text[:head_end] + "\n(snip)\n" + text[tail_start:]


# Replace missing texts with '' and shorten over-long ones, column by column.
for col in ['prompt', 'response_a', 'response_b']:
    max_ids, head_idx, tail_idx = _PROMPT_LIMITS if col == 'prompt' else _RESPONSE_LIMITS
    df[col] = df[col].fillna('')
    df[col] = [_snip_middle(text, max_ids, head_idx, tail_idx) for text in tqdm(df[col])]

100%|██████████| 48439/48439 [00:30<00:00, 1589.38it/s]
100%|██████████| 48439/48439 [01:06<00:00, 723.56it/s]
100%|██████████| 48439/48439 [01:08<00:00, 706.25it/s]


### Instantiate the dataset

In [25]:
from datasets import Dataset
# Convert the preprocessed frame into a `datasets.Dataset`.
ds = Dataset.from_pandas(df)

# Disabled alternatives: load the parquet file directly, or keep only a small subset for a quick demo run.
#ds = load_dataset("parquet", data_files="wsdm-cup-multilingual-chatbot-arena/train.parquet", split="train"  # or "all")
#ds = ds.select(torch.arange(64))  # We only use the first 100 data for demo purpose

In [26]:
class CustomTokenizer:
    """Callable that turns a batch of comparison rows into model inputs.

    Each row's prompt and two responses are joined into a single text that
    starts with a fixed task instruction; the text is tokenized (truncated
    to ``max_length``) and a binary label is attached: 0 when ``winner`` is
    ``"model_a"``, 1 for any other value.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizerBase,
        max_length: int
    ) -> None:
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.task_prompt = ("Your task is to pick the best response between response A and response B. Answer only with 'A' or 'B'. Think carefully before answering.\n\n")

    def __call__(self, batch: dict) -> dict:
        """Encode one batch (a dict of column name -> list of values)."""
        rows = zip(batch["prompt"], batch["response_a"], batch["response_b"])
        texts = [
            self.task_prompt + "<prompt>: " + question
            + "\n\n<response_a>: " + answer_a
            + "\n\n<response_b>: " + answer_b
            for question, answer_a, answer_b in rows
        ]
        encoded = self.tokenizer(texts, max_length=self.max_length, truncation=True)
        # Anything that is not an explicit win for model A is labelled 1.
        labels = [0 if winner == "model_a" else 1 for winner in batch["winner"]]
        return {**encoded, "labels": labels}

In [27]:
# Tokenize the whole dataset in batches; `ds_ord` gains the tokenizer outputs plus a `labels` column.
encode = CustomTokenizer(tokenizer, max_length=config.max_length)
ds_ord = ds.map(encode, batched=True)

Map:   0%|          | 0/48439 [00:00<?, ? examples/s]

### Compute metrics

We'll compute the log-loss used on the leaderboard (LB), plus accuracy as an auxiliary metric.

In [28]:
def compute_metrics(eval_preds: EvalPrediction) -> dict:
    """Compute accuracy and log-loss from the raw classification logits.

    Args:
        eval_preds: evaluation output whose ``predictions`` are the logits
            (one column per class) and whose ``label_ids`` are the integer
            class labels.

    Returns:
        ``{"acc": accuracy, "log_loss": log-loss of the softmax probabilities}``.
    """
    logits = eval_preds.predictions
    labels = eval_preds.label_ids
    # Cast to float32 before the softmax so half-precision logits are handled.
    probs = torch.from_numpy(logits).float().softmax(-1).numpy()
    # Pass the full class list explicitly: without it `log_loss` infers the
    # classes from `labels` and fails when an eval split holds a single class.
    loss = log_loss(y_true=labels, y_pred=probs, labels=np.arange(probs.shape[-1]))
    acc = accuracy_score(y_true=labels, y_pred=logits.argmax(-1))
    return {"acc": acc, "log_loss": loss}

### Split

Here, train and eval are split by row position: row `i` goes to the eval set when `i % config.n_splits == config.fold_idx`, and to the train set otherwise.

In [29]:
# One (train_indices, eval_indices) pair per fold: fold k holds out the rows
# k, k + n_splits, k + 2 * n_splits, ... and trains on all remaining rows.
n_rows = len(ds)
folds = [
    (
        [row for row in range(n_rows) if row % config.n_splits != k],
        list(range(k, n_rows, config.n_splits)),
    )
    for k in range(config.n_splits)
]

In [30]:
# Pick the train/eval row indices of the configured fold.
train_idx, eval_idx = folds[config.fold_idx]

# NOTE(review): the captured output shows a warning pointing at this call — possibly the
# `tokenizer=` argument being deprecated in the installed transformers version; confirm.
trainer = Trainer(
    args=training_args, 
    model=model,
    tokenizer=tokenizer,
    train_dataset=ds_ord.select(train_idx),
    eval_dataset=ds_ord.select(eval_idx),
    compute_metrics=compute_metrics,
    # pads each batch with the tokenizer's padding settings
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)

# Run training (evaluation and checkpointing follow `training_args`).
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmarcolau83857272[0m ([33mmarcolau83857272-sun-yat-sen-university[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss,Acc,Log Loss,Runtime,Samples Per Second,Steps Per Second
1,0.5348,0.58478,0.696907,0.584788,199.151,2.435,0.306




TrainOutput(global_step=5995, training_loss=0.5820976180171251, metrics={'train_runtime': 68365.2443, 'train_samples_per_second': 0.701, 'train_steps_per_second': 0.088, 'total_flos': 4.204156111237792e+18, 'train_loss': 0.5820976180171251, 'epoch': 1.0})

In [31]:
def save_model_with_lora(trainer, model, output_dir):
    """Save the tokenizer plus only the LoRA and ``score`` weights of ``model``.

    Args:
        trainer: accepted for interface compatibility; not used.
        model: module whose ``state_dict()`` is filtered.
        output_dir: directory that receives the tokenizer files and
            ``lora_score_weights.bin``.

    The weights file is a plain dict written with ``torch.save``; it contains
    every state-dict entry whose key includes ``"lora"`` or ``"score"``.
    """
    # Save the tokenizer. NOTE: this is the notebook-level `tokenizer`, not an
    # argument of this function.
    tokenizer.save_pretrained(output_dir)

    # Build the state dict once and keep only the LoRA and score-head entries.
    adapter_state = {
        key: tensor
        for key, tensor in model.state_dict().items()
        if "lora" in key or "score" in key
    }

    # Write the LoRA and score-head weights.
    torch.save(adapter_state, os.path.join(output_dir, "lora_score_weights.bin"))

In [32]:
def _save_lora_and_score(output_dir=None, _internal_call=False):
    """Replacement for ``trainer.save_model`` that stores only LoRA + score weights.

    Accepts the same optional arguments the Trainer passes to ``save_model``
    so the override does not raise a TypeError if the Trainer calls it itself;
    falls back to ``trainer.args.output_dir`` when no directory is given.
    """
    save_model_with_lora(trainer, model, output_dir or trainer.args.output_dir)


trainer.save_model = _save_lora_and_score
trainer.save_model()