In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#### **<font color="red">If you like this notebook, please give me an upvote.</font>**

### Note:
1. My model was trained locally, so if you intend to train it on Kaggle, please pay attention to the GPU usage time and memory usage.
2. I directly copied the code to Kaggle, so the file paths in the code need to be modified.


# Training Process

In [None]:
import json
import pandas as pd
import torch
from datasets import Dataset
from modelscope import snapshot_download, AutoTokenizer
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq
import os
from sklearn.model_selection import train_test_split

Define the `data_transfer` function, which is used to convert data into a specific message format.

In [None]:
def data_transfer(dataframe, output_pt, instruction):
    """Write `dataframe` to `output_pt` as JSON Lines instruction-tuning records.

    Each row becomes one JSON object with three keys:
      * 'instruction' - the `instruction` string, repeated for every row;
      * 'input'       - 'PROMPT: ' + prompt, 'MODEL_A: ' + response_a and
                        'MODEL_B: ' + response_b concatenated with no separator;
      * 'output'      - the row's 'winner' value.

    Args:
        dataframe: pandas DataFrame with the columns 'prompt', 'response_a',
            'response_b' and 'winner'. Rows are read positionally, so the
            frame's index does not have to be 0..n-1.
        output_pt: path of the .jsonl file to create (overwritten if present).
            Missing parent directories are created.
        instruction: system instruction stored in every record.

    Raises:
        KeyError: if one of the required columns is missing.
        TypeError: if a text column holds a non-string value.
    """
    # Iterate column values positionally instead of looking rows up by index
    # label: a frame coming straight out of train_test_split keeps its shuffled
    # labels, and a label lookup such as y[0] would raise KeyError on it.
    rows = zip(
        dataframe['prompt'],
        dataframe['response_a'],
        dataframe['response_b'],
        dataframe['winner'],
    )

    # Build every record before touching the disk so a bad row cannot leave a
    # half-written file behind.
    messages = [
        {
            'instruction': instruction,
            'input': 'PROMPT: ' + prompt + 'MODEL_A: ' + response_a + 'MODEL_B: ' + response_b,
            'output': winner,
        }
        for prompt, response_a, response_b, winner in rows
    ]

    # open(..., "w") does not create directories, so make sure the target
    # folder exists first.
    out_dir = os.path.dirname(output_pt)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_pt, "w", encoding="utf-8") as file:
        for message in messages:
            file.write(json.dumps(message, ensure_ascii=False) + "\n")

Use the `process_func` function to tokenize the data in advance.

In [None]:
def process_func(example, max_length=5000):
    """Tokenize one instruction record into causal-LM training features.

    The prompt is rendered in the `<|im_start|>` / `<|im_end|>` chat layout
    (system turn, user turn, then an opened assistant turn) and the expected
    answer is appended, followed by `tokenizer.pad_token_id` as a final token
    that is both attended to and supervised.

    Args:
        example: mapping with the keys 'instruction', 'input' and 'output'.
        max_length: upper bound on the number of tokens in the returned
            sequences. Defaults to 5000, the limit that used to be hard-coded.

    Returns:
        dict with 'input_ids', 'attention_mask' and 'labels' lists of equal
        length. Prompt positions carry the label -100; only the answer and
        the trailing pad token carry real labels.

    Relies on the module-level `tokenizer`.
    """
    prompt = tokenizer(
        f"<|im_start|>system\n{example['instruction']}<|im_end|>\n<|im_start|>user\n{example['input']}<|im_end|>\n<|im_start|>assistant\n",
        add_special_tokens=False,
    )
    response = tokenizer(f"{example['output']}", add_special_tokens=False)

    prompt_ids = prompt["input_ids"]
    prompt_mask = prompt["attention_mask"]
    answer_ids = response["input_ids"] + [tokenizer.pad_token_id]
    answer_mask = response["attention_mask"] + [1]

    if len(prompt_ids) + len(answer_ids) > max_length:
        # Truncate the prompt, never the answer. Cutting the sequence from the
        # right would drop every supervised token and leave an example whose
        # labels are all -100. The closing "<|im_end|>...assistant" header is
        # re-attached so the shortened prompt still ends with an opened
        # assistant turn.
        suffix = tokenizer("<|im_end|>\n<|im_start|>assistant\n", add_special_tokens=False)
        budget = max(max_length - len(answer_ids), 0)
        if budget >= len(suffix["input_ids"]):
            head = budget - len(suffix["input_ids"])
            prompt_ids = prompt_ids[:head] + suffix["input_ids"]
            prompt_mask = prompt_mask[:head] + suffix["attention_mask"]
        else:
            # Not even the header fits; keep whatever prompt prefix does.
            prompt_ids = prompt_ids[:budget]
            prompt_mask = prompt_mask[:budget]

    # Final clamp only matters in the degenerate case where the answer alone
    # is longer than max_length.
    input_ids = (prompt_ids + answer_ids)[:max_length]
    attention_mask = (prompt_mask + answer_mask)[:max_length]
    labels = ([-100] * len(prompt_ids) + answer_ids)[:max_length]
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


## load and process data

In [None]:
# Source data: one row per (prompt, response_a, response_b, winner) comparison.
original_pt = "dataset/train.parquet"
original = pd.read_parquet(original_pt)

# Hold out 1% of the rows for evaluation. The seed is fixed so that re-running
# the notebook reproduces the same split; without it, rows move between train
# and test on every run and checkpoints from an earlier run end up being
# scored on data they were trained on.
train, test = train_test_split(original, train_size=0.99, random_state=42)

# Renumber both frames 0..n-1 after the shuffle.
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [None]:
# Destination files for the instruction-formatted train / test splits.
train_pt, test_pt = "./cooked-dataset/train.jsonl", "./cooked-dataset/test.jsonl"

# System instruction stored with every record.
instruction = """In the text provided below, PROMPT is the question presented; MODEL_A is the response from the first model; MODEL_B is the response from the second model. Please select the best answer from the two responses above. If the first answer is better, return "model_a"; if the second answer is better, return "model_b"."""

# Convert both splits with the same instruction.
for frame, target_path in ((train, train_pt), (test, test_pt)):
    data_transfer(frame, target_path, instruction)

## load model

In [None]:
# Directory holding the base Qwen instruct checkpoint.
model_pt = "/kaggle/input/qwen_0.5b_instruct/transformers/default/1"

tokenizer = AutoTokenizer.from_pretrained(
    model_pt,
    use_fast=False,
    trust_remote_code=True,
)
model = AutoModelForCausalLM.from_pretrained(
    model_pt,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
# NOTE(review): presumably required because training later enables
# gradient_checkpointing - confirm against the PEFT/Transformers docs.
model.enable_input_require_grads()

In [None]:
# Reload the exported training records and tokenize each one with
# process_func; the raw text columns are dropped from the result.
train_df = pd.read_json('cooked-dataset/train.jsonl', lines=True)
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(
    process_func,
    remove_columns=train_ds.column_names,
)

In [None]:
# Same treatment for the held-out records: reload, tokenize with
# process_func, and drop the raw text columns.
test_df = pd.read_json('cooked-dataset/test.jsonl', lines=True)
test_ds = Dataset.from_pandas(test_df)
test_dataset = test_ds.map(
    process_func,
    remove_columns=test_ds.column_names,
)

In [None]:
# Persist the tokenized training examples, one JSON object per line.
with open('cooked-dataset/tokenized-train.jsonl', "w", encoding="utf-8") as file:
    file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in train_dataset
    )

In [None]:
# Persist the tokenized held-out examples, one JSON object per line.
with open('cooked-dataset/tokenized-test.jsonl', "w", encoding="utf-8") as file:
    file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n" for record in test_dataset
    )

## simple EDA

In [None]:
from tqdm import tqdm
import matplotlib.pyplot as plt
# Token count of every tokenized example in each split.
train_len_list = [len(sample['input_ids']) for sample in tqdm(train_dataset)]
test_len_list = [len(sample['input_ids']) for sample in tqdm(test_dataset)]

# Overlay both length distributions on a single axis.
fig, ax = plt.subplots(figsize=(16, 5))
for lengths, split_name, colour in (
    (train_len_list, 'train', 'red'),
    (test_len_list, 'test', 'blue'),
):
    ax.hist(lengths, bins=50, alpha=0.5, label=split_name, color=colour)
ax.legend(loc='upper right')
plt.show()

## use LoRA finetune

In [None]:
# LoRA settings used for fine-tuning.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    # Modules that receive adapters.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # Adapter hyper-parameters.
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [None]:
# Wrap the loaded base model with the adapters described by `config`.
# NOTE(review): this rebinds `model`, so re-running the cell wraps an already
# wrapped model - re-run the model-loading cell first.
model = get_peft_model(model, config)

## define training args

In [None]:
args = TrainingArguments(
    # --- run mode ---------------------------------------------------------
    do_train=True,
    do_eval=False,
    # Evaluation during training is switched off; the author left these
    # related options disabled:
    # per_device_eval_batch_size=1,
    # eval_steps=1,
    # eval_on_start=True,
    # eval_strategy='steps',

    # --- checkpoints ------------------------------------------------------
    output_dir="./output/Qwen1.5",
    overwrite_output_dir=True,
    save_steps=1000,
    save_on_each_node=True,

    # --- optimisation -----------------------------------------------------
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=10,
    learning_rate=5e-5,
    warmup_steps=1000,
    gradient_checkpointing=True,

    # --- logging ----------------------------------------------------------
    report_to="tensorboard",
    logging_dir="./output/Qwen1.5/logger",
    logging_strategy='steps',
    logging_steps=10,
    disable_tqdm=True,

    # --- data loading -----------------------------------------------------
    dataloader_num_workers=4,
)

In [None]:
# Batch collator built from the shared tokenizer with padding enabled.
collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, padding=True)

# `eval_dataset` is supplied even though `args` sets do_eval=False.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    data_collator=collator,
)

## train

In [None]:
# Run fine-tuning with the Trainer built in the previous cell; checkpoints go
# to the `output_dir` set in `args`.
trainer.train()

## Evaluating

In [None]:
import json
import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForSeq2Seq, AutoTokenizer
import os
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from peft import PeftModel
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import numpy as np

In [None]:
def _checkpoint_step(path):
    """Return the integer step at the end of a '...checkpoint-<step>' path."""
    return int(path.rsplit('-', 1)[-1])


# Build paths with os.path.join instead of backslash literals: "output\Qwen1.5"
# contains the invalid escape sequence "\Q" and only resolves on Windows,
# whereas training wrote its checkpoints to "./output/Qwen1.5".
ckpts_dir = os.path.join("output", "Qwen1.5")

# Keep only "checkpoint-<number>" entries and order them by step number.
# A plain string sort would put checkpoint-10000 before checkpoint-2000, which
# makes ckpts[0] / ckpts[:-1] pick the wrong checkpoints.
ckpts = sorted(
    (
        os.path.join(ckpts_dir, f)
        for f in os.listdir(ckpts_dir)
        if 'checkpoint-' in f and f.rsplit('-', 1)[-1].isdigit()
    ),
    key=_checkpoint_step,
)

# Local directory of the base model the adapters are loaded on top of.
qwen_dir = os.path.join("Qwen", "Qwen2___5-0___5B-Instruct")

In [None]:
# LoRA settings handed to PeftModel.from_pretrained inside load_models;
# the values match the configuration used for training.
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    # Modules that carry adapters.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    # Adapter hyper-parameters.
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)

In [None]:
def load_models(qwen_dir, ckpt):
    """Load the tokenizer and the base model with a checkpoint's adapter applied.

    Args:
        qwen_dir: directory of the base model and its tokenizer.
        ckpt: adapter checkpoint passed to PeftModel.from_pretrained as
            `model_id`.

    Returns:
        (tokenizer, model) with the model switched to eval mode. Uses the
        module-level `config`.
    """
    tok = AutoTokenizer.from_pretrained(qwen_dir)
    peft_model = PeftModel.from_pretrained(
        AutoModelForCausalLM.from_pretrained(qwen_dir),
        model_id=ckpt,
        config=config,
    )
    peft_model.eval()
    return tok, peft_model

In [None]:
# Reload the held-out records (instruction / input / output columns).
# NOTE(review): a later cell rebinds `test_dataset` to the list returned by
# preprocess(), so this cell must be re-run before that one is run again.
test_dataset = pd.read_json('cooked-dataset/test.jsonl', lines=True)

In [None]:
def preprocess(tokenizer, test_dataset):
    """Turn every test record into tokenized chat-template model inputs.

    Args:
        tokenizer: object providing `apply_chat_template` and `__call__`.
        test_dataset: DataFrame with 'instruction', 'input' and 'output'
            columns.

    Returns:
        (processed_list, ground_truths): the tokenizer output for each row's
        rendered prompt, and the matching 'output' values, in row order.
    """
    processed_list, ground_truths = [], []
    for row_idx in tqdm(range(test_dataset.shape[0])):
        row = test_dataset.iloc[row_idx]
        chat = [
            {'role': 'system', 'content': row['instruction']},
            {'role': 'user', 'content': row['input']},
        ]
        # Render to text first, then tokenize as a batch of one.
        prompt_text = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        processed_list.append(tokenizer([prompt_text], return_tensors="pt"))
        ground_truths.append(row['output'])
    return processed_list, ground_truths

In [None]:
# Load the tokenizer (and model) for the first checkpoint, then tokenize the
# test prompts once so they can be reused for every checkpoint.
# NOTE(review): the second line rebinds `test_dataset` from a DataFrame to a
# list of tokenized inputs, so this cell fails if run twice without re-running
# the read_json cell first.
tokenizer, model =  load_models(qwen_dir, ckpts[0])
test_dataset, ground_truths = preprocess(tokenizer, test_dataset)

In [None]:
@torch.no_grad()
def test_accuracy(model, tokenizer, test_dataset, ground_truths, max_new_tokens=512):
    """Generate an answer for every test prompt and return exact-match accuracy.

    Args:
        model: causal LM exposing `generate`; inputs are moved to the device
            its parameters live on.
        tokenizer: tokenizer used to decode the generated ids.
        test_dataset: sequence of tokenized prompts as produced by
            `preprocess` (each with `input_ids` of batch size 1).
        ground_truths: expected answer string for each prompt, same order.
        max_new_tokens: generation budget per prompt. Defaults to 512, the
            value that used to be hard-coded.

    Returns:
        accuracy_score over all prompts; a prediction counts as correct only
        if the decoded, whitespace-stripped completion equals the ground truth.
    """
    # Follow the model instead of assuming 'cuda', so the function also works
    # on CPU or on a specific GPU index.
    device = next(model.parameters()).device

    predictions = []
    truths = []
    iterator = tqdm(range(len(test_dataset)))
    for idx in iterator:
        model_inputs = test_dataset[idx].to(device)
        prompt_ids = model_inputs["input_ids"]

        generated = model.generate(
            input_ids=prompt_ids,
            # Pass the mask explicitly; generate() otherwise has to guess it
            # from the pad token and warns about unreliable results.
            attention_mask=model_inputs.get("attention_mask"),
            max_new_tokens=max_new_tokens,
        )

        # generate() returns prompt + completion; keep only the new tokens.
        completions = [
            output_ids[len(input_ids):]
            for input_ids, output_ids in zip(prompt_ids, generated)
        ]

        # Strip surrounding whitespace so a stray space or newline does not
        # turn a correct "model_a"/"model_b" answer into a mismatch.
        response = tokenizer.batch_decode(completions, skip_special_tokens=True)[0].strip()

        predictions.append(response)
        truths.append(ground_truths[idx])

        # Show the running accuracy on the progress bar.
        iterator.set_postfix({
            'Accuracy': accuracy_score(truths, predictions)
        })
    return accuracy_score(truths, predictions)

In [None]:
def _accuracy_frame(steps, accuracies):
    """Group accuracies by checkpoint step into a DataFrame, one column per step."""
    grouped = {}
    for step, accuracy in zip(steps, accuracies):
        grouped.setdefault(step, []).append(accuracy)
    # Wrapping each list in a Series lets columns of different length coexist
    # (shorter ones are padded with NaN), which is the normal situation while
    # the current checkpoint has fewer runs than the finished ones.
    return pd.DataFrame({step: pd.Series(values) for step, values in grouped.items()})


def _save_scatter(steps, accuracies, path='./scatter_plot_5shot.png'):
    """Save a scatter plot of accuracy against checkpoint step to `path`."""
    plt.figure(figsize=(20, 5))
    plt.scatter(steps, accuracies, marker='o', s=50, alpha=0.6, edgecolors='w')
    plt.xlabel('Steps')
    plt.ylabel('Accuracy')
    plt.title('Accuracy over Steps')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


def _save_boxplot(frame, path='./box_plot_5shot.png'):
    """Save a per-checkpoint boxplot of the accuracies in `frame` to `path`."""
    plt.figure(figsize=(20, 6))
    frame.boxplot()
    plt.title('Boxplot of Metrics')
    plt.xlabel('Metrics')
    plt.grid(True)
    plt.tight_layout()
    plt.savefig(path)
    plt.close()


metric_list = []
ckpt_names = []
# Always a DataFrame, so the plotting cell below can rely on `df.boxplot()`.
df = pd.DataFrame()

# NOTE(review): the [:-1] slice skips the last checkpoint - confirm this is intended.
for ckpt in ckpts[:-1]:
    # Load each checkpoint once; the repeated runs below reuse the same weights.
    tokenizer, model = load_models(qwen_dir, ckpt)
    model.cuda()
    for i in range(5):  # repeat the evaluation 5 times per checkpoint
        acc = test_accuracy(model, tokenizer, test_dataset, ground_truths)
        metric_list.append(acc)
        ckpt_names.append(int(ckpt.split('-')[-1]))

        # Rebuild the summary frame after every run so `df` reflects all
        # results collected so far, even if plotting fails.
        df = _accuracy_frame(ckpt_names, metric_list)
        try:
            _save_scatter(ckpt_names, metric_list)
            _save_boxplot(df)
        except Exception as exc:
            # Plotting is best-effort and must not abort a long evaluation,
            # but the failure is reported instead of being silently dropped.
            plt.close('all')
            print(f"Plotting failed after {ckpt}: {exc!r}")

In [None]:
# Final per-checkpoint boxplot of the collected accuracies, saved and shown.
fig, ax = plt.subplots(figsize=(20, 6))
df.boxplot(ax=ax)
ax.set_title('Boxplot of Metrics')
ax.set_xlabel('Metrics')
ax.grid(True)
fig.tight_layout()
# Tilt the tick labels so long step numbers do not overlap.
plt.xticks(rotation=45)
fig.savefig('./box_plot_5shot.png')
plt.show()