# Initialize

In [None]:
# @title Install Dependency
# Setup cell: every `!pip` line shells out and installs packages into the current runtime.
# NOTE(review): none of the packages below are version-pinned, so results may change
# between runs — consider pinning once a working combination is known.

# xformers is fetched from the PyTorch wheel index for CUDA 12.1 (cu121).
!pip install -U xformers --index-url https://download.pytorch.org/whl/cu121 -q
# --no-deps installs exactly these packages without pulling in their dependencies.
!pip install --no-deps packaging ninja einops flash-attn trl peft accelerate bitsandbytes -q
# Unsloth is installed straight from its GitHub repository with the `colab-new` extra.
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git" -q
# fastparquet is the engine passed to pd.read_parquet later in this notebook.
!pip install fastparquet -q
!pip install triton -q

In [None]:
# @title Mount Google Drive
# Mounts Google Drive at /content/drive; the task, dataset and cache paths
# configured in this notebook all live under /content/drive/MyDrive.

from google.colab import drive
drive.mount('/content/drive')

In [None]:
# @title Import Python Packages
import torch
import os
import json
import pandas as pd
from tqdm.notebook import tqdm
import sys



from datasets import Dataset, DatasetDict
from datasets import load_dataset
from huggingface_hub import notebook_login
from transformers import TrainingArguments, AutoModelForCausalLM, AutoTokenizer
from trl import SFTTrainer
from unsloth import FastLanguageModel
import glob

# Locations on the mounted Google Drive that the rest of the notebook reads and writes.
task_dir = "/content/drive/MyDrive/llmserver/task_iclr23/"
dataset_dir = "/content/drive/MyDrive/llmserver/task_iclr23/dataset/"
finetune_cache_dir = "/content/drive/MyDrive/llmserver/cache_finetune/"
# Hugging Face account and dataset repository name (placeholders to be filled in).
huggingface_user = "YOUR_NAME"
dataset_name = "YOUR_DATASET_NAME"

# Load every `<name>.txt` file of the prompt folder into prompts_dict[<name>].
prompt_dir = task_dir + "prompt/"
prompts_dict = {}
for entry in os.listdir(prompt_dir):
    if not entry.endswith('.txt'):
        continue  # anything that is not a .txt prompt file is ignored
    with open(prompt_dir + entry, 'r', encoding='utf-8') as handle:
        # Key is the file name with the 4-character ".txt" suffix removed.
        prompts_dict[entry[:-4]] = handle.read()



In [None]:
# @title Login to Huggingface
# Authenticates this session with the Hugging Face Hub; the later push_to_hub
# call notes that a token with the `write` role is required.
notebook_login()

# Build Dataset for Finetuning

In [None]:
# @title Build Standard Dataset from Reviews
# Builds an instruction/input/output table from paper abstracts and the three human
# review summaries per paper, then writes it to dataset_finetune_raw.parquet.
# The four parquet files are combined by row position: row i of each review file is
# used together with row i of the paper file.

df_paper = pd.read_parquet(f"{dataset_dir}dataset_paper.parquet",engine='fastparquet')
df_r1 = pd.read_parquet(f"{dataset_dir}dataset_human_review1.parquet",engine='fastparquet')
df_r2 = pd.read_parquet(f"{dataset_dir}dataset_human_review2.parquet",engine='fastparquet')
df_r3 = pd.read_parquet(f"{dataset_dir}dataset_human_review3.parquet",engine='fastparquet')

print(len(df_paper),len(df_r1),len(df_r2),len(df_r3))

# Half-open range [rang_l, rang_r) of paper row positions to turn into examples.
rang_l = 500 # param {type:"integer"}
rang_r = 1500 # param {type:"integer"}

# Placeholder text used when the abstract or the source review is withheld.
NOT_AVAILABLE = "Not available"
system_prompt = prompts_dict["gen_review_system"]
user_template = prompts_dict["gen_review_user"]

# Rows are collected in a plain list and turned into a DataFrame once at the end;
# appending to a DataFrame row by row re-allocates on every insert.
finetune_records = []
for i in tqdm(range(rang_l, rang_r), desc="Processing papers"):
    row_paper = df_paper.iloc[i]
    # Look each summary up once per paper instead of once per generated example.
    summary_1 = df_r1.iloc[i]["review_summary"]
    summary_2 = df_r2.iloc[i]["review_summary"]
    summary_3 = df_r3.iloc[i]["review_summary"]

    # (source review shown in the prompt, target review to generate):
    # all six ordered pairs of distinct reviews, then each review with no source.
    source_target_pairs = [
        (summary_1, summary_2),
        (summary_2, summary_3),
        (summary_3, summary_1),
        (summary_1, summary_3),
        (summary_2, summary_1),
        (summary_3, summary_2),
        (NOT_AVAILABLE, summary_1),
        (NOT_AVAILABLE, summary_2),
        (NOT_AVAILABLE, summary_3),
    ]

    # Every pair is emitted twice: once with the abstract, once without it.
    for synopsis in [row_paper["abstract"], NOT_AVAILABLE]:
        for source, target in source_target_pairs:
            finetune_records.append({
                "instruction": system_prompt,
                "input": user_template.format(synopsis, source),
                "output": target,
            })

# `columns=` keeps the three expected columns even when the range is empty.
df_finetune_raw = pd.DataFrame(finetune_records, columns=["instruction", "input", "output"])

print(len(df_finetune_raw))
df_finetune_raw.to_parquet(f"{dataset_dir}dataset_finetune_raw.parquet", index=False)


In [None]:
# @title Convert Standard Dataset to Llama3 Finetuning Format

class Llama3InstructDataset:
    """Render instruction/input/output rows as Llama 3 instruct chat prompts.

    ``data`` must offer ``iterrows()`` yielding ``(index, row)`` pairs (for example a
    pandas DataFrame). Each row must support ``row["instruction"]`` and
    ``row["output"]``; ``row["input"]`` is optional and defaults to an empty string.
    All prompts are rendered when the object is constructed.
    """

    def __init__(self, data):
        """Store ``data`` and immediately render one prompt per row."""
        self.data = data
        self.prompts = []
        self.create_prompts()

    def create_prompt(self, row):
        """Return the full chat prompt (system, user and assistant turns) for one row.

        The row is only read: a missing ``input`` field is treated as an empty
        string without writing that default back into the caller's object.
        """
        user_input = row["input"] if "input" in row else ""
        # The template lines sit at column 0 on purpose: any indentation here would
        # become part of the prompt text.
        prompt = f'''<|begin_of_text|><|start_header_id|>system<|end_header_id|>

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

{row["instruction"]}<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_input}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{row["output"]}<|eot_id|>'''
        return prompt

    def create_prompts(self):
        """Rebuild ``self.prompts`` from ``self.data``.

        The list is replaced rather than appended to, so calling this method
        again does not duplicate prompts.
        """
        self.prompts = [self.create_prompt(row) for _, row in self.data.iterrows()]

    def get_dataset(self):
        """Return the rendered prompts as a DataFrame with a single ``prompt`` column."""
        df = pd.DataFrame({'prompt': self.prompts})
        return df

def create_dataset_hf(dataset):
    """Wrap a pandas DataFrame in a DatasetDict holding a single "train" split.

    The index is reset on a copy before conversion, so the DataFrame passed in
    by the caller keeps its own index.
    """
    train_frame = dataset.reset_index(drop=True)
    return DatasetDict({"train": Dataset.from_pandas(train_frame)})

# Driver for the conversion: in a notebook cell __name__ is "__main__", so this
# block runs every time the cell is executed.
if __name__ == "__main__":
    # Reload the raw instruction/input/output table written by the previous cell.
    df_finetune_raw = pd.read_parquet(f"{dataset_dir}dataset_finetune_raw.parquet",engine='fastparquet')
    # Render every row into a single Llama 3 chat prompt string.
    dataset = Llama3InstructDataset(df_finetune_raw)
    df_finetune_llama3 = dataset.get_dataset()

    # Wrap the prompts in a DatasetDict, keep a copy in the finetune cache folder on
    # Drive, and publish it to the Hub repo `<huggingface_user>/<dataset_name>`.
    llama3_dataset = create_dataset_hf(df_finetune_llama3)
    llama3_dataset.save_to_disk(os.path.join(finetune_cache_dir, "llama3_finetune_data"))
    llama3_dataset.push_to_hub(f"{huggingface_user}/{dataset_name}") # This requires the huggingface token with the `write` role

# Finetune Model

In [None]:
# @title LoRA Finetuning Configurations
# Single source of settings for the base model, the LoRA adapter, the training
# dataset and the training run; the cells below read their arguments from here.
config = {
    "hugging_face_username":huggingface_user,
    "model_config": {
        "base_model":"unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit", # The base model. NOTE: the name indicates a bnb 4-bit checkpoint; consider whether a 4-bit base is what you want
        "finetuned_model":"Meta-Llama-3.1-8B-bnb-4bit-finetune-peer-review-judge", # Name under which the finetuned model is saved and pushed to HF
        "max_seq_length": 4096, # The maximum sequence length
        "dtype":torch.bfloat16, # The data type
        "load_in_4bit": True, # Load the model in 4-bit
    },
    "lora_config": {
      "r": 16, # The LoRA rank (common choices: 8, 16, 32, 64)
      "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj"], # The modules that receive LoRA adapters
      "lora_alpha":16, # The alpha (scaling) value for LoRA
      "lora_dropout":0, # The dropout value for LoRA
      "bias":"none", # The bias setting for LoRA
      "use_gradient_checkpointing":True, # Use gradient checkpointing
      "use_rslora":False, # Use rank-stabilized LoRA (rsLoRA)
      "use_dora":False, # Use DoRA
      "loftq_config":None # The LoftQ configuration
    },
    "training_dataset":{
        "name":f"{huggingface_user}/{dataset_name}", # The dataset repo on the Hugging Face Hub (user/dataset)
        "split":"train", # The dataset split
        "input_field":"prompt", # The column that holds the full prompt text
    },
    "training_config": {
        "per_device_train_batch_size": 2, # The batch size per device
        "gradient_accumulation_steps": 4, # The gradient accumulation steps
        "warmup_steps": 5, # The warmup steps
        "max_steps":0, # The maximum steps (0 if the epochs are defined)
        "num_train_epochs": 3, # The number of training epochs (0 if the maximum steps are defined)
        "learning_rate": 2e-4, # The learning rate
        "fp16": not torch.cuda.is_bf16_supported(),  # Use fp16 only when the GPU lacks bf16 support
        "bf16": torch.cuda.is_bf16_supported(), # Use bf16 when the GPU supports it
        "logging_steps": 1, # Log every N steps
        "optim" :"adamw_8bit", # The optimizer
        "weight_decay" : 0.01,  # The weight decay
        "lr_scheduler_type": "linear", # The learning rate scheduler
        "seed" : 42, # The random seed
        "output_dir" : finetune_cache_dir, # The output directory (checkpoints are written here)
    }
}

In [None]:
# @title Load Model, QLoRA and Trainer Model
# Loads the base model and tokenizer, attaches the LoRA adapter, downloads the
# training split and builds the SFTTrainer — all driven by `config`.

# Pull each config section out once instead of repeating chained lookups.
model_cfg = config["model_config"]
lora_cfg = config["lora_config"]
dataset_cfg = config["training_dataset"]
training_cfg = config["training_config"]

# One seed for both LoRA initialisation and the trainer, taken from the config
# (42 is only the fallback when the key is absent).
training_seed = training_cfg.get("seed", 42)

# Loading the model and the tokenizer for the model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = model_cfg.get("base_model"),
    max_seq_length = model_cfg.get("max_seq_length"),
    dtype = model_cfg.get("dtype"),
    load_in_4bit = model_cfg.get("load_in_4bit"),
)

# Setup for QLoRA/LoRA peft of the base model
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_cfg.get("r"),
    target_modules = lora_cfg.get("target_modules"),
    lora_alpha = lora_cfg.get("lora_alpha"),
    lora_dropout = lora_cfg.get("lora_dropout"),
    bias = lora_cfg.get("bias"),
    use_gradient_checkpointing = lora_cfg.get("use_gradient_checkpointing"),
    random_state = training_seed,
    use_rslora = lora_cfg.get("use_rslora"),
    use_dora = lora_cfg.get("use_dora"),
    loftq_config = lora_cfg.get("loftq_config"),
)

# Loading the training dataset
dataset_train = load_dataset(dataset_cfg.get("name"), split = dataset_cfg.get("split"))

# Setting up the trainer for the model
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset_train,
    dataset_text_field = dataset_cfg.get("input_field"),
    max_seq_length = model_cfg.get("max_seq_length"),
    dataset_num_proc = 2,
    packing = False,
    args = TrainingArguments(
        per_device_train_batch_size = training_cfg.get("per_device_train_batch_size"),
        gradient_accumulation_steps = training_cfg.get("gradient_accumulation_steps"),
        warmup_steps = training_cfg.get("warmup_steps"),
        max_steps = training_cfg.get("max_steps"),
        num_train_epochs = training_cfg.get("num_train_epochs"),
        learning_rate = training_cfg.get("learning_rate"),
        fp16 = training_cfg.get("fp16"),
        bf16 = training_cfg.get("bf16"),
        logging_steps = training_cfg.get("logging_steps"),
        optim = training_cfg.get("optim"),
        weight_decay = training_cfg.get("weight_decay"),
        lr_scheduler_type = training_cfg.get("lr_scheduler_type"),
        seed = training_seed,
        output_dir = training_cfg.get("output_dir"),
        # Checkpoint cadence and retention; the defaults apply when the config omits them.
        save_steps = training_cfg.get("save_steps", 25),
        save_total_limit = training_cfg.get("save_total_limit", 3),
        # NOTE(review): the checkpoint actually resumed from is the one passed to
        # trainer.train() in the training cell; confirm this flag is still needed here.
        resume_from_checkpoint = True,
    ),
)

In [None]:
# @title Train the Model
# Resumes from the most advanced checkpoint in the output directory when one
# exists, otherwise trains from scratch. The result is kept in `trainer_stats`.

def _checkpoint_step(path):
    """Return the step number of a `checkpoint-<step>` path, or -1 if it has none."""
    suffix = os.path.basename(os.path.normpath(path)).rsplit("-", 1)[-1]
    return int(suffix) if suffix.isdecimal() else -1

output_dir = config.get("training_config").get("output_dir")
# Keep only real checkpoint directories named `checkpoint-<step>`; stray files that
# merely match the glob are ignored.
checkpoint_files = [
    path
    for path in glob.glob(os.path.join(output_dir, 'checkpoint-*'))
    if os.path.isdir(path) and _checkpoint_step(path) >= 0
]

if checkpoint_files:
    print(f"Found {len(checkpoint_files)} checkpoint(s):")
    # Pick the highest step number. File timestamps are not a reliable ordering
    # (ctime changes on any metadata update, and the folder lives on a mounted drive).
    last_checkpoint = max(checkpoint_files, key=_checkpoint_step)
    print(f"Resuming training from checkpoint: {last_checkpoint}")
    trainer_stats = trainer.train(resume_from_checkpoint=last_checkpoint)
else:
    last_checkpoint = None
    print("No checkpoints found. Starting training from scratch.")
    trainer_stats = trainer.train()

In [None]:
# @title Save Trainer Stats
# Writes the value returned by trainer.train() as indented JSON into the finetune
# cache folder (the path is built by plain string concatenation, so
# finetune_cache_dir must end with a path separator).
# NOTE(review): if that value is a named tuple, json.dump would write it as a JSON
# array without field names — confirm the file layout is what readers expect.
with open(finetune_cache_dir + "trainer_stats.json", "w") as f:
    json.dump(trainer_stats, f, indent=4)

In [None]:
# @title Save Finetuned Model and Push to HF
# Exports the finetuned model together with its tokenizer as GGUF using the
# "q4_k_m" quantization method, then pushes the same export to the Hugging Face Hub.
# Both calls use the `finetuned_model` name from the config.
# NOTE(review): presumably the first call writes a local folder with that name (the
# next cell copies a folder of the same name) — confirm.
model.save_pretrained_gguf(config.get("model_config").get("finetuned_model"), tokenizer, quantization_method = "q4_k_m")
model.push_to_hub_gguf(config.get("model_config").get("finetuned_model"), tokenizer, quantization_method = "q4_k_m")

In [None]:
# @title Save Model to Local
# Copies the exported model folder from the runtime's working directory into the
# finetune cache folder on Drive, replacing any copy left by an earlier run.
import shutil
import os

# Use the configured model name rather than repeating the literal, so renaming the
# model in the config cannot leave this cell pointing at a stale folder.
source_folder = config.get("model_config").get("finetuned_model")
destination_folder = os.path.join(finetune_cache_dir, os.path.basename(source_folder))

# Only remove an existing copy: on the first run there is nothing to delete and an
# unconditional rmtree would raise FileNotFoundError before anything is copied.
if os.path.isdir(destination_folder):
    shutil.rmtree(destination_folder)
shutil.copytree(source_folder, destination_folder)