# Model training (with Qwen 72b)

## Model initialization

Importing libs

In [None]:
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
from datasets import Dataset
from torch.utils.data import DataLoader
import os

Create the model (4-bit quantization with LoRA fine-tuning)

In [2]:
def setup_model():
    """Load Qwen-72B in 4-bit precision and wrap it with LoRA adapters.

    Returns:
        tuple: ``(model, tokenizer)``. ``model`` is the PEFT-wrapped causal LM
        prepared for k-bit training; ``tokenizer`` is the matching tokenizer
        with a padding token configured when one could be determined.
    """
    # 4-bit NF4 weights with double quantization; compute runs in float16.
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )

    # loading model
    model_name = "Qwen/Qwen-72B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

    # The first-generation Qwen tokenizer ships without a pad token, so any
    # later call with padding=True fails. Reuse the end-of-document id (the
    # upstream fine-tuning recipe does the same), falling back to EOS.
    if tokenizer.pad_token_id is None:
        fallback_pad_id = getattr(tokenizer, "eod_id", None)
        if fallback_pad_id is None:
            fallback_pad_id = tokenizer.eos_token_id
        if fallback_pad_id is not None:
            tokenizer.pad_token_id = fallback_pad_id

    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        device_map="auto",
        trust_remote_code=True
    )

    # LoRA on the attention projections only.
    # Qwen-72B (first-generation architecture) has no q_proj/k_proj/v_proj/
    # o_proj modules: query, key and value are fused into `c_attn` and the
    # attention output projection is `attn.c_proj`. The dotted suffix keeps
    # the MLP's own `c_proj` out of the adapter set.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["c_attn", "attn.c_proj"],
        lora_dropout=0.1,
        bias="none",
        task_type="CAUSAL_LM"
    )

    # prepare_model_for_kbit_training must run before the adapters are
    # attached so that the wrapped base model is the prepared one.
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)

    return model, tokenizer

## Data Preprocessing

In [3]:
def prepare_dataset(data_path, tokenizer, max_length=512):
    """Load a translation CSV and tokenize it for causal-LM fine-tuning.

    Every row is turned into one sequence ``prompt + target + EOS``. The
    returned ``labels`` have the same length as ``input_ids``; prompt and
    padding positions are set to -100 so that only the target tokens
    contribute to the loss. All rows are padded to ``max_length`` so that
    they can be stacked into batches without a padding collator.

    Args:
        data_path: Path to a CSV file with the columns ``source_text`` and
            ``target_text`` (these are the keys read below).
        tokenizer: Tokenizer used to encode prompts and targets.
        max_length: Fixed length of every encoded row. Longer sequences are
            truncated on the right, so an over-long prompt can leave a row
            with no supervised tokens.

    Returns:
        The mapped dataset with ``input_ids``, ``attention_mask`` and
        ``labels`` columns; the original columns are removed.
    """
    # Label value ignored by the cross-entropy loss.
    ignore_index = -100

    # Prefer the tokenizer's real end-of-sequence id. Qwen's tokenizer exposes
    # it as `eod_id`; the literal "</s>" is only used when no id is available.
    eos_id = tokenizer.eos_token_id
    if eos_id is None:
        eos_id = getattr(tokenizer, "eod_id", None)

    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = eos_id if eos_id is not None else 0

    def process_translation_data(examples):
        """Encode one batch of rows into fixed-length training features."""
        features = {"input_ids": [], "attention_mask": [], "labels": []}

        for src, tgt in zip(examples["source_text"], examples["target_text"]):
            # prompt template
            prompt = f"将以下英文翻译成中文：\n{src}\n中文翻译："
            prompt_ids = list(tokenizer(prompt)["input_ids"])

            # output, terminated so the model learns where to stop
            if eos_id is not None:
                target_ids = list(
                    tokenizer(f"{tgt}", add_special_tokens=False)["input_ids"]
                )
                target_ids.append(eos_id)
            else:
                target_ids = list(
                    tokenizer(f"{tgt}</s>", add_special_tokens=False)["input_ids"]
                )

            token_ids = (prompt_ids + target_ids)[:max_length]
            label_ids = ([ignore_index] * len(prompt_ids) + target_ids)[:max_length]

            pad_count = max_length - len(token_ids)
            features["input_ids"].append(token_ids + [pad_id] * pad_count)
            features["attention_mask"].append([1] * len(token_ids) + [0] * pad_count)
            features["labels"].append(label_ids + [ignore_index] * pad_count)

        return features

    # load csv data (columns: source_text, target_text)
    dataset = Dataset.from_csv(data_path)
    processed_dataset = dataset.map(
        process_translation_data,
        batched=True,
        remove_columns=dataset.column_names
    )

    return processed_dataset

## Configuring Training Process

Evaluation metrics (BLEU and SacreBLEU). These score the model during evaluation; they are not the training loss.

In [None]:
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
import numpy as np
import evaluate

def compute_metrics(eval_preds, tokenizer):
    """Score evaluation output with BLEU and SacreBLEU.

    Args:
        eval_preds: Pair ``(predictions, labels)``. ``predictions`` may be
            token ids or raw logits (3-D, vocabulary on the last axis),
            optionally wrapped in a tuple; ``labels`` are token ids in which
            negative values mark ignored positions.
        tokenizer: Tokenizer used to decode ids back to text.

    Returns:
        dict: ``{"bleu_score": float, "sacrebleu_score": float}``.
    """
    predictions, labels = eval_preds
    if isinstance(predictions, tuple):
        # Some models return extra outputs; the first entry is the logits/ids.
        predictions = predictions[0]
    predictions = np.asarray(predictions)
    labels = np.asarray(labels)

    # Id substituted for ignored positions so that decoding does not fail on
    # negative ids; with skip_special_tokens=True it is dropped from the text
    # when it is a special token.
    pad_id = tokenizer.pad_token_id
    if pad_id is None:
        pad_id = getattr(tokenizer, "eod_id", None)
    if pad_id is None:
        pad_id = tokenizer.eos_token_id
    if pad_id is None:
        pad_id = 0

    if predictions.ndim == 3:
        # Logits -> greedy token ids.
        predictions = predictions.argmax(axis=-1)
        if predictions.shape == labels.shape:
            # In a causal LM the logits at position i predict token i + 1, so
            # align the two and keep predictions only where a label exists.
            predictions = predictions[:, :-1]
            labels = labels[:, 1:]
            predictions = np.where(labels < 0, pad_id, predictions)

    predictions = np.where(predictions < 0, pad_id, predictions)
    labels = np.where(labels < 0, pad_id, labels)

    # decode preds and labels
    predictions = [
        text.strip()
        for text in tokenizer.batch_decode(predictions, skip_special_tokens=True)
    ]
    labels = [
        text.strip()
        for text in tokenizer.batch_decode(labels, skip_special_tokens=True)
    ]
    references = [[label] for label in labels]

    def char_tokens(text):
        """Split text into non-whitespace characters."""
        return [ch for ch in text if not ch.isspace()]

    bleu = evaluate.load("bleu")
    sacrebleu = evaluate.load("sacrebleu")

    # The targets are Chinese, which the default BLEU tokenizers do not
    # segment (a whole clause would count as one token). Use character-level
    # tokens for BLEU and SacreBLEU's built-in "zh" tokenizer.
    if any(char_tokens(text) for text in predictions):
        bleu_value = bleu.compute(
            predictions=predictions,
            references=references,
            tokenizer=char_tokens,
        )["bleu"]
    else:
        # Nothing was generated; BLEU is zero by definition and the metric
        # implementation would divide by the (zero) prediction length.
        bleu_value = 0.0
    sacrebleu_score = sacrebleu.compute(
        predictions=predictions,
        references=references,
        tokenize="zh",
    )

    return {
        "bleu_score": bleu_value,
        "sacrebleu_score": sacrebleu_score["score"]
    }

Callback: save the best model and stop early when BLEU stops improving

In [5]:
from transformers import TrainerCallback, TrainingArguments, TrainerState, TrainerControl

class CustomCallback(TrainerCallback):
    """Track the best evaluation BLEU, save that model, and stop early.

    After each evaluation the ``eval_bleu_score`` metric is compared with the
    best value seen so far. An improvement larger than
    ``early_stopping_threshold`` saves the model under
    ``<output_dir>/best_model``; otherwise a counter is incremented, and once
    it reaches ``early_stopping_patience`` training is asked to stop.
    """

    def __init__(self, early_stopping_patience=3, early_stopping_threshold=0.01):
        # Running state: best score so far and evaluations without progress.
        self.best_bleu = 0
        self.no_improve_count = 0
        # Configuration.
        self.early_stopping_patience = early_stopping_patience
        self.early_stopping_threshold = early_stopping_threshold

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        """Update the best score and decide whether training should stop."""
        # A missing metric is treated as a score of 0.
        current_bleu = metrics.get("eval_bleu_score", 0)
        score_to_beat = self.best_bleu + self.early_stopping_threshold

        if not current_bleu > score_to_beat:
            self.no_improve_count += 1
        else:
            # New best: remember it, reset the counter, persist the model.
            self.best_bleu = current_bleu
            self.no_improve_count = 0
            best_model = kwargs['model']
            best_model.save_pretrained(f"{args.output_dir}/best_model")

        patience_exhausted = self.no_improve_count >= self.early_stopping_patience
        if patience_exhausted:
            print(f"\nEarly stopping triggered! No improvement for {self.early_stopping_patience} evaluations")
            control.should_training_stop = True

        print(f"\nCurrent BLEU: {current_bleu:.4f}, Best BLEU: {self.best_bleu:.4f}")

Define the training entry point

In [6]:
def train():
    """Fine-tune the model on ``train_data.csv`` and save it to ``./final_model``.

    Checkpoints are written to ``./translation_model``. If that directory
    already holds a checkpoint, training resumes from the most recent one;
    otherwise it starts from scratch.
    """
    from transformers.trainer_utils import get_last_checkpoint

    # init model
    model, tokenizer = setup_model()

    # train / valid data
    train_dataset = prepare_dataset("train_data.csv", tokenizer)
    valid_dataset = prepare_dataset("eval_data.csv", tokenizer)

    # training config
    training_args = TrainingArguments(
        output_dir="./translation_model",
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=5,
        load_best_model_at_end=True,
        metric_for_best_model="bleu_score",
        greater_is_better=True,
        warmup_steps=100,
        weight_decay=0.01,
    )

    callbacks = [CustomCallback(early_stopping_patience=3)]

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        # Trainer calls the hook with the predictions object only, while
        # compute_metrics also needs the tokenizer, so bind it here.
        compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer),
        callbacks=callbacks,
    )

    # resume_from_checkpoint=True raises when no checkpoint exists, which is
    # always the case on the first run. Look one up and pass None otherwise.
    last_checkpoint = None
    if os.path.isdir(training_args.output_dir):
        last_checkpoint = get_last_checkpoint(training_args.output_dir)

    # start training
    trainer.train(resume_from_checkpoint=last_checkpoint)

    # saving model
    trainer.save_model("./final_model")

Resume training from a checkpoint

In [7]:
def resume_training(checkpoint_dir):
    """Continue an interrupted training run from ``checkpoint_dir``.

    The model, datasets and hyper-parameters are rebuilt exactly as in
    ``train()``; optimizer, scheduler and step state are restored from the
    checkpoint by ``Trainer.train``.

    Args:
        checkpoint_dir: Path of the checkpoint directory to resume from.

    Note:
        Unlike ``train()``, this function does not export the result to
        ``./final_model``.
    """
    # load model
    model, tokenizer = setup_model()

    # train / valid data
    train_dataset = prepare_dataset("train_data.csv", tokenizer)
    valid_dataset = prepare_dataset("eval_data.csv", tokenizer)

    # Same configuration as the original run. The checkpoint path is recorded
    # in the arguments and also passed to trainer.train() below.
    training_args = TrainingArguments(
        output_dir="./translation_model",
        resume_from_checkpoint=checkpoint_dir,
        num_train_epochs=3,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        evaluation_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        save_total_limit=5,
        load_best_model_at_end=True,
        metric_for_best_model="bleu_score",
        greater_is_better=True,
        warmup_steps=100,
        weight_decay=0.01,
    )

    trainer = transformers.Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        # Trainer calls the hook with the predictions object only, while
        # compute_metrics also needs the tokenizer, so bind it here.
        compute_metrics=lambda eval_preds: compute_metrics(eval_preds, tokenizer),
        callbacks=[CustomCallback(early_stopping_patience=3)]
    )

    # continue training
    trainer.train(resume_from_checkpoint=checkpoint_dir)

## Start Training Process

In [None]:
# Entry point: run train() when this file is executed as the main module
# (not when it is imported).
if __name__ == "__main__":
    train()

## Loading the model and running inference

In [None]:
def translate(text, model_path):
    """Translate English ``text`` into Chinese with the fine-tuned model.

    Args:
        text: English source text.
        model_path: Directory holding the fine-tuned weights, either a PEFT
            adapter directory or a directory with a full ``pytorch_model.bin``.

    Returns:
        str: The generated translation, stripped of surrounding whitespace.
    """
    # loading model
    model, tokenizer = setup_model()

    # A PEFT-wrapped model is saved as adapter files, not pytorch_model.bin.
    # Keep the full-state-dict path for directories that do contain one and
    # load the adapter weights into the existing "default" adapter otherwise.
    full_state_path = f"{model_path}/pytorch_model.bin"
    if os.path.isfile(full_state_path):
        model.load_state_dict(torch.load(full_state_path))
    else:
        model.load_adapter(model_path, adapter_name="default")

    # setup_model() configures the model for training; switch to inference
    # mode so LoRA dropout does not perturb generation.
    model.eval()

    # input
    prompt = f"将以下英文翻译成中文：\n{text}\n中文翻译："
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # inference
    # NOTE(review): temperature/top_p only take effect when sampling is
    # enabled (do_sample=True here or in the model's generation config) -
    # confirm whether sampling is intended.
    outputs = model.generate(
        **inputs,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        repetition_penalty=1.1
    )

    # Decode only the newly generated tokens, so the result does not depend
    # on the prompt surviving a decode round-trip unchanged.
    prompt_length = inputs["input_ids"].shape[1]
    translation = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    # If the model repeats the prompt marker, keep the text after the last one.
    return translation.split("中文翻译：")[-1].strip()

# Demo: translate one regulatory paragraph with the fine-tuned model and
# score the result against a reference translation.
test_text = """Two or six months prior to the expiration of 
the patent and exclusivity protection, as appropriate, 
submit an amendment to this application identifying changes, 
if any, in the conditions under which your product was 
tentatively approved. Any changes to the conditions outlined 
in this NDA require our review before final approval and the 
goal date for our review will be set accordingly. 
Your amendment should include updated labeling, chemistry, 
manufacturing and controls data, and a safety update. 
This amendment should include draft final printed labels and 
labeling which comply with all U.S. regulations (uniqueness of 
drug product appearance per 21 CFR 206; child-resistant 
packaging per 16 CFR 1700, etc.). """

translation = translate(test_text, "./final_model")
print(f"英文: {test_text}")
print(f"中文翻译: {translation}")

reference = """在专利和独占保护到期前两个月或六个月（视情况而定）
提交对本申请的修订，说明您的产品暂时批准的条件（如果有）的变化。
对本 NDA 中概述的条件的任何变化都需要我们在最终批准之前进行审查，
我们将据此确定审查的目标日期。您的修订应包括更新的标签、化学、
制造和控制数据以及安全更新。此修订应包括符合所有美国法规的最终
印刷标签和标签草案（根据 21 CFR 206 规定药品外观的独特性；
根据 16 CFR 1700 规定儿童安全包装等）。"""
bleu = evaluate.load("bleu")
# Chinese has no spaces between words, so the default BLEU tokenizer would
# treat whole clauses as single tokens and report a score near zero.
# Score on non-whitespace characters instead.
score = bleu.compute(
    predictions=[translation],
    references=[[reference]],
    tokenizer=lambda text: [ch for ch in text if not ch.isspace()],
)
print(f"BLEU分数: {score['bleu']:.4f}")