In [1]:
import os
import sys
import json
import pandas as pd
import torch

from functools import partial

from datasets import Dataset
from peft import (
    TaskType,
    LoraConfig,
    get_peft_model
)
from transformers import (
    Trainer,
    TrainingArguments,
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForSeq2Seq
)
from huggingface_hub import snapshot_download

sys.path.append("/home/jovyan/Workspace/all_about_llms/")

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from utils.llm_utils import QWenUtils

In [3]:
QWenUtils.load_model_and_tokenizer("phi-3.5-mini-instruct")

Load Model From  /home/jovyan/Workspace/all_about_llms/utils/../llm_models/phi-3.5-mini-instruct/


Fetching 19 files: 100%|██████████| 19/19 [01:52<00:00,  5.92s/it]
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.
Loading checkpoint shards: 100%|██████████| 2/2 [00:17<00:00,  8.79s/it]


(Phi3ForCausalLM(
   (model): Phi3Model(
     (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
     (embed_dropout): Dropout(p=0.0, inplace=False)
     (layers): ModuleList(
       (0-31): 32 x Phi3DecoderLayer(
         (self_attn): Phi3Attention(
           (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
           (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
           (rotary_emb): Phi3LongRoPEScaledRotaryEmbedding()
         )
         (mlp): Phi3MLP(
           (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
           (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
           (activation_fn): SiLU()
         )
         (input_layernorm): Phi3RMSNorm()
         (resid_attn_dropout): Dropout(p=0.0, inplace=False)
         (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
         (post_attention_layernorm): Phi3RMSNorm()
       )
     )
     (norm): Phi3RMSNorm()
   )
   (lm_head): Lin

In [None]:
messages = QWenUtils.CMeEE_dataset_json_transfer("../../datasets/CMeEE-V2/CMeEE-V2_dev.json")

In [None]:
tokenizer = AutoTokenizer.from_pretrained(
    "../../llm_models/Qwen2-7B-Instruct/",
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    "../../llm_models/Qwen2-7B-Instruct/",
    trust_remote_code=True,
    torch_dtype=torch.bfloat16,
    device_map="cuda"
)
model.enable_input_require_grads()

Loading checkpoint shards: 100%|██████████| 4/4 [00:03<00:00,  1.10it/s]


In [None]:
train_df = pd.DataFrame(data=messages)
train_ds = Dataset.from_pandas(train_df)
train_dataset = train_ds.map(
    partial(QWenUtils.process_func, tokenizer=tokenizer),
    remove_columns=train_ds.column_names,
)

Map: 100%|██████████| 5000/5000 [00:01<00:00, 2568.28 examples/s]


In [None]:
config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    target_modules=[
        "q_proj", "k_proj", "v_proj", 
        "o_proj", "gate_proj", 
        "up_proj", "down_proj"],
    inference_mode=False,
    r=16, lora_alpha=32, lora_dropout=0.1
)
config

LoraConfig(peft_type=<PeftType.LORA: 'LORA'>, auto_mapping=None, base_model_name_or_path=None, revision=None, task_type=<TaskType.CAUSAL_LM: 'CAUSAL_LM'>, inference_mode=False, r=16, target_modules={'k_proj', 'down_proj', 'up_proj', 'v_proj', 'gate_proj', 'o_proj', 'q_proj'}, lora_alpha=32, lora_dropout=0.1, fan_in_fan_out=False, bias='none', use_rslora=False, modules_to_save=None, init_lora_weights=True, layers_to_transform=None, layers_pattern=None, rank_pattern={}, alpha_pattern={}, megatron_config=None, megatron_core='megatron.core', loftq_config={}, use_dora=False, layer_replication=None, runtime_config=LoraRuntimeConfig(ephemeral_gpu_offload=False))

In [None]:
model = get_peft_model(model, config)

In [None]:
args = TrainingArguments(
    output_dir="../../finetune_outputs/qwen2",
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    logging_steps=10,
    num_train_epochs=2,
    save_steps=1000,
    learning_rate=1e-4,
    gradient_checkpointing=True,
    report_to="none"
)

In [None]:
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    data_collator=DataCollatorForSeq2Seq(tokenizer, padding=True)
)

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
10,1.058
20,0.465
30,0.3553
40,0.3024
50,0.2733
60,0.268
70,0.2269
80,0.2383
90,0.2103


TrainOutput(global_step=626, training_loss=0.161516233850211, metrics={'train_runtime': 1147.3419, 'train_samples_per_second': 8.716, 'train_steps_per_second': 0.546, 'total_flos': 1.0798407041716224e+17, 'train_loss': 0.161516233850211, 'epoch': 2.0})