In [1]:
!pip install torch 
!pip install peft
!pip install bitsandbytes
!pip install transformers
!pip install trl 
!pip install accelerate
!pip install einops



In [2]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    AutoTokenizer,
    TrainingArguments,
    pipeline,
)
from peft import LoraConfig, PeftModel, prepare_model_for_kbit_training
from trl import SFTTrainer
from langchain.document_loaders import HuggingFaceDatasetLoader

In [3]:
# Hub identifiers: the checkpoint to fine-tune and the name used for the result.
base_model = "microsoft/Phi-3-mini-128k-instruct"
new_model = "phi-3-Taiwan-election"

# Training corpus: only the "train" split is loaded.
dataset = load_dataset(
    "wenlianghuang/election_in_Taiwan",
    split="train",
)

# Fast tokenizer of the base checkpoint; padding is applied on the right and
# uses the unknown token as the pad token.
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)
tokenizer.padding_side = "right"
tokenizer.pad_token = tokenizer.unk_token

Downloading readme:   0%|          | 0.00/126 [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/9.28k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/113 [00:00<?, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
# 4-bit NF4 quantization settings: float16 compute dtype, double quantization
# switched off.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
)

# Load the base model with the quantization config above, placing every
# module on device 0 and requesting the FlashAttention-2 attention backend.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    device_map={"": 0},
    trust_remote_code=True,
    attn_implementation="flash_attention_2",
    quantization_config=bnb_config,
)

# Config flags set before training.
# NOTE(review): `pretraining_tp` looks like a carry-over from Llama-style
# recipes — confirm it has any effect for this architecture.
model.config.pretraining_tp = 1
model.config.use_cache = False

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [5]:
# LoRA adapter hyper-parameters for causal-LM fine-tuning. Adapters are
# attached to every linear layer ("all-linear") rather than a hand-picked list.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules="all-linear",
    r=64,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

In [6]:
# Hyper-parameters for the fine-tuning run, grouped by concern.
training_arguments = TrainingArguments(
    # Where run artifacts are written.
    output_dir="./results_0701",
    # Schedule length: one full epoch; max_steps=-1 leaves the epoch count in charge.
    num_train_epochs=1,
    max_steps=-1,
    # Batching.
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Precision / memory: no mixed-precision autocast, gradient checkpointing on.
    fp16=False,
    bf16=False,
    gradient_checkpointing=True,
    # Optimizer and learning-rate schedule.
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Logging / checkpoint cadence.
    # NOTE(review): save_steps=0 appears intended to skip periodic checkpoints — confirm.
    logging_steps=25,
    save_steps=0,
)

In [7]:
# Assemble the supervised fine-tuning trainer: the quantized base model plus
# the LoRA config, trained on the "text" column of the dataset. No explicit
# maximum sequence length is given.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_arguments,
    peft_config=peft_config,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
)

# Run the training loop; as the cell's last expression, the returned
# TrainOutput is displayed.
trainer.train()



Map:   0%|          | 0/113 [00:00<?, ? examples/s]

  0%|          | 0/29 [00:00<?, ?it/s]

The input hidden states seems to be silently casted in float32, this might be related to the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in torch.float16.


{'loss': 1.5172, 'grad_norm': 0.45467516779899597, 'learning_rate': 9.903113209758096e-06, 'epoch': 0.86}
{'train_runtime': 22.7913, 'train_samples_per_second': 4.958, 'train_steps_per_second': 1.272, 'train_loss': 1.4424784923421925, 'epoch': 1.0}


TrainOutput(global_step=29, training_loss=1.4424784923421925, metrics={'train_runtime': 22.7913, 'train_samples_per_second': 4.958, 'train_steps_per_second': 1.272, 'total_flos': 102470538516480.0, 'train_loss': 1.4424784923421925, 'epoch': 1.0})

In [8]:
# Persist the trained model held by the trainer into a local directory named
# after `new_model`; the merge step later reloads it from there.
trainer.model.save_pretrained(save_directory=new_model)



In [None]:


# Quick qualitative check: generate an answer for one question with the
# in-memory model and print the generated text (total length capped at 300 tokens).
prompt = "Who is the current president in Taiwan?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=300,
)
result = pipe(f"### Instruction: {prompt}")
print(result[0]["generated_text"])

In [9]:
# Reload the base model in float16 (no quantization), attach the LoRA adapter
# saved under `new_model`, and merge the adapter weights into the base weights.
# `cache_dir` is deliberately left at its default so the checkpoint already
# downloaded for training is reused; an empty-string cache_dir would resolve
# relative to the working directory and trigger a fresh multi-GB download there.
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    torch_dtype=torch.float16,
    trust_remote_code=True,
    device_map={"": 0},
)
model = PeftModel.from_pretrained(model, new_model)
# merge_and_unload() returns the plain base-model object with LoRA folded in.
model = model.merge_and_unload()

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [10]:
# Reload a fresh tokenizer so it can be published alongside the merged model.
# The pad token is set to the unknown token — the same choice made for the
# tokenizer used during training — so the published tokenizer matches the
# training setup instead of padding with the end-of-sequence token.
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
tokenizer.pad_token = tokenizer.unk_token
tokenizer.padding_side = "right"

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [11]:
# Upload the merged model to the Hub repository named by `new_model`, saving
# via the working directory rather than a temporary one.
model.push_to_hub(repo_id=new_model, use_temp_dir=False)

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/wenlianghuang/phi-3-Taiwan-election/commit/a38a79a983b3cb5bc82996f0793a01aea0343d1d', commit_message='Upload Phi3ForCausalLM', commit_description='', oid='a38a79a983b3cb5bc82996f0793a01aea0343d1d', pr_url=None, pr_revision=None, pr_num=None)

In [12]:
# Upload the tokenizer files to the same Hub repository as the model.
tokenizer.push_to_hub(repo_id=new_model, use_temp_dir=False)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/wenlianghuang/phi-3-Taiwan-election/commit/99a10dd9dd626901711ad5bc8ed3a89cda0ad575', commit_message='Upload tokenizer', commit_description='', oid='99a10dd9dd626901711ad5bc8ed3a89cda0ad575', pr_url=None, pr_revision=None, pr_num=None)