In [1]:
# Hugging Face organisation / repository name of the base model.
BASE_MODEL_DIRECTORY = "meta-llama"
BASE_MODEL_NAME = "Meta-Llama-3-8B-Instruct"
BASE_MODEL_IDENTIFIER = f"{BASE_MODEL_DIRECTORY}/{BASE_MODEL_NAME}"

# Single root for every project path on the mounted Drive. The result paths
# previously used "/content/drive/My Drive/..." (with a space) while all other
# paths, including the checkpoint directory, used "/content/drive/MyDrive/...";
# deriving everything from one root keeps the saved artefacts next to the
# checkpoints.
DRIVE_PROJECT_ROOT = "/content/drive/MyDrive/diploma-llm"

# Copies of the base tokenizer and model that are loaded with from_pretrained().
TOKENIZER_PATH = f"{DRIVE_PROJECT_ROOT}/models/{BASE_MODEL_NAME}/tokenizer"
MODEL_PATH = f"{DRIVE_PROJECT_ROOT}/models/{BASE_MODEL_NAME}/model"

# Fine-tuning corpus (JSON) and how much of it is used.
DATASET_PATH = f"{DRIVE_PROJECT_ROOT}/data/fine-tuning/dataset.json"
# Fraction of the full dataset that is sampled for this run.
DATASET_PERCENT_TO_USE = 0.07
# Fraction of the sampled subset that is held out as the evaluation split.
TEST_SPLIT = 0.05

# Destination of the fine-tuned tokenizer and model.
RESULT_TOKENIZER_PATH = f"{DRIVE_PROJECT_ROOT}/models/{BASE_MODEL_NAME}-ft/tokenizer"
RESULT_MODEL_PATH = f"{DRIVE_PROJECT_ROOT}/models/{BASE_MODEL_NAME}-ft/model"

In [2]:
################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = f"/content/drive/MyDrive/diploma-llm/models/{BASE_MODEL_NAME}-ft"

# Number of training epochs
num_train_epochs = 1

# Enable fp16/bf16 training (set bf16 to True with an A100)
# Both flags are off here, so neither mixed-precision mode is requested.
fp16 = False
bf16 = False

# Batch size per GPU for training
per_device_train_batch_size = 14

# Batch size per GPU for evaluation
per_device_eval_batch_size = 4

# Number of update steps to accumulate the gradients for
# (1 means every batch triggers an optimizer step)
gradient_accumulation_steps = 1

# Enable gradient checkpointing
gradient_checkpointing = True

# Maximum gradient norm (gradient clipping)
max_grad_norm = 0.3

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001

# Optimizer to use
optim = "paged_adamw_32bit"

# Learning rate schedule (constant a bit better than cosine)
lr_scheduler_type = "constant"

# Number of training steps (overrides num_train_epochs)
# -1 leaves num_train_epochs in charge of the run length.
max_steps = -1

# Ratio of steps for a linear warmup (from 0 to learning rate)
# NOTE(review): lr_scheduler_type above is "constant"; in transformers the
# warmup variant appears to be "constant_with_warmup", so this ratio may have
# no effect with the current schedule - confirm.
warmup_ratio = 0.03

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True

# Save checkpoint every X update steps
save_steps = 25

# Log every X update steps
logging_steps = 25

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension (rank r of the adapter matrices)
lora_r = 64

# Alpha parameter for LoRA scaling
lora_alpha = 16

# Dropout probability for LoRA layers
lora_dropout = 0.1

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
# None defers to the trainer's own default.
max_seq_length = None

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False

# Load the entire model on GPU 0
device_map = {"": 0}

In [3]:
!pip install torch
!pip install transformers
!pip install bitsandbytes
!pip install accelerate
!pip install peft
!pip install trl

Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [4]:
import os
from google.colab import drive

# Mount Google Drive at /content/drive; every path configured at the top of the
# notebook (tokenizer, model, dataset, results) lives under this mount point.
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
from transformers import AutoTokenizer

# Restore the tokenizer from the copy stored at TOKENIZER_PATH.
tokenizer = AutoTokenizer.from_pretrained(TOKENIZER_PATH)

# Pad on the right-hand side (fixes a weird overflow issue with fp16 training).
tokenizer.padding_side = "right"

# Reuse the end-of-sequence token as the padding token.
# NOTE(review): with pad == eos, a collator that masks padding in the labels
# may also mask the EOS appended to each training text - confirm the model
# still learns to stop.
tokenizer.pad_token = tokenizer.eos_token

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [6]:
from datasets import Dataset
import random

def traformInput(input):
    """Turn the dataset's tagged prompt into plain, labelled lines.

    Opening tags are replaced by a Ukrainian label and closing tags by a
    newline. The substitutions are applied one after another, in the order
    listed below.

    Args:
        input: Prompt string containing <TOPIC>, <KEYWORDS> and
            <CHAPTER_NAME> markup.

    Returns:
        The prompt with every recognised marker substituted; text without
        markers is returned unchanged.
    """
    tag_substitutions = (
        ("<TOPIC>", "тема: "),
        ("</TOPIC>|", "\n"),
        ("<KEYWORDS>", "ключові слова: "),
        ("</KEYWORDS>|", "\n"),
        ("<CHAPTER_NAME>", "розділ: "),
        ("</CHAPTER_NAME>", "\n"),
    )

    rendered = input
    for marker, replacement in tag_substitutions:
        rendered = rendered.replace(marker, replacement)
    return rendered


# Fixed seed for both random splits below. Without it every fresh runtime draws
# a different subset, so a run resumed from a checkpoint would continue on
# different data and results could not be reproduced.
SPLIT_SEED = 42


def _to_training_text(sample):
    """Build the single 'text' field: formatted prompt + completion + EOS token."""
    return {'text': f"{traformInput(sample['input'])}{sample['output']}{tokenizer.eos_token}"}


dataset = Dataset.from_json(DATASET_PATH, split="train")
print(len(dataset))

# Keep only DATASET_PERCENT_TO_USE of the corpus: the "test" side of a random
# split serves as a random sample of that size.
dataset = dataset.train_test_split(test_size=DATASET_PERCENT_TO_USE, seed=SPLIT_SEED)["test"]

# Hold out TEST_SPLIT of the sample for evaluation.
split_dataset = dataset.train_test_split(test_size=TEST_SPLIT, seed=SPLIT_SEED)

# Both splits get the same formatting; the raw columns are dropped so only
# 'text' remains.
dataset = split_dataset["train"].map(_to_training_text, remove_columns=["input", "output"])
eval_dataset = split_dataset["test"].map(_to_training_text, remove_columns=["input", "output"])

# Show one formatted example from each split as a sanity check.
print(dataset[0]['text'])
print(eval_dataset[0]['text'])

Generating train split: 0 examples [00:00, ? examples/s]

30251


Map:   0%|          | 0/2012 [00:00<?, ? examples/s]

Map:   0%|          | 0/106 [00:00<?, ? examples/s]

тема: Вебзастосунок для пошуку волонтерської допомоги
ключові слова: вебзастосунок, волонтерська діяльність, пошук допомоги, волонтерство, інформаційна система
розділ: 4 ІНСТРУКЦІЯ ДЛЯ КОРИСТУВАЧА
   4 ІНСТРУКЦІЯ ДЛЯ КОРИСТУВАЧА Інструкція користувача надає вичерпну інформацію та пояснення щодо використання та функціональності веб-застосунку. Вона призначена для кінцевих користувачів та надає детальні вказівки, як користуватися застосунком, його основними функціями, налаштуваннями та можливостями. Незареєстрований користувач має можливість переглядати пости волонтерів по категоріях на головній сторінці, що зображена на рисунку 4.1.  Проте для створення відгука йому потрібно створити акаунт. Форма реєстрації зображена на рисунку 4.2. Рисунок показує, що при некоректному вводу даних, наприклад неправильного формату телефону чи закороткого паролю, система повідомляє про це користувача.  Рисунок 4.1 – Головна сторінка Рисунок 4.2 – Реєстрація користувача  Після реєстрації користувачу необх

In [7]:
from transformers import AutoModelForCausalLM

# Restore the base model from MODEL_PATH, placed according to device_map.
model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, device_map=device_map)

# Config tweaks applied before training.
# NOTE(review): pretraining_tp = 1 presumably disables the tensor-parallel
# emulation path of Llama checkpoints - confirm it is still needed.
model.config.pretraining_tp = 1
# Switch off the generation cache (presumably because it is not useful during
# training with gradient checkpointing - confirm).
model.config.use_cache = False

Unused kwargs: ['_load_in_4bit', '_load_in_8bit', 'quant_method']. These kwargs are not used in <class 'transformers.utils.quantization_config.BitsAndBytesConfig'>.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
from transformers import TrainingArguments

# Bundle the hyperparameters from the configuration cell into the argument
# object handed to the trainer.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    # Forward the configured evaluation batch size as well; it was defined in
    # the configuration cell but never passed on, so it was silently ignored.
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=gradient_checkpointing,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard",
    # Resuming is requested through trainer.train(resume_from_checkpoint=True)
    # further down, not through an argument here.
)

In [9]:
from peft import LoraConfig, PeftModel

# LoRA adapter settings for causal language modelling, taken from the QLoRA
# section of the configuration cell.
peft_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=lora_r,                    # adapter rank
    lora_alpha=lora_alpha,       # scaling factor
    lora_dropout=lora_dropout,   # dropout on the LoRA layers
    bias="none",
)

In [10]:
from trl import SFTTrainer

# Assemble the supervised fine-tuning trainer.
# The held-out eval_dataset built during data preparation is currently not
# passed in, so only the training split is handed to the trainer.
trainer = SFTTrainer(
    # What to train and how.
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    # What to train on: the formatted examples live in the "text" column.
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    packing=packing,
)



Map:   0%|          | 0/2012 [00:00<?, ? examples/s]

In [11]:
# Run the fine-tuning loop from scratch with the trainer configured above;
# checkpoints are written under output_dir every save_steps steps.
trainer.train()



Step,Training Loss
25,1.8621
50,1.7443
75,1.6962
100,1.7167
125,1.7087




TrainOutput(global_step=144, training_loss=1.7332503663169012, metrics={'train_runtime': 4337.2387, 'train_samples_per_second': 0.464, 'train_steps_per_second': 0.033, 'total_flos': 7.67861036300206e+16, 'train_loss': 1.7332503663169012, 'epoch': 1.0})

In [None]:
# Alternative to the cell above: continue an interrupted run from a previously
# saved checkpoint instead of starting over.
# NOTE(review): presumably picks up the latest checkpoint found in output_dir -
# confirm. Run either this cell or the plain trainer.train() cell, not both.
trainer.train(resume_from_checkpoint=True)



Step,Training Loss
125,1.5374
150,1.5598




TrainOutput(global_step=156, training_loss=0.5534325807522504, metrics={'train_runtime': 1442.2243, 'train_samples_per_second': 1.514, 'train_steps_per_second': 0.108, 'total_flos': 7.951377993611674e+16, 'train_loss': 0.5534325807522504, 'epoch': 1.0})

In [None]:
# Save fine-tuned tokenizer and model.
# The weights are written through trainer.model - the adapter-wrapped model the
# trainer actually optimised - rather than through the bare `model` handle, so
# the output is the trained LoRA adapter in PEFT's own format.
tokenizer.save_pretrained(RESULT_TOKENIZER_PATH)
trainer.model.save_pretrained(RESULT_MODEL_PATH)

In [12]:
from peft import get_peft_model
tokenizer.save_pretrained(RESULT_TOKENIZER_PATH)
# Save the adapter that was trained. The trainer already wrapped the base model
# with peft_config, so its model is reused here. Calling get_peft_model() on the
# model a second time would set the adapter up again from peft_config rather
# than reuse the trained one (it appears to re-initialise the LoRA weights), so
# the saved files would not reflect the training run.
peft_model = trainer.model
peft_model.save_pretrained(RESULT_MODEL_PATH)



In [None]:
from google.colab import runtime
# Release the Colab runtime once the preceding cells have finished, presumably
# so the session stops consuming compute after training and saving are done.
# Anything not already written to Drive is lost at this point.
runtime.unassign()