In [1]:
import torch
major_version, minor_version = torch.cuda.get_device_capability()
major_version, minor_version

(6, 1)

In [6]:
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps packaging ninja einops flash-attn xformers trl peft accelerate bitsandbytes

Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-4bd6rzvj/unsloth_7293d2addcc849bbb3b72a0f26540cb2
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-4bd6rzvj/unsloth_7293d2addcc849bbb3b72a0f26540cb2
  Resolved https://github.com/unslothai/unsloth.git to commit 85f1fa096afde5efe2fb8521d8ceec8d13a00715
  Installing build dependencies ... [?2done
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting einops
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Collecting flash-attn
  Downloading flash_attn-2.7.2.post1.tar.gz (3.1 MB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m3.1/3.1 MB[0m 

In [7]:
import torch
from unsloth import FastLanguageModel
import huggingface_hub
import os
from datasets import load_dataset
from trl import SFTTrainer
from transformers import TrainingArguments
from peft import LoraConfig

In [8]:
max_seq_length = 4096
dtype = None
load_in_4bit = True

In [10]:
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/llama-3-8b-bnb-4bit",
    max_seq_length=4096,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

==((====))==  Unsloth 2024.12.4: Fast Llama patching. Transformers:4.47.1.
   \\   /|    GPU: NVIDIA GeForce GTX 1060 6GB. Max memory: 6.0 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 6.1. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [6]:
EOS_TOKEN = tokenizer.eos_token
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{}
### Response:
{}"""

In [8]:
def formatting_prompts_func(examples):
    instructions = examples["instruction"] 
    outputs = examples["output"] 
    texts = [] 
    for instruction, output in zip(instructions, outputs):
        text = alpaca_prompt.format(instruction, output) + EOS_TOKEN
        texts.append(text)
    return {
        "text": texts,
    }

dataset = load_dataset("prismdata/KDI-DATASET-2014", split="train")
dataset = dataset.map(
    formatting_prompts_func,
    batched=True,
)

README.md:   0%|          | 0.00/426 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/165k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1331 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

In [10]:
tokenizer.padding_side = "right" 
Lora_config = LoraConfig(
r=16,
lora_alpha=32,
lora_dropout=0.05,
bias="none"
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,  
    train_dataset=dataset,  
    eval_dataset=dataset,
    dataset_text_field="text", 
    max_seq_length=max_seq_length, 
    dataset_num_proc=1,  
    packing=False, 
    peft_config =Lora_config,
    args=TrainingArguments(
        per_device_train_batch_size=1,  
        gradient_accumulation_steps=4, 
        warmup_steps=5, 
        num_train_epochs=3,
        max_steps=100,
        do_eval=True,
        evaluation_strategy="steps",
        logging_steps=1,
        learning_rate=2e-4, 
        fp16=not torch.cuda.is_bf16_supported(), 
        bf16=torch.cuda.is_bf16_supported(), 
        optim="adamw_8bit", 
        weight_decay=0.01, 
        lr_scheduler_type="cosine", 
        seed=123, 
        output_dir="outputs", 
    ),
)



Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

Map:   0%|          | 0/1331 [00:00<?, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs


In [11]:
from peft import LoraConfig, get_peft_model
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 1,331 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 1 | Gradient Accumulation steps = 4
\        /    Total batch size = 4 | Total steps = 100
 "-____-"     Number of trainable parameters = 6,815,744


Step,Training Loss,Validation Loss
1,3.2075,No log
2,3.106,No log
3,3.0454,No log
4,2.8695,No log
5,3.2232,No log
6,2.7195,No log
7,2.8115,No log
8,2.4837,No log
9,2.1708,No log
10,2.2399,No log


In [21]:
model.save_pretrained("Llama-3-Open-Ko-8B-prismdata")

In [19]:
#ÏòµÏÖò 1) vLLMÏö©

In [38]:
base_model = "beomi/Llama-3-Open-Ko-8B"  # Î≥ëÌï©ÏùÑ ÏàòÌñâÌï† Î≤†Ïù¥Ïä§ Î™®Îç∏
save_method = ("merged_16bit")

In [18]:
model.save_pretrained_merged(
    base_model,
    tokenizer,
    save_method=save_method,
)

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 16.1G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 30.9 out of 52.96 RAM for saving.


 78%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä  | 25/32 [00:00<00:00, 51.19it/s]We will save to Disk and not RAM now.
100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32/32 [00:04<00:00,  7.01it/s]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...
Done.


In [22]:
#ÏòµÏÖò 2) HuggingFace Ïóê ÏóÖÎ°úÎìú

In [23]:
# Hub Ïóê ÏóÖÎ°úÎìú
huggingface_repo = ''
huggingface_token = ''
model.push_to_hub_merged(
    huggingface_repo,
    tokenizer,
    save_method=save_method,
    token=huggingface_token,
)

Unsloth: You are pushing to hub, but you passed your HF username = prismdata.
We shall truncate prismdata/KDI-Llama-3-Open-Ko-8B-Instruct to KDI-Llama-3-Open-Ko-8B-Instruct


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 29.36 out of 52.96 RAM for saving.


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 32/32 [00:04<00:00,  7.11it/s]


Unsloth: Saving tokenizer...

  0%|          | 0/1 [00:00<?, ?it/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

 Done.
Unsloth: Saving model... This might take 5 minutes for Llama-7b...


README.md:   0%|          | 0.00/31.0 [00:00<?, ?B/s]

  0%|          | 0/4 [00:00<?, ?it/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Done.
Saved merged model to https://huggingface.co/prismdata/KDI-Llama-3-Open-Ko-8B-Instruct


In [24]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [27]:
import shutil
import os

# ÏõêÎ≥∏ Í≤ΩÎ°ú
source_path = '/content/KDI-Llama-3-Open-Ko-8B-Instruct'

# Î™©Ï†ÅÏßÄ Í≤ΩÎ°ú
destination_path = '/content/drive/MyDrive/Colab Notebooks/KDI-Llama-3-Open-Ko-8B-Instruct'

# Î™©Ï†ÅÏßÄ Í≤ΩÎ°úÍ∞Ä ÏóÜÏúºÎ©¥ ÏÉùÏÑ±
if not os.path.exists(destination_path):
    os.makedirs(destination_path)

# '/content' Í≤ΩÎ°úÏóê ÏûàÎäî Î™®Îì† ÌååÏùºÍ≥º Ìè¥ÎçîÎ•º Ïù¥Îèô
for filename in os.listdir(source_path):
    file_path = os.path.join(source_path, filename)
    if os.path.isfile(file_path) or os.path.isdir(file_path):
        shutil.copy(file_path, destination_path)

In [28]:
# ÏõêÎ≥∏ Í≤ΩÎ°ú
source_path = '/content/Llama-3-Open-Ko-8B-prismdata'

# Î™©Ï†ÅÏßÄ Í≤ΩÎ°ú
destination_path = '/content/drive/MyDrive/Colab Notebooks/Llama-3-Open-Ko-8B-prismdata'

# Î™©Ï†ÅÏßÄ Í≤ΩÎ°úÍ∞Ä ÏóÜÏúºÎ©¥ ÏÉùÏÑ±
if not os.path.exists(destination_path):
    os.makedirs(destination_path)

# '/content' Í≤ΩÎ°úÏóê ÏûàÎäî Î™®Îì† ÌååÏùºÍ≥º Ìè¥ÎçîÎ•º Ïù¥Îèô
for filename in os.listdir(source_path):
    file_path = os.path.join(source_path, filename)
    if os.path.isfile(file_path) or os.path.isdir(file_path):
        shutil.copy(file_path, destination_path)

In [29]:
#Inference
model.config.torch_dtype = torch.bfloat16 #Îß§Ïö∞ Ï§ëÏöî..

In [70]:
#Ïù¥ Î£®Ìã¥Ïù¥ Í∞ÄÏû• Í∞ÑÎã®Ìï®.
FastLanguageModel.for_inference(model) 
inputs = tokenizer(
[
    alpaca_prompt.format(
         "CentrelinkÎ•º Ï§ëÏã¨ÏúºÎ°ú Ìïú Ìò∏Ï£ºÏùò Í≥µÍ≥µÏÑúÎπÑÏä§Ïóê ÎåÄÌï¥ ÏïåÎ†§Ï§ò",
        "",
    )
], return_tensors = "pt").to("cuda")

print("torch data type", model.config.torch_dtype)

outputs = model.generate(**inputs, max_new_tokens = 4096, use_cache = True)
decoded_output = tokenizer.batch_decode(outputs)
print('----------------')
print(decoded_output)

torch data type torch.bfloat16
----------------
['<|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.\n\n### Instruction:\nCentrelinkÎ•º Ï§ëÏã¨ÏúºÎ°ú Ìïú Ìò∏Ï£ºÏùò Í≥µÍ≥µÏÑúÎπÑÏä§Ïóê ÎåÄÌï¥ ÏïåÎ†§Ï§ò\n### Response:\nCentrelinkÎ•º Ï§ëÏã¨ÏúºÎ°ú Ìïú Ìò∏Ï£ºÏùò Í≥µÍ≥µÏÑúÎπÑÏä§Îäî Ìò∏Ï£ºÏùò Í≥µÍ≥µÏÑúÎπÑÏä§ Ï†ÑÎã¨Ï≤¥Í≥ÑÎ•º Íµ¨ÏÑ±ÌïòÍ≥† ÏûàÏäµÎãàÎã§.<|end_of_text|>']
