In [None]:
import os
import torch
from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

import huggingface_hub
huggingface_hub.login('')

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /home/students/cs/sjuhun1/.cache/huggingface/token
Login successful


In [2]:
# Hugging Face Basic Model 한국어 모델
base_model = "beomi/Llama-3-Open-Ko-8B"               # beomi님의 Llama3 한국어 파인튜닝 모델

# 주가 증권 보고서 gemini 데이터셋
hkcode_dataset = "zoohun/zoo_med_Q_test2"

# 새로운 모델 이름
new_model = "zoohun/zoo_med_all_test3"

In [3]:
if torch.cuda.get_device_capability()[0] >= 8:
    !pip install -qqq flash-attn
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

In [4]:
# QLoRA config
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=False,
)

In [5]:
# llama 데이터 로드
dataset = load_dataset(hkcode_dataset, split="train")

# 데이터 확인
print( dataset[1] )

{'text': '<s>[INST]1.나잘스프레이오리지날(옥시메타졸린염산염)(수출용)(수출명 : Nasal Spray Original)\n\n2.나잘스프레이엑스트라모이스쳐라이징(옥시메타졸린염산염)(수출용)(수출명 : Nasal Spray Extra Moisturizing)에 대해 설명해주세요.[/INST][1.나잘스프레이오리지날(옥시메타졸린염산염)(수출용)(수출명 : Nasal Spray Original)\n\n2.나잘스프레이엑스트라모이스쳐라이징(옥시메타졸린염산염)(수출용)(수출명 : Nasal Spray Extra Moisturizing)]는 제품명이며 해당 약품의 주성분은 [옥시메타졸린염산염] 이고 첨가제는 [벤잘코늄염화물,에데트산나트륨수화물,정제수,농글리세린] 입니다.</s>'}


In [6]:
print( dataset[0] )

{'text': '<s>[INST]1. 포비딘인후스프레이액(포비돈요오드)(바닐라향), 2. 포비딘인후스프레이액(포비돈요오드)(청포도향)에 대해 설명해주세요.[/INST][1. 포비딘인후스프레이액(포비돈요오드)(바닐라향), 2. 포비딘인후스프레이액(포비돈요오드)(청포도향)]는 제품명이며 해당 약품의 주성분은 [포비돈 요오드] 이고 첨가제는 [프로필렌글리콜,청포도71334,자일리톨,유칼립톨,효소처리스테비아,L-멘톨,요오드화칼륨,바닐라향 HF-63374,에탄올,농글리세린,정제수] 입니다.</s>'}


In [7]:
### 모델 로드
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=quant_config,
    device_map= "auto"
)
model.config.use_cache = False
model.config.pretraining_tp = 1

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [8]:
tokenizer = AutoTokenizer.from_pretrained(
              base_model, 
              trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

In [None]:
peft_params = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64,
    bias="none",
    task_type="CAUSAL_LM",
)

In [10]:
import torch
from transformers import Trainer, TrainingArguments

# 0번 GPU로 설정
torch.cuda.set_device(0)

training_params = TrainingArguments(
    output_dir="./results",
    num_train_epochs=20,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

In [11]:
# 파인튜닝
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_params,
    dataset_text_field="text",
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_params,
    packing=False,
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [12]:
trainer.train()

Step,Training Loss
25,1.4311
50,0.9487
75,0.9762
100,0.8292
125,0.9071
150,0.7991
175,0.8369
200,0.7583
225,0.8153
250,0.7413


TrainOutput(global_step=35200, training_loss=0.2222396541522308, metrics={'train_runtime': 97033.0523, 'train_samples_per_second': 1.45, 'train_steps_per_second': 0.363, 'total_flos': 1.076271852593922e+18, 'train_loss': 0.2222396541522308, 'epoch': 20.0})

In [18]:
logging.set_verbosity(logging.CRITICAL)

prompt = "아미포틴정에 대해 설명해주세요."
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

<s>[INST] 아미포틴정에 대해 설명해주세요. [/INST] [아미포틴정]는 제품명이며 해당 약품의 주성분은 [L-류신/L-메티오닌/L-발린/L-이소류신/L-트레오닌/L-페닐알라닌/L-트리프토판/L-리신염산염/L-아르기닌염산염/티아민질산염/토코페롤숙시네이트칼슘/리보플라빈/니코틴산아미드/피리독신염산염] 이고 첨가제는 [오파드라이80W41001그린,크로스카르멜로오스나트륨,저치환도히드록시프로필셀룰로오스,오�


In [None]:
가두벌크림에 대해 설명해주세요.

In [None]:
[가두벌크림]는 제품명이며 해당 약품의 주성분은 [디펜히드라민/l-멘톨/디부카인염산염/dl-캄파/에녹솔론] 이고 첨가제는 [프로필렌글리콜,파라옥시벤조산프로필,세토스테아릴알코올,폴리소르베이트60,스테아릴알코올,경질유동파라핀,디메티콘,소르비탄스테아레이트,파라옥시벤조산메틸,정제수] 입니다.

In [None]:
아미포틴정에 대해 설명해주세요.

In [None]:
[아미포틴정]는 제품명이며 해당 약품의 주성분은 [L-류신/L-메티오닌/L-발린/L-이소류신/L-트레오닌/L-페닐알라닌/L-트리프토판/L-리신염산염/L-아르기닌염산염/티아민질산염/토코페롤숙시네이트칼슘/리보플라빈/니코틴산아미드/피리독신염산염] 이고 첨가제는 [오파드라이80W41001그린,크로스카르멜로오스나트륨,저치환도히드록시프로필셀룰로오스,오파그로스 클리어 97W19196,미결정셀룰로오스,오파드라이 03B28796,스테아르산마그네슘,경질무수규산,탤크,D-만니톨] 입니다.

In [18]:
model.push_to_hub("zoohun/med_llama-3-ko-8B")
tokenizer.push_to_hub("zoohun/med_llama-3-ko-8B")

model-00002-of-00002.safetensors:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/zoohun/med_llama-3-ko-8B/commit/7738a07094a1d9082e01b8aa4c817b9854730059', commit_message='Upload tokenizer', commit_description='', oid='7738a07094a1d9082e01b8aa4c817b9854730059', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
pwd

'/home/students/cs/sjuhun1/llama3_rag_finetune'

In [23]:
from huggingface_hub import HfApi, create_repo

# Hugging Face 리포지토리 생성
repo_id = "med-llama3-ko"
create_repo(repo_id)

RepoUrl('https://huggingface.co/zoohun/med-llama3-ko', endpoint='https://huggingface.co', repo_type='model', repo_id='zoohun/med-llama3-ko')

In [33]:
# 모델 업로드
model.push_to_hub(repo_id)
tokenizer.push_to_hub(repo_id)

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/zoohun/med-llama3-ko/commit/1fe8759266cd61bbc570d1e000ef0cf51b52ae5d', commit_message='Upload tokenizer', commit_description='', oid='1fe8759266cd61bbc570d1e000ef0cf51b52ae5d', pr_url=None, pr_revision=None, pr_num=None)

In [34]:
from huggingface_hub import HfApi, create_repo

# Hugging Face 리포지토리 생성
repo_id = "med-llama3-ko2"
create_repo(repo_id)

# 모델과 토크나이저 저장
model.push_to_hub(repo_id, safe_serialization=True)
tokenizer.push_to_hub(repo_id)

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.24G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.18k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/zoohun/med-llama3-ko2/commit/68ee48ad24b8d4ccf1b8640cafdecbbae50b381e', commit_message='Upload tokenizer', commit_description='', oid='68ee48ad24b8d4ccf1b8640cafdecbbae50b381e', pr_url=None, pr_revision=None, pr_num=None)

In [27]:
ls

chroma_db_create.ipynb         llama3_finetune.ipynb  [0m[01;34mresults[0m/
[01;34mchroma_db_eeve[0m/                llama-3-ko_rag.ipynb   [01;34msaved_model[0m/
[01;34mdataset[0m/                       medicines_info.txt
huggingface_data_upload.ipynb  [01;34mmodel[0m/


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [40]:
model.save_pretrained("./model/saved_model")
tokenizer.save_pretrained("./model/saved_model")

('./model/saved_model/tokenizer_config.json',
 './model/saved_model/special_tokens_map.json',
 './model/saved_model/tokenizer.json')

In [42]:
model.save_pretrained("./model/saved_model", quantization_config=quant_config)
tokenizer.save_pretrained("./model/saved_model")

('./model/saved_model/tokenizer_config.json',
 './model/saved_model/special_tokens_map.json',
 './model/saved_model/tokenizer.json')

In [None]:
quant_config

In [41]:
pwd

'/home/students/cs/sjuhun1/llama3_rag_finetune'