In [32]:
import torch
import os
import torch
from datasets import load_dataset


from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

from transformers import PreTrainedTokenizerFast
from transformers import GPT2LMHeadModel
import huggingface_hub

In [33]:
# Report whether PyTorch can see a CUDA device.
print(torch.cuda.is_available())

# Authenticate against the Hugging Face Hub.
# The access token is read from the HF_TOKEN environment variable so that it is
# never stored in the notebook. When the variable is unset, `login` receives
# None and falls back to its interactive prompt.
# NOTE: the token that used to be embedded here has been exposed and should be
# revoked in the Hugging Face account settings.
huggingface_hub.login(token=os.environ.get("HF_TOKEN"))

True
The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `huggingface-cli` if you want to set the git credential as well.
Token is valid (permission: read).
Your token has been saved to C:\Users\WeSeongGu\.cache\huggingface\token
Login successful


In [34]:
# Hugging Face base model: Korean-language model

# Custom dataset -- set this to the Hub repository path where you stored your own data.
hkcode_dataset = "mogoi/delivery_all"

# Load only the "train" split of that dataset.
dataset = load_dataset(hkcode_dataset, split="train")

# Sanity check: show one record, then the major CUDA compute capability of the
# current GPU (the value the next cell branches on).
sample_record = dataset[197]
print(sample_record)
cuda_major = torch.cuda.get_device_capability()[0]
print(cuda_major)

{'text': '<s>[INST]손님:효자2동 김우빈 입니다.[/INST]직원:확인 한 번 부탁드립니다. 짜장면에 효자2동이고, 김우빈 맞으실까요?</s>'}
7


In [35]:
# Choose the attention backend and compute dtype from the GPU generation:
# compute capability >= 8 selects flash-attention 2 with bfloat16, anything
# else selects eager attention with float16.
# `torch.cuda.get_device_capability()` raises when no CUDA device is present,
# so the query is guarded; a CUDA-less machine takes the eager/float16 branch.
supports_flash_bf16 = (
    torch.cuda.is_available()
    and torch.cuda.get_device_capability()[0] >= 8
)

if supports_flash_bf16:
    attn_implementation = "flash_attention_2"
    torch_dtype = torch.bfloat16
else:
    attn_implementation = "eager"
    torch_dtype = torch.float16

# QLoRA quantization settings: 4-bit loading with the NF4 quantization type,
# double quantization turned off, and the compute dtype selected earlier.
# NOTE(review): none of the visible cells passes `quant_config` to a model
# load -- confirm whether quantized loading was intended.
bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_compute_dtype": torch_dtype,
    "bnb_4bit_use_double_quant": False,
}
quant_config = BitsAndBytesConfig(**bnb_options)

In [36]:
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

print(f"Using device: {device}")

Using device: cuda


In [37]:
# Tokenizer and model both come from the same KoGPT2 checkpoint.
kogpt2_checkpoint = "skt/kogpt2-base-v2"

# Special tokens handed to the tokenizer; BOS and EOS are both '</s>'.
special_tokens = {
    "bos_token": '</s>',
    "eos_token": '</s>',
    "unk_token": '<unk>',
    "pad_token": '<pad>',
    "mask_token": '<mask>',
}
tokenizer = PreTrainedTokenizerFast.from_pretrained(kogpt2_checkpoint, **special_tokens)

# Causal-LM head model that will be fine-tuned below.
model = GPT2LMHeadModel.from_pretrained(kogpt2_checkpoint)

In [38]:

# LoRA adapter hyperparameters for causal-LM fine-tuning.
peft_params = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Hyperparameters handed to the trainer.
training_params = TrainingArguments(
    # Outputs, checkpoints and reporting
    output_dir="./results",
    report_to="tensorboard",
    save_steps=500,
    logging_steps=500,
    # Run length
    num_train_epochs=40,
    max_steps=-1,
    # Batching
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    group_by_length=True,
    # Optimizer
    optim="paged_adamw_32bit",
    learning_rate=2e-4,
    weight_decay=0.001,
    max_grad_norm=0.3,
    # Learning-rate schedule
    # NOTE(review): the training log shows the learning rate already at 2e-4 at
    # the first logged step, so `warmup_ratio` appears to have no effect with
    # the "constant" schedule -- confirm whether a warmup schedule was intended.
    lr_scheduler_type="constant",
    warmup_ratio=0.03,
    # Mixed precision: both flags are off
    fp16=False,
    bf16=False,
)

# Supervised fine-tuning on the "text" column of the dataset, one example per
# sequence (no packing), with the LoRA configuration defined above.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_params,
    peft_config=peft_params,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=None,
    packing=False,
)







Map:   0%|          | 0/53350 [00:00<?, ? examples/s]

In [None]:
# Run the fine-tuning loop configured on `trainer`.
trainer.train()
# After training, raise the transformers logging threshold to CRITICAL so that
# only critical library messages are emitted from here on.
logging.set_verbosity(logging.CRITICAL)

{'loss': 1.3478, 'grad_norm': 1.100317120552063, 'learning_rate': 0.0002, 'epoch': 0.0749737591842855}
{'loss': 0.6249, 'grad_norm': 1.057011604309082, 'learning_rate': 0.0002, 'epoch': 0.149947518368571}




{'loss': 0.4864, 'grad_norm': 0.8052161335945129, 'learning_rate': 0.0002, 'epoch': 0.2249212775528565}




{'loss': 0.3944, 'grad_norm': 0.8334420919418335, 'learning_rate': 0.0002, 'epoch': 0.299895036737142}




{'loss': 0.3513, 'grad_norm': 0.838429868221283, 'learning_rate': 0.0002, 'epoch': 0.3748687959214275}




{'loss': 0.318, 'grad_norm': 0.685339093208313, 'learning_rate': 0.0002, 'epoch': 0.449842555105713}




{'loss': 0.3014, 'grad_norm': 0.6162809729576111, 'learning_rate': 0.0002, 'epoch': 0.5248163142899985}




{'loss': 0.2875, 'grad_norm': 0.6571316123008728, 'learning_rate': 0.0002, 'epoch': 0.599790073474284}




{'loss': 0.2833, 'grad_norm': 0.5563750863075256, 'learning_rate': 0.0002, 'epoch': 0.6747638326585695}




{'loss': 0.2697, 'grad_norm': 0.5251194834709167, 'learning_rate': 0.0002, 'epoch': 0.749737591842855}




{'loss': 0.2695, 'grad_norm': 0.5471196174621582, 'learning_rate': 0.0002, 'epoch': 0.8247113510271405}




{'loss': 0.2632, 'grad_norm': 0.593270480632782, 'learning_rate': 0.0002, 'epoch': 0.899685110211426}




{'loss': 0.2597, 'grad_norm': 0.5026218891143799, 'learning_rate': 0.0002, 'epoch': 0.9746588693957114}




{'loss': 0.2572, 'grad_norm': 0.3417667746543884, 'learning_rate': 0.0002, 'epoch': 1.049632628579997}




In [None]:
# Smoke-test the fine-tuned model in two ways.

# 1) Through a text-generation pipeline, using the [INST] prompt template.
#    The pipeline is pinned to the device the model currently lives on, so the
#    model is not moved and the pipeline's inputs end up on the same device.
prompt = "짜장면 배달 가능한가요?"
pipe = pipeline(
    task="text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=200,
    device=model.device,
)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])

# 2) Through a direct `generate` call on the raw text.
input_text = "짜장면 배달 주문 가능한가요?"
# `encode` returns a CPU tensor; it must be moved to the model's device,
# otherwise `generate` fails with a device mismatch once the model is on the GPU.
input_ids = tokenizer.encode(input_text, return_tensors="pt").to(model.device)

output = model.generate(input_ids, max_length=100, num_return_sequences=1, temperature=1.0, pad_token_id=tokenizer.eos_token_id)

decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

In [None]:
# Publish the fine-tuned model and its tokenizer to the Hugging Face Hub.

hub_repo_id = "fkskdldh/test-5-25"

# A token with write permission is read from the environment rather than being
# embedded in the notebook. HF_WRITE_TOKEN is preferred, then HF_TOKEN; if
# neither is set, None is passed and the library uses the cached login.
# NOTE: the token that used to be embedded here has been exposed and should be
# revoked in the Hugging Face account settings.
hub_token = os.environ.get("HF_WRITE_TOKEN") or os.environ.get("HF_TOKEN")

# The trainer was built with a LoRA `peft_config`, so the trained weights live
# in adapter layers. Fold them into the base weights before uploading;
# otherwise the checkpoint cannot be restored by the plain
# `GPT2LMHeadModel.from_pretrained(...)` call used later in this notebook.
peft_model = trainer.model
if hasattr(peft_model, "merge_and_unload"):
    export_model = peft_model.merge_and_unload()
else:
    export_model = model

# `token` replaces the deprecated `use_auth_token` argument.
export_model.push_to_hub(
    hub_repo_id,
    use_temp_dir=True,
    token=hub_token,
)
tokenizer.push_to_hub(
    hub_repo_id,
    use_temp_dir=True,
    token=hub_token,
)

In [None]:
# save_path = "./trained_model/"
# tokenizer.save_pretrained(save_path)
# torch.save(model.state_dict(), "./trained_model/pytorch_model.bin")
# model.save_pretrained('./trained_model')
# model.save_pretrained(save_path)
# tokenizer.save_pretrained(save_path)
# torch.save(model, save_path)
# torch.save(tokenizer, save_path)
# Export the tokenizer and the fine-tuned model to the local `model/` folder.

# The trainer was built with a LoRA `peft_config`, so the trained weights live
# in adapter layers. Fold them into the base weights first so the saved files
# can be loaded back with a plain `GPT2LMHeadModel.from_pretrained(...)`.
peft_model = trainer.model
if hasattr(peft_model, "merge_and_unload"):
    export_model = peft_model.merge_and_unload()
else:
    export_model = model

# `torch.save` does not create missing directories, so create the target
# folder explicitly instead of relying on an earlier call to have done it.
os.makedirs("model", exist_ok=True)

tokenizer.save_pretrained('model/tokenizer/')
torch.save(export_model.state_dict(), "model/pytorch_model.bin")
export_model.save_pretrained('model/model/')
print('finish')

In [None]:

# Local export location (only referenced by the commented-out alternative below).
save_path = "./model/model"

# Reload the published model and tokenizer from the Hub.
loaded_model = GPT2LMHeadModel.from_pretrained("fkskdldh/test-5-25")
# Alternative: PreTrainedTokenizerFast.from_pretrained(save_path) to load the local copy.
loaded_tokenizer = PreTrainedTokenizerFast.from_pretrained("fkskdldh/test-5-25")

# Generate a continuation for one test question with the reloaded pair.
input_text = "짜장면 배달 가능한가요?"
input_ids = loaded_tokenizer.encode(input_text, return_tensors="pt")

generation_options = dict(
    max_length=100,
    num_return_sequences=1,
    temperature=1.0,
    pad_token_id=loaded_tokenizer.eos_token_id,
)
output = loaded_model.generate(input_ids, **generation_options)

first_sequence = output[0]
decoded_output = loaded_tokenizer.decode(first_sequence, skip_special_tokens=True)
print(decoded_output)


# input_text = "[직원][1] 안녀하세요 락궁입니다. [손님][2]10월 20일 7시에 예약 가능한가요? [직원][3] 잠시 기달려 주세요 네 가능합니다. 네 가능합니다. 인원을 말해주세요[손님][4] 3명입니다."
# input_text = "혹시 이번주 월요일에 예약 가능한가요?"
# pipe = pipeline(task="text-generation", model=loaded_model, tokenizer=tokenizer, max_length=200)
# result = pipe(f"<s>[INST] {input_text} [/INST]")
# print(result[0]['generated_text'])

In [None]:
# Repeat the generation check with the reloaded model and tokenizer.
input_text = "짜장면 배달 가능한가요?"
input_ids = loaded_tokenizer.encode(input_text, return_tensors="pt")

output = loaded_model.generate(
    input_ids,
    max_length=100,
    num_return_sequences=1,
    temperature=1.0,
    pad_token_id=loaded_tokenizer.eos_token_id,
)

# Decode the single returned sequence, dropping special tokens.
decoded_output = loaded_tokenizer.decode(output[0], skip_special_tokens=True)
print(decoded_output)

In [None]:
# Compare the configuration of the in-memory model with the reloaded one.
# NOTE(review): this checks configuration only; it does not compare weights.
trained_config = model.config.to_dict()
reloaded_config = loaded_model.config.to_dict()
assert trained_config == reloaded_config, "Model configs do not match!"

# Compare the vocabularies of the two tokenizers.
trained_vocab = tokenizer.get_vocab()
reloaded_vocab = loaded_tokenizer.get_vocab()
assert trained_vocab == reloaded_vocab, "Tokenizer vocabs do not match!"
