<a href="https://colab.research.google.com/github/qixiangme/artistic_AI/blob/main/dialogueTest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# tinyllama_hospital_bot.py

# 1. 설치
!pip install -q transformers datasets accelerate bitsandbytes

# 2. 데이터 정의
from datasets import Dataset

def join_dialogue(messages):
    """Render a conversation as one tagged prompt string.

    The prompt opens with a fixed ``<|system|>`` section. Each message is then
    written as a role tag on its own line followed by the message content:
    ``<|user|>`` when ``msg["role"] == "user"`` and ``<|assistant|>`` for any
    other role. Leading/trailing whitespace of the final string is stripped.

    Args:
        messages: Iterable of dicts that provide ``"role"`` and ``"content"``.

    Returns:
        The assembled prompt text.
    """
    segments = ["<|system|>\n당신은 병원 예약을 도와주는 비서입니다.\n"]
    for turn in messages:
        if turn["role"] == "user":
            tag = "<|user|>"
        else:
            tag = "<|assistant|>"
        segments.append(f"{tag}\n{turn['content']}\n")
    return "".join(segments).strip()

# Flatten every conversation into a single prompt string and expose the result
# as the "text" column of a Hugging Face Dataset.
# NOTE(review): `dialogues` is not defined in the visible part of this notebook;
# presumably a list of message lists created in an earlier cell — confirm.
dataset = Dataset.from_dict(
    {"text": list(map(join_dialogue, dialogues))}
)


# 3. Load the model
import os

import torch
import wandb
from huggingface_hub import login
from peft import get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Do not hard-code the Hugging Face token in the notebook. It is read from the
# HF_TOKEN environment variable; when that is unset, login(token=None) falls
# back to prompting for the token.
login(token=os.environ.get("HF_TOKEN"))
wandb.login()

model_id = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(model_id)
# Reuse EOS as the padding token so that padding="max_length" works.
# NOTE(review): presumably needed because this tokenizer ships without a pad
# token — confirm.
tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization (QLoRA-style). The previous 8-bit + fp32 CPU offload
# setup ran out of memory on a ~15 GiB GPU during fine-tuning; 4-bit weights
# leave room for activations and optimizer state. float16 is used as the
# compute dtype to match torch_dtype below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.float16,
    quantization_config=bnb_config,
)

# Required step before attaching adapters to a quantized model: freezes the
# base weights, upcasts the remaining half-precision parameters for stability
# and enables gradient checkpointing to reduce activation memory.
model = prepare_model_for_kbit_training(model)

# Configure PEFT LoRA and wrap the model so only adapter weights are trained.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
)
model = get_peft_model(model, lora_config)
# 4. Preprocessing (tokenization)
def tokenize(example):
    """Tokenize one example and attach causal-LM labels.

    The text is truncated/padded to exactly 256 tokens. Labels mirror
    ``input_ids`` except at padded positions, which are set to ``-100`` so the
    loss ignores them.

    Args:
        example: A dataset row providing the prompt under the ``"text"`` key.

    Returns:
        The tokenizer output extended with a ``"labels"`` list.
    """
    out = tokenizer(example["text"], truncation=True, padding="max_length", max_length=256)
    # The pad token is the EOS token in this notebook, so padding cannot be
    # identified by token id; the attention mask marks the real tokens instead.
    out["labels"] = [
        token_id if keep else -100
        for token_id, keep in zip(out["input_ids"], out["attention_mask"])
    ]
    return out

tokenized_dataset = dataset.map(tokenize)

# 5. Fine-tuning configuration and Trainer construction
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="./tinyllama-results",
    # A micro-batch of 1 with 4 accumulation steps keeps the effective batch
    # size at 4 while cutting peak activation memory; the recorded run with
    # per-device batch size 4 ended in a CUDA out-of-memory error.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-5,
    save_strategy="epoch",
    logging_steps=1,
    run_name="tinyllama_hospital_bot_finetune",
    fp16=False,     # must stay False (original author's note)
    bf16=False,
    # The PEFT wrapper hides the base model's signature, so Trainer cannot
    # infer the label column on its own; name it explicitly.
    label_names=["labels"],
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
)

# 6. Start fine-tuning
trainer.train()

# 7. Test example (one user input -> one chatbot response)
import torch
prompt = "<|system|>\n당신은 병원 예약을 도와주는 비서입니다.\n<|user|>\n안녕하세요 내일 오후에 연락하고 싶은데 어떻게 하면 되죠?.\n<|assistant|>\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# The model may still be in training mode after trainer.train(); switch to
# eval mode so LoRA dropout is not applied while sampling.
model.eval()

output = model.generate(
    **inputs,
    max_new_tokens=100,
    do_sample=True,
    temperature=0.7,
    top_p=0.9,
    # Set explicitly so generate() does not have to guess a padding id.
    pad_token_id=tokenizer.eos_token_id,
)

# Decode only the newly generated tokens. Splitting the full decoded text on
# the last "<|assistant|>" would pick up a later, hallucinated assistant turn
# whenever the model keeps writing the dialogue on its own.
prompt_length = inputs["input_ids"].shape[1]
decoded = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

# The role tags are plain text, so cut the reply at the first one the model
# emits; whatever follows belongs to a turn it invented.
response = decoded
for role_tag in ("<|system|>", "<|user|>", "<|assistant|>"):
    response = response.split(role_tag)[0]
response = response.strip()

print("\n🤖 챗봇 응답 (1회):\n")
print(response)



[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m66.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m67.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m11.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33merang903[0m ([33merang903-kh[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/996 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Map:   0%|          | 0/591 [00:00<?, ? examples/s]

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


OutOfMemoryError: CUDA out of memory. Tried to allocate 224.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 78.12 MiB is free. Process 25768 has 14.66 GiB memory in use. Of the allocated memory 14.26 GiB is allocated by PyTorch, and 270.68 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

# 1. Load the checkpoint as a regular model (not through PEFT)
CHECKPOINT_DIR = "./tinyllama-checkpoint"

tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT_DIR)
# Padding reuses the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(CHECKPOINT_DIR)

# Run on the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
model.to(device)

def trim_history(history, max_turns=6):
    """Keep the system preamble plus the most recent user turns.

    The history is split on the ``<|user|>`` tag: everything before the first
    tag is the system preamble, and each following piece is one user turn
    together with the text that follows it (the assistant reply).

    Args:
        history: Full prompt text accumulated so far.
        max_turns: Maximum number of trailing user turns to keep. Values of 0
            or less keep no turns at all.

    Returns:
        The preamble followed by at most ``max_turns`` of the latest turns,
        each re-prefixed with ``<|user|>``. A history without any user turn is
        returned unchanged (no dangling ``<|user|>`` tag is appended).
    """
    system_msg, *turns = history.split("<|user|>")
    # Guard max_turns <= 0 explicitly: turns[-0:] would select every turn.
    recent_turns = turns[-max_turns:] if max_turns > 0 else []
    return system_msg + "".join("<|user|>" + turn for turn in recent_turns)

# Running prompt: the system preamble followed by the alternating turns.
history = "<|system|>\n당신은 병원 예약을 도와주는 비서입니다.\n"
user_turns = [
    "다음 주 화요일 오전에 내과 진료 예약할 수 있나요?",
    "그럼 그때 해주세요",
    "잠시만요 다른 요일로 바꿔도 될까요?.",
    "이번 주 금요일 오전 9시에 예약 가능할까요?",
    "잠시만요 주치의 선생님 진료 예약 가능한 날짜 알려주세요.",
    "그럼 그 시간대로 바꿔주세요.",
]

# Role tags are ordinary text in the prompt format, so a reply has to be cut
# at the first tag the model emits on its own.
ROLE_TAGS = ("<|system|>", "<|user|>", "<|assistant|>")

# If the prompt ever exceeds max_length, drop the oldest tokens rather than the
# newest user turn and the trailing "<|assistant|>" cue.
tokenizer.truncation_side = "left"

for user_input in user_turns:
    history += f"<|user|>\n{user_input}\n<|assistant|>\n"

    inputs = tokenizer(history, return_tensors="pt", padding=True, truncation=True, max_length=512)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    output = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs.get("attention_mask"),
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        pad_token_id=tokenizer.eos_token_id,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode only the newly generated tokens. Taking the text after the last
    # "<|assistant|>" of the full output would return a hallucinated later turn
    # whenever the model continues the dialogue by itself.
    prompt_length = inputs["input_ids"].shape[1]
    decoded = tokenizer.decode(output[0][prompt_length:], skip_special_tokens=True)

    # Keep just the assistant's own reply; anything after a role tag is an
    # invented turn that must not leak into the history.
    response = decoded
    for role_tag in ROLE_TAGS:
        response = response.split(role_tag)[0]
    response = response.strip()

    history += response + "\n"
    history = trim_history(history, max_turns=3)

    print(f"👤 {user_input}")
    print(f"🤖 {response}")
    print("-" * 60)
