In [None]:
from datasets import load_dataset
# Load the dataset from a local JSON file using the `datasets` "json" loader.
# Replace the placeholder path with the location of your own data file.
# Later cells read the 'train' split and a 'messages' column from this object.
dataset = load_dataset('json', data_files=r"your_path_here.json")

In [None]:
# Configure the CUDA caching allocator and release memory left over from earlier runs.
# The environment variable is assigned before `torch` is imported so that it is in
# place before any CUDA allocation happens.
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import gc

import torch

# Collect unreachable Python objects first so the tensors they hold are really freed;
# only then can the cached blocks be handed back by empty_cache().
gc.collect()

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # `_set_allocator_settings` is a private PyTorch helper, so it may be missing in
    # other PyTorch versions. It is looked up defensively and skipped when absent
    # (and never touched on CPU-only machines) instead of crashing the cell.
    _set_allocator_settings = getattr(torch.cuda.memory, "_set_allocator_settings", None)
    if _set_allocator_settings is not None:
        _set_allocator_settings("expandable_segments:True")

In [None]:
# Authenticate with the Hugging Face Hub.
# login() is called without arguments, so the credential is requested interactively
# (rendered as a widget in the notebook) rather than being written into the code.
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
from transformers import pipeline

In [4]:
from transformers import AutoTokenizer

In [None]:
# Identifier of the base model on the Hugging Face Hub.
model_id = 'mistralai/Mistral-7B-Instruct-v0.3'

# This run resumes from an earlier fine-tuning checkpoint, so the tokenizer is loaded
# from the checkpoint directory. For a fresh run, delete resume_model_path and pass
# model_id instead.
resume_model_path = r"D:\Camtour\src\model\chatbot_v0.2\checkpoint-750" 

# Load the tokenizer (fresh-run alternative kept for reference):
# tokenizer = AutoTokenizer.from_pretrained(model_id)

tokenizer = AutoTokenizer.from_pretrained(resume_model_path)
# Reuse the end-of-sequence token as the padding token.
# NOTE(review): with pad == eos, a collator that masks padding out of the labels would
# presumably also mask the genuine EOS token of each example — confirm this is acceptable.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Render one example's conversation into a single training string
def chat_format(examples):
    """Apply the tokenizer's chat template to ``examples["messages"]``.

    The template is applied with ``tokenize=False``, so the result is text rather
    than token ids. Returns a dict with a single ``"prompt"`` entry holding it.
    """
    conversation = examples["messages"]
    rendered = tokenizer.apply_chat_template(conversation, tokenize=False)
    return {"prompt": rendered}

In [7]:
# Fallback: register a dedicated '[PAD]' token when the tokenizer has no pad token.
# The tokenizer-loading cell earlier in this notebook already assigns pad_token from
# eos_token, so when the cells run top to bottom this condition is false and nothing is added.
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

In [8]:
# Run chat_format over the dataset; its return value adds a "prompt" column.
format_dataset = dataset.map(chat_format)

In [9]:
# Preview the first two templated training prompts.
for row_index in range(2):
    print(format_dataset['train'][row_index]['prompt'])

<s>[INST] What is the general characteristic of the people of Cambodia, according to Adventure-Cambodia?[/INST] Friendly people.</s>
<s>[INST] What does a blue flag indicate at the Royal Palace?[/INST] That's a great question! The King is currently in residence.</s>


In [None]:
# Report whether PyTorch can see a CUDA device, and which one
import torch

if not torch.cuda.is_available():
    print("CUDA is not available. PyTorch will use the CPU.")
else:
    print("CUDA is available. PyTorch can utilize the GPU.")
    print(f"Number of GPUs available: {torch.cuda.device_count()}")
    print(f"GPU Name: {torch.cuda.get_device_name(0)}")

CUDA is available. PyTorch can utilize the GPU.
Number of GPUs available: 1
GPU Name: NVIDIA GeForce RTX 3080


In [11]:
# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Interpolate the value into the message. Passing `{device}` as a separate print
# argument builds a one-element set, which is displayed as {'cuda'}.
print(f"device: {device}")

device:  {'cuda'}


In [None]:
# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the chat-templated ``"prompt"`` text of a batch of rows.

    Args:
        examples: Mapping with a ``"prompt"`` entry holding the templated text
            (a list of strings when called through a batched ``map``).

    Returns:
        The tokenizer's output for those prompts, truncated to at most 1024
        tokens and left unpadded.
    """
    return tokenizer(
        examples["prompt"],
        # The chat template has already written <s> ... </s> into the text, so the
        # tokenizer must not prepend a second BOS token on top of it.
        add_special_tokens=False,
        truncation=True,
        max_length=1024,
        # Padding is left to the data collator used for training, so rows are not
        # all stretched to the longest prompt of the whole map batch. Plain Python
        # lists are returned because the dataset stores them as lists anyway.
        padding=False,
    )

# Apply tokenize_function across the dataset; batched=True hands it groups of rows per call.
tokenized_dataset = format_dataset.map(tokenize_function, batched=True)

In [None]:
import transformers
from transformers import AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from peft import LoraConfig
from transformers import DataCollatorForLanguageModeling, Trainer

In [None]:
# Collator that assembles tokenized examples into training batches.
# mlm=False switches off masked-language-modelling, which is the setting for
# causal (next-token) models such as the one fine-tuned here.
# NOTE(review): this collator presumably pads each batch and derives the labels from
# input_ids — confirm against the transformers documentation.

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [17]:
# Quantization settings handed to from_pretrained when the model is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,  # load the weights in 4-bit precision
    bnb_4bit_use_double_quant=True,  # enable double quantization
    bnb_4bit_quant_type="nf4",  # use the "nf4" 4-bit quantization type
    bnb_4bit_compute_dtype=torch.bfloat16  # compute dtype is bfloat16
)

In [18]:
# NOTE(review): this only creates a notebook-level variable. It is not passed to
# BitsAndBytesConfig or from_pretrained in any visible cell, so it has no effect on
# how the model is loaded — presumably it was meant to be a config argument.
llm_int8_enable_fp32_cpu_offload=True

In [None]:
# Load the model weights using the 4-bit quantization config (bnb_config).
# For a fresh run, pass model_id here instead of resume_model_path.
model = AutoModelForCausalLM.from_pretrained(
    resume_model_path,
    quantization_config=bnb_config,
    device_map="auto", # Let the library place the model on the available device(s)
    # NOTE(review): trust_remote_code permits running model code shipped with the
    # checkpoint — confirm it is actually required for this model.
    trust_remote_code=True
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [None]:
# LoRA adapter settings; adjust them based on your dataset and model.
# The rank r is commonly 4, 8, 16, 32, etc.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    # Adapters are attached to the modules whose names appear in this list.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

In [21]:
from peft import get_peft_model, TaskType

In [None]:
# Wrap the loaded model with the LoRA adapters described by lora_config.
# NOTE: `model` is rebound to the wrapped result, so re-running this cell without
# reloading the model would pass the already-wrapped model in again.
model = get_peft_model(model, lora_config)



In [None]:
# Raise the transformers log level to INFO so training progress (step and loss)
# is shown in the console.
# Note: `logging` here is transformers' own logging module, not the standard-library one.
from transformers.utils import logging

logging.set_verbosity_info()

In [None]:
# Training hyperparameters consumed by the Trainer (important)
training_args = TrainingArguments(
    output_dir=r"D:\Camtour\src\model\chatbot_v0.3",
    per_device_train_batch_size=4, # Adjust to your GPU memory (e.g. 4, 8, 16)
    gradient_accumulation_steps=2, # Raise this (and lower the batch size) when GPU memory is limited
    learning_rate=2e-4,
    fp16=True,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,
    optim="adamw_torch",  # The adamw_torch optimizer is recommended
    warmup_steps=50,
    # The string "none" is what switches experiment-tracker integrations off.
    # Python None means "not set" and, in transformers 4.x, falls back to the
    # library default of reporting to every installed integration.
    report_to="none",
    disable_tqdm=False,
    # Only relevant when checkpoints are compared by a metric; it has no effect
    # while load_best_model_at_end is False.
    greater_is_better=False,
    # Memory-related options
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
    remove_unused_columns=True,
    load_best_model_at_end=False,
    save_safetensors=True,
)

In [25]:
# Build the Trainer from the model, the training arguments and the data collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    # .get() yields None when there is no "test" split, i.e. no evaluation set is passed.
    eval_dataset=tokenized_dataset.get("test"),
    data_collator=data_collator,
)

Using auto half precision backend
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [None]:
# Start fine-tuning.
# NOTE(review): no resume_from_checkpoint argument is passed, so presumably only the
# weights loaded earlier carry over and the optimizer/step state starts fresh — confirm.
trainer.train()

The following columns in the Training set don't have a corresponding argument in `PeftModelForCausalLM.forward` and have been ignored: prompt, messages. If prompt, messages are not expected by `PeftModelForCausalLM.forward`,  you can safely ignore this message.
skipped Embedding(32768, 4096): 128.0M params
skipped: 128.0M params
***** Running training *****
  Num examples = 1,000
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 750
  Number of trainable parameters = 13,631,488
  attn_output = torch.nn.functional.scaled_dot_product_attention(


In [None]:
def ask_chatbot(prompt, chat_history=None):
    """Generate the assistant's reply to ``prompt`` given the prior conversation.

    Args:
        prompt: The user's message for this turn.
        chat_history: Optional list of ``{"role": ..., "content": ...}`` dicts from
            earlier turns. The list is extended in place with the new user and
            assistant messages; a new list is started when it is omitted.

    Returns:
        A ``(new_response, chat_history)`` tuple: the decoded reply text and the
        updated history, ready to be passed back in for the next turn.
    """
    if chat_history is None:
        chat_history = []
    chat_history.append({"role": "user", "content": prompt})

    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    input_ids = tokenizer.apply_chat_template(
        chat_history,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)
    # The prompt is a single, unpadded sequence, so every position is a real token.
    # Passing the mask explicitly avoids generate() having to guess it from the pad
    # token id, which is ambiguous when the pad token is the EOS token.
    attention_mask = torch.ones_like(input_ids)

    # Generation must run in eval mode: straight after training the model is still in
    # train mode, which keeps dropout active. The previous mode is restored afterwards
    # so calling this function does not change the model's state.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model.generate(
                input_ids,
                attention_mask=attention_mask,
                max_new_tokens=256,
                do_sample=True,
                temperature=0.6,
                top_p=0.9,
                repetition_penalty=1.15,
                pad_token_id=tokenizer.pad_token_id,
            )
    finally:
        if was_training:
            model.train()

    # generate() returns prompt + continuation; keep only the newly produced tokens.
    num_input_tokens = input_ids.shape[1]
    response_ids = outputs[0][num_input_tokens:]
    new_response = tokenizer.decode(response_ids, skip_special_tokens=True)
    chat_history.append({"role": "assistant", "content": new_response})

    return new_response, chat_history


In [None]:
# Two-turn demo: the follow-up question relies on the history returned by the first call.
print("Chatbot is ready. Ask a question!")
prompt1 = "What are some must-see temples in Cambodia besides Angkor Wat?"
prompt2 = "Tell me more about the one with the giant stone faces."

response1, history = ask_chatbot(prompt1)
print(f"User: {prompt1}", f"Bot: {response1}\n", sep="\n")

response2, history = ask_chatbot(prompt2, chat_history=history)
print(f"User: {prompt2}", f"Bot: {response2}\n", sep="\n")