In [None]:
import configparser
import gc
import os

import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, TrainingArguments
from transformers import DataCollatorForLanguageModeling, Trainer

# Parse the project's INI settings. ConfigParser.read() skips files it cannot
# open and returns the list of files it actually parsed, so a missing
# 'configure.ini' leaves `config` empty instead of raising. Because this is
# the last expression of the cell, that list is what the cell displays.
config = configparser.ConfigParser()
config.read('configure.ini')

In [None]:
# Ask PyTorch's CUDA caching allocator to use expandable segments.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

# Collect garbage first: tensors that are only kept alive by reference cycles
# must be freed before empty_cache() can hand their blocks back to the driver.
gc.collect()

# Only touch CUDA-specific APIs when a GPU is present, so the CPU fallback in
# the device-detection cell stays reachable on machines without CUDA.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    # Private API: applies the same setting to an allocator that may already
    # have been configured before the environment variable was set above.
    torch.cuda.memory._set_allocator_settings("expandable_segments:True")


In [None]:
# Select the compute device once, then report which one was picked.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cuda":
    # Index 0 is the first visible GPU.
    print(f"CUDA available. GPU: {torch.cuda.get_device_name(0)}")
else:
    print("CUDA not available. Using CPU.")

In [None]:
# Read the fine-tuning corpus with the 'json' loader. The path is a
# placeholder and must point at the real file before running. Later cells
# index the result with ["train"] and read a "messages" field from each row.
dataset = load_dataset(
    'json',
    data_files=r"your_new_dataset_path.json",
)

In [1]:
# Config-driven alternative, currently disabled in favour of the literals below:
# model_id = config.get('model', 'base_model')
# previous_fine_tuned_model = config.get('model', 'resume_model')

# Hub id of the base model.
# NOTE(review): no later cell shown here reads `model_id` — confirm it is still needed.
model_id = 'mistralai/Mistral-7B-Instruct-v0.3'
# Checkpoint directory that the tokenizer cell and the model-loading cell read from.
resume_model_path = r"D:\Camtour\src\model\chatbot_v0.2\checkpoint-750"
# Destination for the Trainer's output and for the final model/tokenizer save.
output_dir = r"D:\Camtour\src\model\chatbot_v0.3"

In [None]:
# Load the tokenizer from the checkpoint directory being resumed.
tokenizer = AutoTokenizer.from_pretrained(resume_model_path)

# Pick a pad token that is distinct from EOS whenever one is available.
# DataCollatorForLanguageModeling(mlm=False) sets the label of every position
# holding the pad token id to -100, so padding with EOS would also remove the
# genuine end-of-sequence tokens from the loss and the model would never be
# trained to stop generating.
if tokenizer.unk_token is not None and tokenizer.unk_token_id != tokenizer.eos_token_id:
    tokenizer.pad_token = tokenizer.unk_token
else:
    # No separate token to pad with: fall back to reusing EOS.
    tokenizer.pad_token = tokenizer.eos_token

# Pad on the left-hand side of each sequence.
tokenizer.padding_side = "left"

In [None]:
def chat_format(examples):
    """Render one dataset row's chat messages into a single prompt string.

    Args:
        examples: one row (the mapping call below is not batched) whose
            "messages" entry is passed to the tokenizer's chat template.

    Returns:
        {"prompt": <rendered text>}. If anything raises while rendering, the
        error is printed and {"prompt": ""} is returned instead.
    """
    try:
        # tokenize=False: return the templated text rather than token ids.
        rendered = tokenizer.apply_chat_template(examples["messages"], tokenize=False)
    except Exception as e:
        print(f"Error processing messages: {e}")
        rendered = ""
    return {"prompt": rendered}

# Render every row to a single "prompt" column, discarding the raw columns.
templated_dataset = dataset.map(chat_format, remove_columns=dataset["train"].column_names)

# chat_format() signals a row it could not render by returning an empty
# prompt. Drop those rows so they are not tokenized into training examples
# that carry no text.
format_dataset = templated_dataset.filter(lambda row: row["prompt"] != "")

# Preview at most two rendered prompts, 200 characters each.
print("Sample formatted prompts:")
for i in range(min(2, len(format_dataset['train']))):
    print(f"Example {i}: {format_dataset['train'][i]['prompt'][:200]}...")

In [None]:
def tokenize_function(examples):
    """Tokenize a batch of rendered prompts, truncating each to 2048 tokens.

    Args:
        examples: a batch as supplied by Dataset.map(batched=True);
            examples["prompt"] holds the prompt strings.

    Returns:
        The tokenizer's encoding for the batch as plain Python lists, one
        variable-length sequence per prompt.
    """
    # No padding and no tensor conversion here. The language-modeling data
    # collator pads each batch to its own longest sequence at training time,
    # so padding every example to 2048 tokens up front only inflates the
    # stored dataset and the per-step compute. Dataset.map stores the result
    # in Arrow anyway, so returning torch tensors gains nothing.
    return tokenizer(
        examples["prompt"],
        truncation=True,
        max_length=2048,
    )

# Run the tokenizer over the prompts in batches of 8 rows and drop the text
# column so only the tokenizer's outputs remain.
tokenized_dataset = format_dataset.map(
    tokenize_function,
    remove_columns=["prompt"],
    batch_size=8,
    batched=True,
)


In [None]:
# bitsandbytes quantization settings handed to from_pretrained when the model
# is loaded.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,                      # store weights in 4-bit
    bnb_4bit_quant_type="nf4",              # NF4 quantization data type
    bnb_4bit_use_double_quant=True,         # enable double quantization
    bnb_4bit_compute_dtype=torch.bfloat16,  # dtype used for computation
)

In [None]:
print("Loading previously fine-tuned model...")

# Keyword options for from_pretrained, gathered in one place.
model_load_options = dict(
    quantization_config=bnb_config,  # 4-bit settings from the quantization cell
    device_map="auto",               # let the library decide device placement
    # NOTE(review): this permits model code shipped with the checkpoint to
    # run — confirm it is actually required for this checkpoint.
    trust_remote_code=True,
)

# Load the checkpoint that training resumes from.
previous_fine_tuned_model = AutoModelForCausalLM.from_pretrained(
    resume_model_path,
    **model_load_options,
)


In [None]:
# Prepare the quantized model for training before adapters are attached.
# This is PEFT's documented preparation step for k-bit models: it freezes the
# base weights and sets the model up so gradients can reach the adapters when
# gradient checkpointing is enabled (as it is in the training arguments).
# The call modifies the model in place and returns it.
kbit_ready_model = prepare_model_for_kbit_training(previous_fine_tuned_model)

# LoRA hyper-parameters for this training round.
new_lora_config = LoraConfig(
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    # Attention projections and MLP projections receive adapters.
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

print("Applying new LoRA configuration...")
model = get_peft_model(kbit_ready_model, new_lora_config)

# only_trainable=True so the number matches the "trainable" label; without it
# num_parameters() returns the total parameter count.
print(f"Model trainable parameters: {model.num_parameters(only_trainable=True)/1e6:.2f}M")
model.print_trainable_parameters()

In [None]:
# Batch builder for causal language modeling: mlm=False turns off masked-LM
# style token masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
# Match the Trainer's mixed-precision mode to the bfloat16 compute dtype used
# by the 4-bit quantization config when the GPU supports it. Otherwise keep
# fp16 autocast on CUDA, and use neither when no GPU is present.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,  # 4 x 2 = 8 sequences per device per optimizer step
    learning_rate=1e-4,
    bf16=use_bf16,
    fp16=torch.cuda.is_available() and not use_bf16,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="epoch",
    save_total_limit=2,  # keep only the two most recent checkpoints
    optim="adamw_torch",
    warmup_steps=50,
    # The string "none" disables logging integrations. Passing None is treated
    # as "all" by transformers 4.x and would enable every installed reporter.
    report_to="none",
    disable_tqdm=False,
    dataloader_pin_memory=False,
    gradient_checkpointing=True,
    # The tokenized dataset is passed to the collator as-is.
    remove_unused_columns=False,
    save_safetensors=True,
    logging_first_step=True,
)

In [None]:
# The "train" split is required; "test" is optional and .get() yields None
# when that split is absent.
train_split = tokenized_dataset["train"]
eval_split = tokenized_dataset.get("test")

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_split,
    eval_dataset=eval_split,
)

print("Starting training...")
trainer.train()

# Write the trained model first, then the tokenizer, into the same directory.
print("Saving model...")
for artifact in (model, tokenizer):
    artifact.save_pretrained(output_dir)
print(f"Model saved to {output_dir}")