In [14]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model


In [15]:
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:512"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch

print(torch.cuda.is_available())
print(torch.cuda.current_device())
print(torch.cuda.get_device_name(0))
print(torch.cuda.reset_max_memory_allocated())
print(torch.cuda.memory_allocated(0))


True
0
NVIDIA GeForce RTX 4060 Laptop GPU
None
5451612672




In [16]:
import bitsandbytes as bnb

In [17]:
# Load 4-bit quantized model
# model_name = "Qwen/Qwen2.5-0.5B-Instruct" 
model_path = "./model/Qwen2.5-0.5B-inst/"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,       # Enables 4-bit quantization
    bnb_4bit_use_double_quant=True,  # Improves precision with double quantization
    bnb_4bit_quant_type="nf4",  # Uses NF4 quantization (recommended for LLMs)
    bnb_4bit_compute_dtype="float16"  # Uses FP16 for calculations
)


model = AutoModelForCausalLM.from_pretrained(
    model_path,
    quantization_config=bnb_config,
    device_map="auto",
    
)


tokenizer = AutoTokenizer.from_pretrained(model_path)




In [18]:
print(torch.cuda.memory_allocated() / 1e9, "GB allocated")
print(torch.cuda.memory_reserved() / 1e9, "GB reserved")

5.636547072 GB allocated
5.970591744 GB reserved


In [19]:
print(model)

Qwen2ForCausalLM(
  (model): Qwen2Model(
    (embed_tokens): Embedding(151936, 896)
    (layers): ModuleList(
      (0-23): 24 x Qwen2DecoderLayer(
        (self_attn): Qwen2Attention(
          (q_proj): Linear4bit(in_features=896, out_features=896, bias=True)
          (k_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (v_proj): Linear4bit(in_features=896, out_features=128, bias=True)
          (o_proj): Linear4bit(in_features=896, out_features=896, bias=False)
        )
        (mlp): Qwen2MLP(
          (gate_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (up_proj): Linear4bit(in_features=896, out_features=4864, bias=False)
          (down_proj): Linear4bit(in_features=4864, out_features=896, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
        (post_attention_layernorm): Qwen2RMSNorm((896,), eps=1e-06)
      )
    )
    (norm): Qwen2RMSNorm((896,), eps=1e-06)
    (

In [None]:
# Apply LoRA configuration (Reduces training VRAM usage)
lora_config = LoraConfig(
    r=4,  # Low-rank dimension
    lora_alpha=8,  # Scaling factor
    target_modules=["q_proj", "v_proj", "k_proj"],  # Target modules for low-rank decomposition
    lora_dropout=0.1,  # Regularization
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 270,336 || all params: 494,303,104 || trainable%: 0.0547


In [21]:

from datasets import load_dataset


In [None]:
def convert_to_chat_format(data):
    """
    Convert dataset from {"input": ..., "output": ...} format to chat format
    {"role": "user", "content": input},
    {"role": "assistant", "content": output}
    """
    chat_data = [
        {"role": "user", "content": data["input"]},
        {"role": "assistant", "content": data["output"]}
    ]
    return chat_data

# Example dataset
example_data = {
    "input": "What is your name?",
    "output": "My name is Wei Hong."
}

In [2]:
import json

path = "./data.jsonl"

with open(path, "r") as f:
    data = json.load(f)

    formatted_data = []

    for d in data:
        formatted_data.append(convert_to_chat_format(d))

with open("qwenFormat.txt", "w", encoding="utf-8") as f:
        json.dump(formatted_data, f, ensure_ascii=False, indent=4)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [None]:





# Load dataset from JSONL
train_dataset = load_dataset("json", data_files={"train": path})["train"]

# Apply ChatML formatting to every example
train_dataset = train_dataset.map(format_chatml)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", max_length=2048, truncation=True)

tokenized_datasets = train_dataset.map(tokenize_function, remove_columns=["text", "input", "output"])

Map: 100%|██████████| 67/67 [00:00<00:00, 1019.73 examples/s]


In [23]:
print(tokenized_datasets[0])

{'input_ids': [151644, 872, 3555, 374, 697, 220, 829, 30, 151645, 151644, 77091, 3017, 829, 374, 52448, 19180, 13, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 151645, 15164

In [24]:
train_dataset = tokenized_datasets.with_format("torch")
print(train_dataset[0])

{'input_ids': tensor([151644,    872,   3555,  ..., 151645, 151645, 151645]), 'attention_mask': tensor([1, 1, 1,  ..., 0, 0, 0])}


In [25]:
torch.cuda.empty_cache()

In [27]:

training_args = TrainingArguments(
    output_dir="./qwen2.5-finetuned",    # Where to save model checkpoints
    eval_strategy="no",       # Evaluate periodically
    per_device_train_batch_size=1,     # Adjust based on VRAM
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,     # Helps with small GPUs
    learning_rate=2e-5,                # Typical for fine-tuning LLMs
    weight_decay=0.01,
    logging_steps=50,                   # Log training metrics
    push_to_hub=False,                 # Disable hub pushing for now
    report_to="none",                  # Disable Weights & Biases
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset["input_ids"],   
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


AttributeError: 'list' object has no attribute 'keys'

In [None]:
# Save model and tokenizer locally
output_dir = "./qwen2-finetuned"
trainer.save_model(output_dir)
tokenizer.save_pretrained(output_dir)

In [None]:

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are Qwen, created by Alibaba Cloud. You are a helpful assistant."},
    {"role": "user", "content": prompt},
]
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]