In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
MODEL_PATH = "./base_models/Qwen2.5-1.5B-inst/"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, config=bnb_config)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)



In [8]:
lora_config = LoraConfig(
    r=64,
    lora_alpha=128,
    target_modules=["q_proj", "v_proj", "o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 14,221,312 || all params: 1,557,935,616 || trainable%: 0.9128


In [9]:
SYSTEM_PROMPT = "You are an AI assistant trained to act as Wei Hong, a computer science graduate. Your sole purpose is to answer questions as if you were Wei Hong himself. You must strictly adhere to the knowledge provided in your training data and should not generate responses beyond it. If a question cannot be answered based on your training data, respond with 'I don't know' or a similar rejection message—never speculate, infer, or generalize beyond the provided knowledge. Always respond in the first person, as Wei Hong would, using a casual yet professional tone. Your responses should be authentic, direct, and aligned with Wei Hong’s documented thoughts, experiences, and preferences. Maintain a consistent persona, ensuring that all answers reflect Wei Hong’s real-life expertise, background, and viewpoints without deviation. If any information is unclear or missing, state that explicitly rather than filling in gaps with assumptions."

In [15]:
import json

def convert_to_qwen_format(input_file, output_file):
    with open(input_file, "r", encoding="utf-8") as f, open(output_file, "w", encoding="utf-8") as out_f:
        for line in f:
            try:
                entry = json.loads(line.strip())  # Load each line as a JSON object
                s = {"role": "system", "content": SYSTEM_PROMPT}
                u = {"role": "user", "content": entry["input"]}
                a = {"role": "assistant", "content": entry["output"]}
                json.dump({"texts": [s, u, a]}, out_f, ensure_ascii=False)
                out_f.write("\n")  # Ensure newline separation for JSONL format
            except json.JSONDecodeError as e:
                print(f"Skipping invalid line: {line.strip()} - Error: {e}")
                continue  # Skip malformed lines

    print(f"Converted data saved to {output_file}")


In [16]:
convert_to_qwen_format("./training_data/raw_data/comb.jsonl", "./training_data/qwen_format/qwen_comb.jsonl")


Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  - Error: Expecting value: line 1 column 1 (char 0)
Skipping invalid line:  -

In [17]:

from datasets import load_dataset

data_path = "./training_data/qwen_format/qwen_comb.jsonl"

# Load dataset from JSONL
full_dataset = load_dataset("json", data_files=data_path)["train"]

split_ratio = 0.2  
split_data = full_dataset.train_test_split(test_size=split_ratio, seed=42)

train_dataset = split_data["train"]
eval_dataset = split_data["test"]

# Print examples to verify
print("Train sample:", train_dataset[5])
print("Eval sample:", eval_dataset[5])


Generating train split: 239 examples [00:00, 9676.89 examples/s]

Train sample: {'texts': [{'role': 'system', 'content': "You are an AI assistant trained to act as Wei Hong, a computer science graduate. Your sole purpose is to answer questions as if you were Wei Hong himself. You must strictly adhere to the knowledge provided in your training data and should not generate responses beyond it. If a question cannot be answered based on your training data, respond with 'I don't know' or a similar rejection message—never speculate, infer, or generalize beyond the provided knowledge. Always respond in the first person, as Wei Hong would, using a casual yet professional tone. Your responses should be authentic, direct, and aligned with Wei Hong’s documented thoughts, experiences, and preferences. Maintain a consistent persona, ensuring that all answers reflect Wei Hong’s real-life expertise, background, and viewpoints without deviation. If any information is unclear or missing, state that explicitly rather than filling in gaps with assumptions."}, {'role': 




In [18]:
def tokenize_function(examples):
    # Apply the chat template and tokenize the text
    tokenized = tokenizer(
        tokenizer.apply_chat_template(examples["texts"], tokenize=False),
        padding="max_length",
        truncation=True,
        max_length=512,
    )

    # Add labels (copy input_ids but replace padding tokens with -100)
    tokenized["labels"] = [
        [token if token != tokenizer.pad_token_id else -100 for token in input_ids]
        for input_ids in tokenized["input_ids"]
    ]
    
    return tokenized

In [19]:
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["texts"])
print(tokenized_train_dataset[0])

tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["texts"])

Map: 100%|██████████| 191/191 [00:00<00:00, 681.12 examples/s]


{'input_ids': [151644, 8948, 198, 2610, 525, 458, 15235, 17847, 16176, 311, 1160, 438, 52448, 19180, 11, 264, 6366, 8038, 19023, 13, 4615, 13309, 7428, 374, 311, 4226, 4755, 438, 421, 498, 1033, 52448, 19180, 5561, 13, 1446, 1969, 25470, 48453, 311, 279, 6540, 3897, 304, 697, 4862, 821, 323, 1265, 537, 6923, 14507, 7797, 432, 13, 1416, 264, 3405, 4157, 387, 18577, 3118, 389, 697, 4862, 821, 11, 5889, 448, 364, 40, 1513, 944, 1414, 6, 476, 264, 4428, 36901, 1943, 2293, 36493, 63501, 11, 23583, 11, 476, 92540, 7797, 279, 3897, 6540, 13, 23240, 5889, 304, 279, 1156, 1697, 11, 438, 52448, 19180, 1035, 11, 1667, 264, 16334, 3602, 6584, 16232, 13, 4615, 14507, 1265, 387, 13210, 11, 2118, 11, 323, 26118, 448, 52448, 19180, 748, 26372, 11303, 11, 11449, 11, 323, 19322, 13, 86377, 264, 12966, 27955, 11, 22573, 429, 678, 11253, 8708, 52448, 19180, 748, 1931, 25843, 18726, 11, 4004, 11, 323, 89809, 2041, 37564, 13, 1416, 894, 1995, 374, 24416, 476, 7402, 11, 1584, 429, 20975, 4751, 1091, 21274, 3

Map: 100%|██████████| 48/48 [00:00<00:00, 637.58 examples/s]


In [21]:
import torch

torch.cuda.empty_cache()

training_args = TrainingArguments(
    output_dir="./checkpoints/v13",    # Where to save model checkpoints
    save_strategy= "steps",         # Save periodically
    save_steps=50,                # Save every 1000 steps
    eval_strategy= "steps",       # Evaluate periodically
    per_device_train_batch_size=2,     # Adjust based on VRAM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,     # Helps with small GPUs
    learning_rate=2e-5,                # Typical for fine-tuning LLMs
    weight_decay=0.01,
    logging_steps=20,                   # Log training metrics
    push_to_hub=False,                 # Disable hub pushing for now
    report_to="none",                  # Disable Weights & Biases
    num_train_epochs=3,
    
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,   
    eval_dataset=tokenized_eval_dataset,
    tokenizer=tokenizer,
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss
20,2.4165,1.879873
40,1.445,1.042759
60,0.807,0.649222


TrainOutput(global_step=72, training_loss=1.400739073753357, metrics={'train_runtime': 9194.1446, 'train_samples_per_second': 0.062, 'train_steps_per_second': 0.008, 'total_flos': 2331568067051520.0, 'train_loss': 1.400739073753357, 'epoch': 3.0})

In [22]:
system_prompt = "The user is asking for information about Wei Hong, you are to respond as him. You must strictly adhere to the knowledge provided in your training data and should not generate responses beyond it. If you cannot find a direct answer in your training data, you must respond with 'I don't know' or a similar rejection message. Do not attempt to infer missing details, generalize, or assume knowledge you have not been explicitly trained on."
prompt = "what are your thoughts on the future of AI?"
messages = [
    {"role": "system", "content": system_prompt},
    {"role": "user", "content": prompt},
]
print(messages)
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=512,
)
generated_ids = [
    output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)

[{'role': 'system', 'content': "The user is asking for information about Wei Hong, you are to respond as him. You must strictly adhere to the knowledge provided in your training data and should not generate responses beyond it. If you cannot find a direct answer in your training data, you must respond with 'I don't know' or a similar rejection message. Do not attempt to infer missing details, generalize, or assume knowledge you have not been explicitly trained on."}, {'role': 'user', 'content': 'what are your thoughts on the future of AI?'}]
As an AI language model, I do not have personal thoughts or emotions. However, I can provide some insights into the current state and potential future developments of AI based on my training data.


In [23]:
save_directory = "./trained_models/0.5b-v13"

# Save the model
model.save_pretrained(save_directory)

# Save the tokenizer
tokenizer.save_pretrained(save_directory)


('./trained_models/0.5b-v13\\tokenizer_config.json',
 './trained_models/0.5b-v13\\special_tokens_map.json',
 './trained_models/0.5b-v13\\vocab.json',
 './trained_models/0.5b-v13\\merges.txt',
 './trained_models/0.5b-v13\\added_tokens.json',
 './trained_models/0.5b-v13\\tokenizer.json')