# Data preprocessing

In [17]:
import json
import os
DATA_PATH = "../data"   

DATASET = "dataset-v4.md"
JSONL_FILE = "finetune_{}.jsonl".format(DATASET.split(".")[0])    
# Read the raw markdown file
with open(os.path.join(DATA_PATH, DATASET), "r", encoding="utf-8") as infile:
    data = infile.read()

# Split on a chosen delimiter, for example, a double newline between examples
raw_examples = [entry.strip() for entry in data.split("\n\n") if entry.strip()]

raw_examples = list(filter(lambda x: "---" in x, raw_examples))

# Option 1: If your file is already in prompt/response pairs separated by a specific marker,
# you can split further. For example, assuming a delimiter like "----" splits prompt and response:
processed_examples = []



for example in raw_examples:
    if "---" in example:
        prompt, response = example.split("\n---\n", 1)
        processed_examples.append({
            "instruction": prompt.strip(),
            "response": response.strip()
        })
    else:
        # If not, you can treat the entire example as text (for language modeling)
        processed_examples.append({"text": example})

# Write out to a JSONL file

if not os.path.exists(os.path.join(DATA_PATH, JSONL_FILE)):

    with open(os.path.join(DATA_PATH, JSONL_FILE), "w", encoding="utf-8") as outfile:
        for example in processed_examples:
            outfile.write(json.dumps(example, ensure_ascii=False) + "\n")

In [10]:
import os
DATA_PATH = "C:\\Projects\\mlp\\data\\"

from datasets import load_dataset
from transformers import AutoTokenizer

def tokenize_dataset(dataset_name, tokenizer, max_length=256):
    """
    General function to tokenize a dataset using the provided tokenizer.
    
    Args:
        dataset: The dataset to tokenize
        tokenizer: The tokenizer to use
        max_length: Maximum sequence length for tokenization
        
    Returns:
        Tokenized dataset with train and validation splits
    """
    jsonl_file = f"finetune_dataset-{dataset_name}.jsonl"
    dataset = load_dataset("json", data_files={"train":DATA_PATH + jsonl_file})
    def tokenize_example(example):
        instruction = example["instruction"]
        response = example["response"]

        # Concatenate the instruction and response, using a separator.
        text = instruction + "\n" + response

        output = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
        labels = output["input_ids"].copy()

        # Obtain the tokenized instruction length (without special tokens if needed)
        instruction_token_len = len(tokenizer(instruction, add_special_tokens=False)["input_ids"])
        
        # Mask the instruction tokens so loss is computed only on the response
        labels[:instruction_token_len] = [-100] * instruction_token_len

        output["labels"] = labels
        return output
    
    # Apply tokenization
    tokenized_dataset = dataset.map(tokenize_example)
    
    # Split into train and validation
    split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)

    
    # Apply tokenization with our general function
    train_dataset, validation_dataset = split_dataset["train"], split_dataset["test"]
    print(len(train_dataset))
    print(len(validation_dataset))
    print("-----")
    print(tokenizer.decode(train_dataset[80]["input_ids"]))
    print("-----")
    print(tokenizer.decode(validation_dataset[1]["input_ids"]))
    
    return train_dataset, validation_dataset



# Finetuning

In [33]:
import torch

torch.cuda.is_available()

True

In [85]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer

    
# Load the model's original tokenizer
model_name = "Qwen/Qwen2.5-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=64,                     # Rank of the update matrices
    lora_alpha=64,           # Parameter for scaling
    lora_dropout=0.1,        # Dropout probability for LoRA layers
    bias="none",             # Don't train bias parameters
    target_modules=["q_proj", "v_proj", "o_proj"],
)

print("Loading base model...")


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map={"":0}
)

model = get_peft_model(model, peft_config)


Loading base model...


In [71]:
train_dataset, validation_dataset = tokenize_dataset('v2', tokenizer)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/487 [00:00<?, ? examples/s]

438
49
-----
DTFORMAT: MMDDYYYY
in 5 years 12/08
DTSTART:{ref + (years=5, month=12, day=8)}<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|

In [47]:
model.device

device(type='cuda', index=0)

In [86]:
from transformers import Trainer, TrainingArguments


# Define training arguments
training_args = TrainingArguments(
    output_dir="{}_{}".format(DATASET.split(".")[0].removeprefix("dataset-"), model_name.split("/")[-1]),
    per_device_train_batch_size=8 ,      # Adjust batch size as needed
    per_device_eval_batch_size=8,
    logging_steps=10,
    evaluation_strategy="steps",        # Evaluate based on training steps
    num_train_epochs=4,                 # Total training epochs
    save_steps=100,                     # Save checkpoint every X steps
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,                         # Disable mixed precision training    
)

# Instantiate the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,       # Your prepared train dataset
    eval_dataset=validation_dataset,     # Optionally, your validation dataset
)

# Put the model in training mode
model.train()

# Start the training process
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
10,13.0258,6.385834
20,2.9149,0.514799
30,0.4105,0.307643
40,0.2395,0.163842
50,0.1486,0.107605
60,0.0778,0.077161
70,0.0772,0.064671
80,0.0586,0.057942
90,0.0517,0.054535
100,0.042,0.050128


TrainOutput(global_step=550, training_loss=0.32361112274229525, metrics={'train_runtime': 201.7792, 'train_samples_per_second': 21.707, 'train_steps_per_second': 2.726, 'total_flos': 2455441735680000.0, 'train_loss': 0.32361112274229525, 'epoch': 10.0})

In [89]:
# Set the model to evaluation mode
model.eval()

# Define a function for inference
def generate_text(prompt, max_length=100):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    # Generate text
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Example usage
test_prompt = "jan 2nd"
generated_output = generate_text(test_prompt)
print("Input prompt:", test_prompt)
print("\nGenerated text:")
print(generated_output)


Input prompt: jan 2nd

Generated text:
jan 2nd
DTSTART:{ref + (day=1, month=1, 1J)}


In [25]:
from transformers import DistilBertTokenizer, DistilBertForQuestionAnswering
import torch


In [41]:
from peft import get_peft_model, LoraConfig, TaskType

# Define LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.QUESTION_ANS,
    r=64,                     # Rank of the update matrices
    lora_alpha=64,           # Parameter for scaling
    lora_dropout=0.1,        # Dropout probability for LoRA layers
    bias="none",             # Don't train bias parameters
    target_modules=["q_lin", "k_lin", "v_lin"],
)

print("Loading base model...")

model_name =  'distilbert-base-cased-distilled-squad'
tokenizer = DistilBertTokenizer.from_pretrained(model_name)


model = DistilBertForQuestionAnswering.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map={"":0}
)

# Print model modules to identify the correct target modules for LoRA
# for name, module in model.named_modules():
#     print(f"Module name: {name}, Type: {type(module).__name__}")

model = get_peft_model(model, peft_config)

Loading base model...


In [27]:
model.device

device(type='cuda', index=0)

In [39]:
def tokenize_dataset(dataset_name, tokenizer, max_length=256):
    """
    General function to tokenize a dataset using the provided tokenizer.
    
    Args:
        dataset: The dataset to tokenize
        tokenizer: The tokenizer to use
        max_length: Maximum sequence length for tokenization
        
    Returns:
        Tokenized dataset with train and validation splits
    """
    jsonl_file = f"finetune_dataset-{dataset_name}.jsonl"
    dataset = load_dataset("json", data_files={"train":DATA_PATH + jsonl_file})
    def tokenize_example(example):
        # This method automatically inserts the [CLS] token at the start,
        # separates the two parts with a [SEP] token, and generates token_type_ids.
        return tokenizer(
            example["instruction"],
            example["response"],
            truncation=True,
            max_length=256,
            padding="max_length"
        )
    
    # Apply tokenization
    tokenized_dataset = dataset.map(tokenize_example)
    def remove_labels(batch):
        if "labels" in batch:
            del batch["labels"]
        return batch

    tokenized_dataset = tokenized_dataset.map(remove_labels)
    
    # Split into train and validation
    split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)

    
    # Apply tokenization with our general function
    train_dataset, validation_dataset = split_dataset["train"], split_dataset["test"]
    print(len(train_dataset))
    print(len(validation_dataset))
    print("-----")
    print(tokenizer.decode(train_dataset[80]["input_ids"]))
    print("-----")
    print(tokenizer.decode(validation_dataset[1]["input_ids"]))
    
    return train_dataset, validation_dataset

In [40]:
train_dataset, validation_dataset =  tokenize_dataset('v2', tokenizer)

KeyError: 'labels'

In [43]:
from transformers import Trainer, TrainingArguments

# Define training arguments
training_args = TrainingArguments(
    output_dir="{}_{}".format('v2', model_name.split("/")[-1]),
    per_device_train_batch_size=8 ,      # Adjust batch size as needed
    per_device_eval_batch_size=8,
    logging_steps=10,
    evaluation_strategy="steps",        # Evaluate based on training steps
    num_train_epochs=4,                 # Total training epochs
    save_steps=100,                     # Save checkpoint every X steps
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,                         # Disable mixed precision training    
)

# Instantiate the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,       # Your prepared train dataset
    eval_dataset=validation_dataset,     # Optionally, your validation dataset
)

# Put the model in training mode
model.train()

# Start the training process
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForQuestionAnswering`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


ValueError: The model did not return a loss from the inputs, only the following keys: start_logits,end_logits. For reference, the inputs it received are input_ids,attention_mask.