# Data preprocessing

In [1]:
import json
import os
DATA_PATH = "../data"   

DATASET = "dataset-v4.md"
JSONL_FILE = "finetune_{}.jsonl".format(DATASET.split(".")[0])    
# Read the raw markdown file
with open(os.path.join(DATA_PATH, DATASET), "r", encoding="utf-8") as infile:
    data = infile.read()

# Split on a chosen delimiter, for example, a double newline between examples
raw_examples = [entry.strip() for entry in data.split("\n\n") if entry.strip()]

raw_examples = list(filter(lambda x: "---" in x, raw_examples))

# Option 1: If your file is already in prompt/response pairs separated by a specific marker,
# you can split further. For example, assuming a delimiter like "----" splits prompt and response:
processed_examples = []



for example in raw_examples:
    if "---" in example:
        prompt, response = example.split("\n---\n", 1)
        processed_examples.append({
            "instruction": prompt.strip(),
            "response": response.strip()
        })
    else:
        # If not, you can treat the entire example as text (for language modeling)
        processed_examples.append({"text": example})

# Write out to a JSONL file

if not os.path.exists(os.path.join(DATA_PATH, JSONL_FILE)):

    with open(os.path.join(DATA_PATH, JSONL_FILE), "w", encoding="utf-8") as outfile:
        for example in processed_examples:
            outfile.write(json.dumps(example, ensure_ascii=False) + "\n")

In [None]:
import os
DATA_PATH = "C:\\Projects\\mlp\\data\\"

from datasets import load_dataset

def tokenize_dataset(dataset_name, tokenizer, max_length=256):
    """
    General function to tokenize a dataset using the provided tokenizer.
    
    Args:
        dataset: The dataset to tokenize
        tokenizer: The tokenizer to use
        max_length: Maximum sequence length for tokenization
        
    Returns:
        Tokenized dataset with train and validation splits
    """
    jsonl_file = f"finetune_dataset-{dataset_name}.jsonl"
    dataset = load_dataset("json", data_files={"train":DATA_PATH + jsonl_file})
    def tokenize_example(example):
        instruction = example["instruction"]
        response = example["response"]

        # Concatenate the instruction and response, using a separator.
        text = instruction + "\n" + response

        output = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
        labels = output["input_ids"].copy()


        # Obtain the tokenized instruction length (without special tokens if needed)
        instruction_token_len = len(tokenizer(instruction, add_special_tokens=False)["input_ids"])
        
        # Mask the instruction tokens so loss is computed only on the response
        labels[:instruction_token_len] = [-100] * instruction_token_len

        output["labels"] = labels
        return output
    
    # Apply tokenization
    tokenized_dataset = dataset.map(tokenize_example)
    
    # Split into train and validation
    split_dataset = tokenized_dataset["train"].train_test_split(test_size=0.1)

    
    # Apply tokenization with our general function
    train_dataset, validation_dataset = split_dataset["train"], split_dataset["test"]
    print(len(train_dataset))
    print(len(validation_dataset))
    print("-----")
    print(tokenizer.decode(train_dataset[80]["input_ids"]))
    print("-----")
    print(tokenizer.decode(validation_dataset[1]["input_ids"]))
    
    return train_dataset, validation_dataset



# Finetuning

In [None]:
from unsloth import FastModel
import torch

fourbit_models = [
    # 4bit dynamic quants for superior accuracy and low memory use
    "unsloth/gemma-3-1b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-4b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-12b-it-unsloth-bnb-4bit",
    "unsloth/gemma-3-27b-it-unsloth-bnb-4bit",

    # Other popular models!
    "unsloth/Llama-3.1-8B",
    "unsloth/Llama-3.2-3B",
    "unsloth/Llama-3.3-70B",
    "unsloth/mistral-7b-instruct-v0.3",
    "unsloth/Phi-4",
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastModel.from_pretrained(
    model_name = "unsloth/gemma-3-4b-it",
    max_seq_length = 2048, # Choose any for long context!
    load_in_4bit = True,  # 4 bit quantization to reduce memory
    load_in_8bit = False, # [NEW!] A bit more accurate, uses 2x memory
    full_finetuning = False, # [NEW!] We have full finetuning now!
    # token = "hf_...", # use one if using gated models
)




from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

# 4bit pre quantized models we support for 4x faster downloading + no OOMs.
fourbit_models = [
    "unsloth/Meta-Llama-3.1-8B-bnb-4bit",      # Llama-3.1 15 trillion tokens model 2x faster!
    "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit",
    "unsloth/Meta-Llama-3.1-70B-bnb-4bit",
    "unsloth/Meta-Llama-3.1-405B-bnb-4bit",    # We also uploaded 4bit for 405b!
    "unsloth/Mistral-Nemo-Base-2407-bnb-4bit", # New Mistral 12b 2x faster!
    "unsloth/Mistral-Nemo-Instruct-2407-bnb-4bit",
    "unsloth/mistral-7b-v0.3-bnb-4bit",        # Mistral v3 2x faster!
    "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
    "unsloth/Phi-3.5-mini-instruct",           # Phi-3.5 2x faster!
    "unsloth/Phi-3-medium-4k-instruct",
    "unsloth/gemma-2-9b-bnb-4bit",
    "unsloth/gemma-2-27b-bnb-4bit",            # Gemma 2x faster!
] # More models at https://huggingface.co/unsloth

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-3.5-mini-instruct",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)



model = FastModel.get_peft_model(
    model,
    finetune_vision_layers     = False, # Turn off for just text!
    finetune_language_layers   = True,  # Should leave on!
    finetune_attention_modules = True,  # Attention good for GRPO
    finetune_mlp_modules       = True,  # SHould leave on always!

    r = 8,           # Larger = higher accuracy, but might overfit
    lora_alpha = 8,  # Recommended alpha == r at least
    lora_dropout = 0,
    bias = "none",
    random_state = 3407,
)

from trl import SFTTrainer, SFTConfig
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, # Can set up evaluation!
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # Use GA to mimic batch size!
        warmup_steps = 5,
        # num_train_epochs = 1, # Set this for 1 full training run.
        max_steps = 30,
        learning_rate = 2e-4, # Reduce to 2e-5 for long training runs
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", # Use this for WandB etc
    ),
)

OSError: [WinError 126] The specified module could not be found. Error loading "c:\Projects\mlp\unsloth\Lib\site-packages\torch\lib\torch_python.dll" or one of its dependencies.

In [24]:
from peft import get_peft_model, LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer

    
# Load the model's original tokenizer
model_name = "unsloth/phi-4-GGUF"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Example: Print all modules in the model
# for name, module in model.named_modules():
#     print(name)

# Define LoRA configuration
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,                     # Rank of the update matrices
    lora_alpha=64,           # Parameter for scaling
    lora_dropout=0.1,        # Dropout probability for LoRA layers
    bias="none",             # Don't train bias parameters
    target_modules=["o_proj"],
)

print("Loading base model...")


model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype="auto",
    device_map={"":0}
)

model = get_peft_model(model, peft_config)

Loading base model...


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [22]:
train_dataset, validation_dataset = tokenize_dataset('v4', tokenizer)

473
53
-----
first sat of feb
DTSTART:{ref + (month=2, day=1, 1SA)}<|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftext|><|endoftex

In [23]:
model.device

device(type='cuda', index=0)

In [25]:
from transformers import Trainer, TrainingArguments


# Define training arguments
training_args = TrainingArguments(
    output_dir="{}_{}".format(DATASET.split(".")[0].removeprefix("dataset-"), model_name.split("/")[-1]),
    per_device_train_batch_size=8 ,      # Adjust batch size as needed
    per_device_eval_batch_size=8,
    logging_steps=10,
    evaluation_strategy="steps",        # Evaluate based on training steps
    num_train_epochs=5,                 # Total training epochs
    save_steps=100,                     # Save checkpoint every X steps
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,                         # Disable mixed precision training    
)

# Instantiate the Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_dataset,       # Your prepared train dataset
    eval_dataset=validation_dataset,     # Optionally, your validation dataset
)

# Put the model in training mode
model.train()

# Start the training process
trainer.train()

  trainer = Trainer(
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


KeyboardInterrupt: 

In [49]:
# Set the model to evaluation mode
model.eval()

# Define a function for inference
def generate_text(prompt, max_length=100):
    # Tokenize the input prompt
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    
    # Generate text
    with torch.no_grad():  # Disable gradient calculation for inference
        outputs = model.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_length=max_length,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.pad_token_id
        )
    
    # Decode the generated text
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return generated_text

# Example usage
test_prompt = "next mon 1045"
generated_output = generate_text(test_prompt)
print("Input prompt:", test_prompt)
print("\nGenerated text:")
print(generated_output)


Both `max_new_tokens` (=2048) and `max_length`(=100) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


Input prompt: next mon 1045

Generated text:
next mon 10453
DTSTART:{ref + (day=1, mon=10)}T1045300
DURATION:PT1H


In [71]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, 
 
torch.random.manual_seed(0)

model_path = "microsoft/Phi-4-mini-instruct"

model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained(model_path)


from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=32,
    lora_alpha=64,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "v_proj"],
)

model = get_peft_model(model, peft_config)


print(model.device)

def tokenize_example(example):
    instruction = example["instruction"]
    response = example["response"]

    # Concatenate the instruction and response, using a separator.
    text = instruction + "\n" + response

    output = tokenizer(text, truncation=True, max_length=max_length, padding="max_length")
    labels = output["input_ids"].copy()

    # Obtain the tokenized instruction length (without special tokens if needed)
    instruction_token_len = len(tokenizer(instruction, add_special_tokens=False)["input_ids"])
    
    # Mask the instruction tokens so loss is computed only on the response
    labels[:instruction_token_len] = [-100] * instruction_token_len

    output["labels"] = labels
    return output


from datasets import load_dataset
from transformers import AutoTokenizer

# Load the dataset from the JSONL file
dataset = load_dataset("json", data_files={"train": "..\\data\\finetuning_dataset-v4.jsonl"})

# Apply tokenization
tokenized_dataset = dataset.map(tokenize_example)

# Split into train and validation
split_dataset = tokenized_dataset.train_test_split(test_size=0.1)

train_dataset, validation_dataset = split_dataset["train"], split_dataset["test"]

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
