In [None]:
# Reinstall specific compatible versions (transformers provides PreTrainedModel export)
!pip uninstall -y transformers peft trl
!pip install transformers==4.55.0 peft==0.18.0 trl accelerate bitsandbytes datasets


In [None]:
# Diagnostics: versions and PreTrainedModel availability
import sys, pkgutil, importlib
import importlib.util  # `import importlib` alone does not guarantee the `util` submodule is loaded
import torch, transformers


def _report_version(label, module_name):
    """Print ``<label>: <version>`` for ``module_name``.

    If the import (or the ``__version__`` lookup) fails, the error is printed
    instead of raised so the remaining diagnostics still run.
    """
    try:
        print(f'{label}:', importlib.import_module(module_name).__version__)
    except Exception as e:
        print(f'{label} import error:', e)


print('Python:', sys.version)
print('Torch:', torch.__version__)
print('Transformers:', transformers.__version__, '->', transformers.__file__)
print('Has PreTrainedModel attr:', hasattr(transformers,'PreTrainedModel'))
# Accelerate is guarded like TRL/PEFT so a missing package does not abort the cell.
_report_version('Accelerate', 'accelerate')
_report_version('TRL', 'trl')
_report_version('PEFT', 'peft')
# find_spec replaces pkgutil.find_loader, which is deprecated and removed in
# newer Python versions; it locates the package without importing it.
print('BitsAndBytes present:', importlib.util.find_spec('bitsandbytes') is not None)
print('\nFirst few sys.path entries:')
for p in sys.path[:8]:
    print(' ', p)
try:
    from transformers import PreTrainedModel
    print('Imported PreTrainedModel OK')
except Exception as e:
    print('PreTrainedModel import failed:', e)


In [None]:
# STEPS:
# TRAINING STEPS: install the Hugging Face API, import Mistral 7B, run the SFT trainer, and save the model. We can update this for RLHF later.

# DATA-CLEANING STEPS: reformat the prompt/response pairs into a format usable by Hugging Face.
  # Don't worry too much about file paths, etc. Just put in placeholders for now, since I'm going to download the notebook, run it locally, and update the paths during training.


In [None]:
# This is how Hugging Face expects input/output pairs; have the script clean up the provided datasets into this format.
<|im_start|>user # user just means user input
{prompt}
<|im_end|>
<|im_start|>assistant # assistant just means model output
{output}
<|im_end|>

# example prompt
<|im_start|>user
Explain gradient descent.
<|im_end|>

# example output
<|im_start|>assistant
Gradient descent is an iterative optimization algorithm...
<|im_end|>

In [None]:
import torch
# Summarise the installed PyTorch build and the CUDA / cuDNN stack it reports.
torch_env_report = (
    ("PyTorch version:", torch.__version__),
    ("CUDA available:", torch.cuda.is_available()),
    ("CUDA version compiled:", torch.version.cuda),
    ("cuDNN version:", torch.backends.cudnn.version()),
)
for label, value in torch_env_report:
    print(label, value)

In [None]:
import torch
from datasets import Dataset
from peft import LoraConfig, TaskType
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, TrainingArguments
from trl import SFTTrainer, SFTConfig

In [None]:
# 1. Configuration
MODEL_ID = "mistralai/Mistral-7B-v0.1"
OUTPUT_DIR = "./mistral_finetuned_v1"

# 2. Load Model with 4-bit quantization
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    quantization_config=bnb_config,
    device_map="auto",        # Don't do model.to(...)
)

# 3. Tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.pad_token = tokenizer.eos_token  # Fix for Mistral

# 4. Configure LoRA
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["q_proj", "v_proj"]
)

# 5. Prepare the quantized model for training, then apply LoRA.
# prepare_model_for_kbit_training is the PEFT-documented preprocessing step for
# k-bit (4/8-bit) base models and must run before get_peft_model().
from peft import get_peft_model, prepare_model_for_kbit_training
model = prepare_model_for_kbit_training(model)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()  # Debug info

In [None]:
import json
from pathlib import Path
from datasets import Dataset

# Legacy ChatML markers that get stripped from every prompt/completion.
USER_TOKEN, ASSISTANT_TOKEN, END_TOKEN = (
    "<|im_start|>user",
    "<|im_start|>assistant",
    "<|im_end|>",
)

def load_json_files(json_dir):
    """
    Load every ``*.json`` file directly inside a directory into one list.

    Files are read as UTF-8 in sorted filename order so the resulting example
    order is reproducible across runs and platforms. A file whose top-level
    JSON value is a list contributes each of its elements; a file whose
    top-level value is a single object contributes that object as one example.

    Args:
        json_dir: Directory to scan (str or Path). Sub-directories are not
            searched.

    Returns:
        list: The collected examples. Callers expect dicts with 'prompt' and
        'completion' fields; that schema is not validated here.

    Raises:
        ValueError: If a file's top-level JSON value is neither a list nor an
            object.
    """
    json_dir = Path(json_dir)
    examples = []

    for file in sorted(json_dir.glob("*.json")):
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file, "r", encoding="utf-8") as f:
            data = json.load(f)

        if isinstance(data, list):
            examples.extend(data)
        elif isinstance(data, dict):
            # extend() on a dict would add its *keys* as examples.
            examples.append(data)
        else:
            raise ValueError(
                f"{file}: expected a JSON list or object at top level, "
                f"got {type(data).__name__}"
            )

    return examples


def clean(text: str) -> str:
    """
    Strip the legacy ChatML markers out of ``text``.

    Every occurrence of the user-start, assistant-start and end markers is
    deleted (in that order), then leading and trailing whitespace is trimmed
    from what remains.
    """
    stripped = text
    for marker in (USER_TOKEN, ASSISTANT_TOKEN, END_TOKEN):
        stripped = stripped.replace(marker, "")
    return stripped.strip()


def merge_prompt_completion(example):
    """
    Render one example as a single Mistral-instruct training string.

    The 'prompt' and 'completion' values are each passed through ``clean``
    and then placed into an ``[INST] ... [/INST]`` template wrapped in
    literal ``<s>`` / ``</s>`` markers.

    NOTE(review): if the tokenizer also prepends its own BOS token, the
    literal ``<s>`` here may end up duplicated - confirm against the
    tokenizer/trainer settings in use.
    """
    template = "<s>[INST] {user} [/INST] {assistant}</s>"
    prompt_text = clean(example["prompt"])
    completion_text = clean(example["completion"])
    return template.format(user=prompt_text, assistant=completion_text)


def build_mistral_dataset(json_dir):
    """
    Build a ``Dataset`` with a single "text" column from a directory of JSON files.

    Each example returned by ``load_json_files(json_dir)`` is formatted with
    ``merge_prompt_completion`` and stored under the "text" key.
    """
    records = []
    for example in load_json_files(json_dir):
        records.append({"text": merge_prompt_completion(example)})
    return Dataset.from_list(records)


# ======== USAGE EXAMPLE ========
# Folder scanned for *.json files; point this at your own data directory.
DATASET_DIRECTORY = "."

dataset = build_mistral_dataset(json_dir=DATASET_DIRECTORY)
first_example = dataset[0]
print(first_example)  # eyeball the first formatted record


In [None]:
# Supervised fine-tuning settings, collected in one place and expanded into SFTConfig.
sft_settings = {
    "output_dir": OUTPUT_DIR,
    "per_device_train_batch_size": 1,
    "gradient_accumulation_steps": 4,
    "num_train_epochs": 3,
    "learning_rate": 2e-4,
    "logging_steps": 10,
    "save_steps": 100,
    "max_length": 1024,            # max token length
    "dataset_text_field": "text",  # column holding the formatted training string
}
sft_config = SFTConfig(**sft_settings)

# Build the trainer around the model that was already wrapped by get_peft_model().
trainer = SFTTrainer(
    model=model,
    args=sft_config,
    train_dataset=dataset,       # the Dataset built above with a "text" column
    processing_class=tokenizer,  # replaces `tokenizer=` in newer TRL
    peft_config=None,            # LoRA is already applied, so nothing to pass here
)

In [None]:
# 6. Run Training
print("Starting training...")
trainer.train()
# Persist the final weights explicitly: without this call nothing in this cell
# writes the finished model to OUTPUT_DIR, so the message below would be untrue.
trainer.save_model(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

In [None]:
# Load the cleaned and trimmed MassSynth dataset
from datasets import load_dataset

MED_SFT_JSONL = "mistral_med_sft_scaled.jsonl"  # JSON-lines file read by the "json" loader
dataset = load_dataset("json", data_files=MED_SFT_JSONL)
first_train_row = dataset['train'][0]
print(first_train_row)

In [None]:
# Second fine-tuning run. OUTPUT_DIR must be switched to the v2 path BEFORE
# SFTConfig is built; otherwise this run's checkpoints are written into the
# previous run's directory while the final message reports the v2 path.
OUTPUT_DIR = "./mistral_finetuned_v2"

sft_config = SFTConfig(
    output_dir=OUTPUT_DIR,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    num_train_epochs=3,
    learning_rate=2e-4,
    logging_steps=10,
    save_steps=100,
    max_length=1024,          # max token length
    dataset_text_field="text" # your column name; default is already "text"
)
trainer = SFTTrainer(
    model=model,              # your LoRA-wrapped Mistral
    args=sft_config,
    train_dataset=dataset['train'],    # train split of the loaded JSONL; assumes a "text" column - TODO confirm
    processing_class=tokenizer,  # <-- replaces `tokenizer=` in newer TRL
    peft_config=None,         # you already did get_peft_model(), so no need here
)
# 6. Run Training
print("Starting training...")
trainer.train()
# Write the final weights to OUTPUT_DIR so the message below is accurate.
trainer.save_model(OUTPUT_DIR)
print(f"Model saved to {OUTPUT_DIR}")

In [19]:
# Write the trained model and its tokenizer files into the run's output folder.
model.save_pretrained(save_directory=OUTPUT_DIR)
# Kept as the cell's last expression so the saved tokenizer file paths are displayed.
tokenizer.save_pretrained(save_directory=OUTPUT_DIR)

('./mistral_finetuned_v2\\tokenizer_config.json',
 './mistral_finetuned_v2\\special_tokens_map.json',
 './mistral_finetuned_v2\\tokenizer.json')