In [2]:
import json

# Convert the project export (a single JSON array) into JSON Lines:
# one {"text": ..., "metadata": {...}} record per line.
input_file = "../data/projects.json"
output_file = "../data/projects.jsonl"

# Source fields that are grouped under "metadata" in every output record.
metadata_fields = ("projekt", "kategorie", "datum")

with open(input_file, "r", encoding="utf-8") as source:
    projects = json.load(source)

with open(output_file, "w", encoding="utf-8") as sink:
    for project in projects:
        line = json.dumps(
            {
                "text": project["text"],
                "metadata": {field: project[field] for field in metadata_fields},
            },
            # Keep umlauts and other non-ASCII characters readable in the file.
            ensure_ascii=False,
        )
        sink.write(line + "\n")


In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM

# Base checkpoint that is fine-tuned with LoRA further down.
model_name = "Qwen/Qwen2.5-3B"

# Qwen2.5 uses the Qwen2 architecture that ships with transformers, so no
# Hub-hosted modelling code is needed. trust_remote_code is deliberately left
# at its default (False): enabling it would only grant permission to execute
# arbitrary Python from the model repository.
tokenizer = AutoTokenizer.from_pretrained(model_name)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",   # let accelerate place the weights on the available device(s)
    torch_dtype="auto",  # use the dtype stored in the checkpoint
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
W0204 08:15:08.463000 2284 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [4]:
from peft import LoraConfig, get_peft_model

# Attention projection layers that receive LoRA adapters.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]

lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=8,               # adapter rank
    lora_alpha=16,     # twice the rank
    lora_dropout=0.05,
    bias="none",       # bias terms are not trained
)

# NOTE(review): `model` is rebound to the wrapped model, so re-running this
# cell passes the already wrapped model to get_peft_model again. Reload the
# base model before re-running.
model = get_peft_model(model, lora_config)

# Print how many parameters are trainable after wrapping.
model.print_trainable_parameters()

trainable params: 3,686,400 || all params: 3,089,625,088 || trainable%: 0.11931544750584314


In [5]:
# -------------------------------
# Load dataset
# -------------------------------
from datasets import load_dataset, Dataset

# Original dataset: one record per project
dataset = load_dataset("json", data_files="../data/projects.jsonl")["train"]

# -------------------------------
# Expand the dataset with several instruction variants
# -------------------------------
# Question templates; each one is filled with the project name of a sample.
instruction_templates = (
    "Erkläre das Projekt {projekt} in einem Satz.",
    "Wofür war das Projekt {projekt} zuständig?",
    "Welche Probleme traten beim Projekt {projekt} auf?",
    "Nenne die Kategorie des Projekts {projekt}.",
    "Wann fand das Projekt {projekt} statt?",
)

all_examples = []

for ex in dataset:
    meta = ex.get("metadata", {})
    projekt = meta.get("projekt", "")
    # NOTE(review): kategorie and datum are read but not used by any template,
    # and every variant below is paired with the same full project text.
    kategorie = meta.get("kategorie", "")
    datum = meta.get("datum", "")

    # One new sample per instruction variant, all sharing the project's text
    all_examples.extend(
        {
            "text": ex["text"],
            "metadata": ex["metadata"],
            "instruction": template.format(projekt=projekt),
        }
        for template in instruction_templates
    )

# New, expanded dataset
dataset_expanded = Dataset.from_list(all_examples)

# -------------------------------
# Tokenization
# -------------------------------
def tokenize_fn(example):
    """Render one sample into the instruction/response template and tokenize it.

    Args:
        example: Mapping with the keys ``"instruction"`` (the question) and
            ``"text"`` (the answer the model should produce).

    Returns:
        The tokenizer output for the rendered prompt (truncated to at most
        512 tokens) with an extra ``"labels"`` entry that is an independent
        copy of ``"input_ids"``.
    """
    prompt = (
        f"### Instruction:\n"
        f"{example['instruction']}\n"
        f"Beziehe dich nur auf die Trainingsdaten, erfinde nichts.\n\n"
        f"### Response:\n"
        f"{example['text']}"
    )

    # Terminate every sample with the end-of-sequence token. Without it the
    # model never sees where a response stops and keeps generating at
    # inference time. Falls back to no suffix if the tokenizer defines none.
    eos_token = tokenizer.eos_token or ""

    tokenized_example = tokenizer(
        prompt + eos_token,
        truncation=True,
        max_length=512
    )

    # Labels for causal LM training: same ids as the inputs, but stored as a
    # separate list so later changes to one cannot leak into the other.
    tokenized_example["labels"] = list(tokenized_example["input_ids"])
    return tokenized_example

# batched=False: tokenize_fn is called with one sample at a time
tokenized = dataset_expanded.map(function=tokenize_fn, batched=False)

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/14625 [00:00<?, ? examples/s]

In [6]:
# Trainer Setup
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForSeq2Seq

# Collator that pads each batch to a common length (inputs and labels)
data_collator = DataCollatorForSeq2Seq(tokenizer, return_tensors="pt")

# Hyperparameters collected in one place; the comments carry over the
# original author's notes about running on Apple MPS.
training_options = {
    "output_dir": "./qwen2-lora",
    "num_train_epochs": 3,
    "learning_rate": 1e-4,               # stable for LoRA
    "per_device_train_batch_size": 1,    # kept very small for MPS
    "gradient_accumulation_steps": 8,    # 1 x 8 samples per optimizer step
    "fp16": False,                       # author's note: no FP16 in Trainer on MPS
    "bf16": False,                       # author's note: BF16 is blocked on MPS
    "save_strategy": "epoch",
    "logging_steps": 10,
    "report_to": "none",
}
training_args = TrainingArguments(**training_options)

# Wire model, data and collator together
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized,
    "tokenizer": tokenizer,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_options)

In [7]:
# Run the fine-tuning loop configured in the Trainer
trainer.train()

# Save the result to the directory the inference pipeline loads from
# ("../models/qwen2-lora-finetuned"). It was previously written to
# "./qwen2-lora-finetuned", which the inference cell never reads.
trainer.save_model("../models/qwen2-lora-finetuned")


  0%|          | 0/5484 [00:00<?, ?it/s]



{'loss': 3.6415, 'grad_norm': 0.9296875, 'learning_rate': 9.981765134938001e-05, 'epoch': 0.01}
{'loss': 3.0912, 'grad_norm': 1.3359375, 'learning_rate': 9.963530269876004e-05, 'epoch': 0.01}
{'loss': 2.3658, 'grad_norm': 1.8671875, 'learning_rate': 9.945295404814005e-05, 'epoch': 0.02}
{'loss': 1.7466, 'grad_norm': 1.71875, 'learning_rate': 9.927060539752007e-05, 'epoch': 0.02}
{'loss': 1.4318, 'grad_norm': 2.109375, 'learning_rate': 9.908825674690008e-05, 'epoch': 0.03}
{'loss': 1.2845, 'grad_norm': 1.9609375, 'learning_rate': 9.89059080962801e-05, 'epoch': 0.03}
{'loss': 1.063, 'grad_norm': 1.984375, 'learning_rate': 9.872355944566011e-05, 'epoch': 0.04}
{'loss': 0.9982, 'grad_norm': 2.15625, 'learning_rate': 9.854121079504012e-05, 'epoch': 0.04}
{'loss': 1.0415, 'grad_norm': 1.4609375, 'learning_rate': 9.835886214442014e-05, 'epoch': 0.05}
{'loss': 0.9298, 'grad_norm': 1.6640625, 'learning_rate': 9.817651349380015e-05, 'epoch': 0.05}
{'loss': 0.9564, 'grad_norm': 1.7734375, 'learni



{'loss': 0.6427, 'grad_norm': 1.3046875, 'learning_rate': 6.663019693654268e-05, 'epoch': 1.0}
{'loss': 0.5619, 'grad_norm': 1.625, 'learning_rate': 6.644784828592269e-05, 'epoch': 1.01}
{'loss': 0.5373, 'grad_norm': 1.171875, 'learning_rate': 6.62654996353027e-05, 'epoch': 1.01}
{'loss': 0.613, 'grad_norm': 1.296875, 'learning_rate': 6.608315098468272e-05, 'epoch': 1.02}
{'loss': 0.5004, 'grad_norm': 1.5078125, 'learning_rate': 6.590080233406273e-05, 'epoch': 1.02}
{'loss': 0.4756, 'grad_norm': 1.3046875, 'learning_rate': 6.571845368344275e-05, 'epoch': 1.03}
{'loss': 0.5488, 'grad_norm': 1.3203125, 'learning_rate': 6.553610503282276e-05, 'epoch': 1.03}
{'loss': 0.582, 'grad_norm': 1.03125, 'learning_rate': 6.535375638220278e-05, 'epoch': 1.04}
{'loss': 0.5704, 'grad_norm': 1.65625, 'learning_rate': 6.51714077315828e-05, 'epoch': 1.04}
{'loss': 0.59, 'grad_norm': 1.0234375, 'learning_rate': 6.49890590809628e-05, 'epoch': 1.05}
{'loss': 0.6494, 'grad_norm': 1.1171875, 'learning_rate': 



{'loss': 0.6181, 'grad_norm': 1.375, 'learning_rate': 3.326039387308534e-05, 'epoch': 2.0}
{'loss': 0.5152, 'grad_norm': 1.9765625, 'learning_rate': 3.307804522246535e-05, 'epoch': 2.01}
{'loss': 0.5035, 'grad_norm': 1.875, 'learning_rate': 3.2895696571845366e-05, 'epoch': 2.01}
{'loss': 0.5434, 'grad_norm': 2.609375, 'learning_rate': 3.271334792122538e-05, 'epoch': 2.02}
{'loss': 0.5408, 'grad_norm': 2.21875, 'learning_rate': 3.25309992706054e-05, 'epoch': 2.02}
{'loss': 0.4811, 'grad_norm': 1.2890625, 'learning_rate': 3.2348650619985414e-05, 'epoch': 2.03}
{'loss': 0.5692, 'grad_norm': 1.28125, 'learning_rate': 3.216630196936542e-05, 'epoch': 2.03}
{'loss': 0.4645, 'grad_norm': 1.734375, 'learning_rate': 3.198395331874544e-05, 'epoch': 2.04}
{'loss': 0.5239, 'grad_norm': 2.28125, 'learning_rate': 3.1801604668125455e-05, 'epoch': 2.05}
{'loss': 0.4693, 'grad_norm': 1.8828125, 'learning_rate': 3.161925601750547e-05, 'epoch': 2.05}
{'loss': 0.4734, 'grad_norm': 2.671875, 'learning_rate'

In [None]:
# Inference Pipeline
from transformers import pipeline

# Directory containing the fine-tuned model
finetuned_model = "../models/qwen2-lora-finetuned"

# Text-generation pipeline; reuses the tokenizer created when the base model
# was loaded earlier in the notebook.
pipe = pipeline(
    "text-generation",
    model=finetuned_model,
    tokenizer=tokenizer,
    device_map="auto",
    torch_dtype="auto"
)

# Example prompt. It has to follow the template used in tokenize_fn during
# training, including the sentence after the question. Otherwise the model is
# queried with a prompt layout it never saw during training.
prompt = (
    "### Instruction:\n"
    "Wie ist der aktuelle Stand der OAuth2-Migration beim Projekt Atlas?\n"
    "Beziehe dich nur auf die Trainingsdaten, erfinde nichts.\n\n"
    "### Response:\n"
)

# generated_text contains the prompt followed by up to 100 newly generated tokens
output = pipe(prompt, max_new_tokens=100)
print(output[0]["generated_text"])



Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]