In [1]:
# jane_austen_finetune.py
# Minimal fine-tuning pipeline: fine-tune a small causal LLM (TinyLlama 1.1B) on pg31100.txt (Jane Austen corpus)
# Requires: transformers, datasets, peft, accelerate, bitsandbytes, trl

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer
from peft import LoraConfig
import torch, re, json




In [2]:
# ---------- 1. Preprocess corpus ----------
SRC_FILE = "pg31100.txt"

text = open(SRC_FILE, encoding="utf-8", errors="ignore").read()
text = re.sub(r'\r', '', text)
text = re.sub(r'\n{3,}', '\n\n', text)
text = text.strip()

# split into moderate chunks (~3k chars)
chunks = []
for i in range(0, len(text), 3000):
    chunks.append(text[i:i+3000])
    if i % 100 == 0:
        print(i)
cut = int(0.97 * len(chunks))
open("train.jsonl","w",encoding="utf-8").writelines(json.dumps({"text":t})+"\n" for t in chunks[:cut])
open("val.jsonl","w",encoding="utf-8").writelines(json.dumps({"text":t})+"\n" for t in chunks[cut:])



0
3000
6000
9000
12000
15000
18000
21000
24000
27000
30000
33000
36000
39000
42000
45000
48000
51000
54000
57000
60000
63000
66000
69000
72000
75000
78000
81000
84000
87000
90000
93000
96000
99000
102000
105000
108000
111000
114000
117000
120000
123000
126000
129000
132000
135000
138000
141000
144000
147000
150000
153000
156000
159000
162000
165000
168000
171000
174000
177000
180000
183000
186000
189000
192000
195000
198000
201000
204000
207000
210000
213000
216000
219000
222000
225000
228000
231000
234000
237000
240000
243000
246000
249000
252000
255000
258000
261000
264000
267000
270000
273000
276000
279000
282000
285000
288000
291000
294000
297000
300000
303000
306000
309000
312000
315000
318000
321000
324000
327000
330000
333000
336000
339000
342000
345000
348000
351000
354000
357000
360000
363000
366000
369000
372000
375000
378000
381000
384000
387000
390000
393000
396000
399000
402000
405000
408000
411000
414000
417000
420000
423000
426000
429000
432000
435000
438000
441000
44400

In [3]:
# ---------- 2. Load datasets ----------
train = load_dataset("json", data_files="train.jsonl", split="train")
val   = load_dataset("json", data_files="val.jsonl",   split="train")

Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [4]:
# ---------- 3. Model & tokenizer ----------
MODEL = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
tokenizer = AutoTokenizer.from_pretrained(MODEL, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [5]:
# ---------- 4. LoRA configuration ----------
peft_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"],
    task_type="CAUSAL_LM"
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL,
    load_in_4bit=True,
    dtype=torch.bfloat16,
    device_map="auto"
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors:   0%|          | 0.00/4.40G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/129 [00:00<?, ?B/s]

In [6]:
# ---------- 5. Training ----------
args = TrainingArguments(
    output_dir="austen-lora",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=8,
    num_train_epochs=2,
    learning_rate=2e-4,
    eval_strategy="steps",
    eval_steps=500,
    logging_steps=100,
    save_steps=1000,
    bf16=True,
    report_to="none"
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_cfg,
    train_dataset=train,
    eval_dataset=val,
    dataset_text_field="text",
    max_seq_length=1024,
    packing=True,
    args=args,
)

trainer.train()
trainer.model.save_pretrained("austen-lora-final")
tokenizer.save_pretrained("austen-lora-final")

TypeError: SFTTrainer.__init__() got an unexpected keyword argument 'tokenizer'

In [None]:
# ---------- 6. Sampling ----------
from peft import PeftModel
base = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, device_map="auto")
adapter = PeftModel.from_pretrained(base, "austen-lora-final")

prompt = "A letter was delivered to Miss Bennet, whose surprise equalled her curiosity."
inputs = tokenizer(prompt, return_tensors="pt").to(adapter.device)
gen = adapter.generate(**inputs, max_new_tokens=180, temperature=0.7, top_p=0.9, do_sample=True)
print(tokenizer.decode(gen[0], skip_special_tokens=True))