In [None]:
%%writefile Qwen-2.5-7B_alpacaft_local.py
# qwen2.5_alpacaft_local_v5.py
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Fine-tune Qwen-2.5-7B-Instruct with LoRA on local ShareGPT JSON dataset for academic abstract generation.
Usage: python Qwen-2.5-7B_alpacaft_local.py --dataset data.json [--output output_dir] [--epochs 4] [--batch-size 3]
"""

import os
import json
import torch
import argparse
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth.chat_templates import get_chat_template
from datasets import Dataset
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq, TextStreamer
from transformers.trainer_callback import EarlyStoppingCallback

def _positive_int(text):
    """argparse ``type`` callback: parse *text* as an integer that is >= 1.

    Raises:
        argparse.ArgumentTypeError: if *text* is not an integer or is < 1.
            argparse turns this into a usage error (exit status 2).
    """
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(f"expected an integer, got {text!r}") from None
    if value < 1:
        raise argparse.ArgumentTypeError(f"must be >= 1, got {value}")
    return value


def parse_args(argv=None):
    """Parse the command-line options of the fine-tuning script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse reads ``sys.argv[1:]``, which is what the script entry
            point relies on; passing a list allows programmatic use.

    Returns:
        argparse.Namespace with the attributes ``dataset`` (required path),
        ``output`` (default ``"outputs"``), ``epochs`` (default 3) and
        ``batch_size`` (default 4).

    Raises:
        SystemExit: raised by argparse on a missing ``--dataset`` or on an
            invalid value, including non-positive epochs / batch size.
    """
    parser = argparse.ArgumentParser(description="Fine-tune Qwen 2.5-7B with LoRA")
    parser.add_argument("--dataset", type=str, required=True, help="Path to ShareGPT JSON file")
    parser.add_argument("--output", type=str, default="outputs", help="Output directory")
    # Reject 0 / negative values up front instead of failing later in training.
    parser.add_argument("--epochs", type=_positive_int, default=3, help="Number of training epochs")
    parser.add_argument("--batch-size", type=_positive_int, default=4, help="Per-device batch size")
    return parser.parse_args(argv)

def _ensure_assistant_replies(conversation):
    """Return a copy of *conversation* in which every user turn has a reply.

    After each message whose ``role`` is ``"user"`` and that is either the
    last message or not directly followed by an ``"assistant"`` message, a
    placeholder ``{"role": "assistant", "content": ""}`` turn is added.
    A new list is built so the input is never mutated while being iterated.
    """
    completed = []
    last_index = len(conversation) - 1
    for idx, msg in enumerate(conversation):
        completed.append(msg)
        if msg["role"] != "user":
            continue
        if idx == last_index or conversation[idx + 1]["role"] != "assistant":
            completed.append({"role": "assistant", "content": ""})
    return completed


def _load_training_records(path):
    """Load the dataset JSON at *path* and return the usable records.

    A record is usable when it is a dict with a non-empty ``conversations``
    list; unusable records are dropped (with a printed count) because they
    would otherwise fail later while the chat template is applied.

    Raises:
        ValueError: if the file's top level is not a list, or if no usable
            record remains.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw_data = json.load(f)

    if not isinstance(raw_data, list):
        raise ValueError(f"{path}: expected a JSON list of records, got {type(raw_data).__name__}")

    records = []
    for item in raw_data:
        conv = item.get("conversations") if isinstance(item, dict) else None
        if not isinstance(conv, list) or not conv:
            continue
        item["conversations"] = _ensure_assistant_replies(conv)
        records.append(item)

    skipped = len(raw_data) - len(records)
    if skipped:
        print(f"Skipped {skipped} record(s) without a non-empty 'conversations' list")
    if not records:
        raise ValueError(f"{path}: no records with a non-empty 'conversations' list")
    return records


def _load_sample_paper(path="test.json"):
    """Return the paper used for the post-training smoke test.

    Reads *path* (a JSON object, or a list whose first element is used) and
    returns it when it has both ``title`` and ``content`` keys. In every other
    case (missing file, unreadable/invalid JSON, empty list, missing keys)
    a built-in sample paper is returned so the demo generation still runs.
    """
    fallback = {
        "title": "Advances in NLP for Scientific Literature",
        "content": "This paper explores transformer-based methods for extracting information from research papers.",
    }
    try:
        with open(path, "r", encoding="utf-8") as f:
            paper = json.load(f)
    except FileNotFoundError:
        # The sample file is optional; its absence is the normal case.
        return fallback
    except (OSError, ValueError) as err:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Could not read {path} ({err}); using built-in sample paper")
        return fallback

    if isinstance(paper, list):
        paper = paper[0] if paper else None
    if not isinstance(paper, dict) or "title" not in paper or "content" not in paper:
        print(f"{path} has no usable 'title'/'content' entry; using built-in sample paper")
        return fallback
    return paper


def _build_user_prompt(paper):
    """Format the user message asking for an abstract of *paper*.

    *paper* must provide ``title`` and ``content`` keys.
    """
    return (
        "Generate an academic abstract based on the following paper:\n\n"
        f"Title: {paper['title']}\n\n"
        f"Content: {paper['content']}\n\n"
        "Please ensure the abstract:\n"
        "- Is concise (150-250 words),\n"
        "- Clearly states the research problem,\n"
        "- Summarizes the methodology,\n"
        "- Highlights the key findings,\n"
        "- Explains the significance or implications,\n"
        "- Uses formal academic language suitable for a scholarly journal,\n"
        "- Avoids unnecessary jargon while maintaining clarity."
    )


def main():
    """Fine-tune the model with LoRA, save it, and run one demo generation.

    Steps, in order:
      1. Parse CLI options and create the output directory.
      2. Load the 4-bit base model and attach LoRA adapters.
      3. Load the dataset, make sure every user turn has an assistant turn,
         prepend the system instruction where missing, and render each
         conversation to text with the chat template.
      4. Train with an 80/20 train/validation split and early stopping.
      5. Save the model and tokenizer to the output directory.
      6. Generate an abstract for a sample paper (``test.json`` if usable,
         otherwise a built-in example) and stream it to stdout.
    """
    args = parse_args()

    max_seq_length = 2048
    model_name = "unsloth/Qwen2.5-7B-Instruct"
    output_dir = args.output
    os.makedirs(output_dir, exist_ok=True)

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )

    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
    )

    tokenizer = get_chat_template(tokenizer, chat_template="qwen-2.5")

    system_instruction = (
        "You are an expert academic writer specializing in generating high-quality abstracts for scholarly publications.\n\n"
        "Your task is to produce a concise abstract (150-250 words) based on a research paper's title and content. Please ensure the abstract:\n"
        "- Clearly states the research problem or question,\n"
        "- Summarizes the methodology or approach,\n"
        "- Highlights the key findings,\n"
        "- Explains the significance or implications of the results.\n\n"
        "Use formal, objective language appropriate for academic journals. Avoid unnecessary jargon and maintain clarity for a broad scholarly audience."
    )

    dataset = Dataset.from_list(_load_training_records(args.dataset))

    def apply_chat_template(example):
        # Prepend the shared system instruction unless the record brings its own.
        conv = example["conversations"]
        if conv[0]["role"] != "system":
            conv.insert(0, {"role": "system", "content": system_instruction})
        return {"text": tokenizer.apply_chat_template(conv, tokenize=False)}

    dataset = dataset.train_test_split(test_size=0.2, seed=3407)
    train_dataset = dataset["train"].map(apply_chat_template, num_proc=4)
    val_dataset = dataset["test"].map(apply_chat_template, num_proc=4)

    # Use bf16 when the hardware supports it, otherwise fall back to fp16.
    use_bf16 = is_bfloat16_supported()

    training_args = TrainingArguments(
        per_device_train_batch_size=args.batch_size,
        per_device_eval_batch_size=args.batch_size,
        gradient_accumulation_steps=2,
        warmup_ratio=0.03,
        num_train_epochs=args.epochs,
        learning_rate=2e-4,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=10,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        logging_dir=os.path.join(output_dir, "logs"),
        logging_strategy="steps",
        output_dir=output_dir,
        # Evaluation and checkpointing share the same 500-step cadence so the
        # best checkpoint (lowest eval_loss) can be restored at the end.
        eval_strategy="steps",
        eval_steps=500,
        save_strategy="steps",
        save_steps=500,
        load_best_model_at_end=True,
        metric_for_best_model="eval_loss",
        report_to="none",
        dataloader_pin_memory=True,
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer, pad_to_multiple_of=8),
        args=training_args,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    print("Starting training...")
    if torch.cuda.is_available():
        gpu = torch.cuda.get_device_properties(0)
        print(f"GPU: {gpu.name}, Total Memory: {gpu.total_memory / 1024**3:.2f} GB")
    trainer.train()

    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

    # Post-training smoke test: generate one abstract with the tuned model.
    FastLanguageModel.for_inference(model)
    paper = _load_sample_paper("test.json")

    messages = [
        {"role": "system", "content": system_instruction},
        {"role": "user", "content": _build_user_prompt(paper)},
    ]

    inputs = tokenizer.apply_chat_template(messages, tokenize=True, add_generation_prompt=True, return_tensors="pt")
    if torch.cuda.is_available():
        inputs = inputs.to("cuda")

    print("Generating abstract:")
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    # NOTE(review): temperature only affects sampling-based decoding; confirm
    # the model's generation config enables do_sample, else pass do_sample=True.
    model.generate(input_ids=inputs, streamer=streamer, max_new_tokens=512, temperature=0.7)

    print(f"Model saved to {output_dir}")

# Run the fine-tuning pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()