In [1]:
# 查看cuda版本
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Tue_May_27_02:24:01_Pacific_Daylight_Time_2025
Cuda compilation tools, release 12.9, V12.9.86
Build cuda_12.9.r12.9/compiler.36037853_0


In [1]:
# DND故事生成LLM微调与浏览器部署.ipynb

# ====================================================================
# 第一步：环境设置
# ====================================================================
#
# 首先，我们需要安装所有必需的库。
# transformers: Hugging Face的核心库。
# accelerate: 用于简化多GPU或混合精度训练的工具。
# bitsandbytes: 用于模型量化，是实现QLoRA的关键。
# peft: 参数高效微调库，包含LoRA实现。
# datasets: Hugging Face的数据处理库。
# trl: 用于指令微调和SFT（Supervised Fine-tuning）的工具库。

# %pip install transformers accelerate bitsandbytes peft datasets trl
# %pip install torch torchvision --index-url https://download.pytorch.org/whl/cu129

import os
import torch
from datasets import load_dataset, Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig, PeftModel
from trl import SFTTrainer

# 检查GPU是否可用
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)} is available.")
else:
    print("Warning: GPU not available. Training will be very slow.")

GPU: NVIDIA GeForce RTX 4090 is available.


In [2]:
import os
import json
from datasets import Dataset, DatasetDict

def load_aligned_data(data_dir="D:\\Github\\CRD3\\baseline\\data\\aligned data", chunk_size=2):
    """
    Load aligned data and convert it to Hugging Face Dataset format
    
    Args:
        data_dir: Path to the aligned data directory
        chunk_size: Chunk size (2, 3, or 4)
        
    Returns:
        DatasetDict: Dictionary containing train, validation, and test sets
    """
    
    # Define dataset split file paths
    train_files_path = os.path.join(data_dir, "train_files")
    val_files_path = os.path.join(data_dir, "val_files")
    test_files_path = os.path.join(data_dir, "test_files")
    
    # Read split files
    with open(train_files_path, 'r') as f:
        train_episodes = [line.strip() for line in f.readlines()]
    
    with open(val_files_path, 'r') as f:
        val_episodes = [line.strip() for line in f.readlines()]
    
    with open(test_files_path, 'r') as f:
        test_episodes = [line.strip() for line in f.readlines()]
    
    # Build dataset splits dictionary
    splits = {
        'train': train_episodes,
        'validation': val_episodes,
        'test': test_episodes
    }
    
    # Build data storage structure
    dataset_splits = {
        'train': {'dialogue': [], 'summary': []},
        'validation': {'dialogue': [], 'summary': []},
        'test': {'dialogue': [], 'summary': []}
    }
    
    # Process each split
    for split_name, episodes in splits.items():
        print(f"Processing {split_name} split...")
        
        # Process each episode
        for episode in episodes:
            # Build aligned data file path pattern
            # File name format: {episode}_{chunk_size}_{offset}.json
            episode_dir = os.path.join(data_dir, f"c={chunk_size}")
            
            # Find all aligned data files for this episode
            episode_files = []
            for file in os.listdir(episode_dir):
                if file.startswith(f"{episode}_{chunk_size}_"):
                    episode_files.append(os.path.join(episode_dir, file))
            
            # Process each file
            for file_path in episode_files:
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        data = json.load(f)
                    
                    # Process each aligned data chunk
                    for item in data:
                        # Extract summary
                        summary = item['CHUNK']
                        
                        # Extract dialogue
                        dialogue_parts = []
                        for turn in item['TURNS']:
                            speaker = ', '.join(turn['NAMES']) if turn['NAMES'] else 'UNKNOWN'
                            utterances = ' '.join(turn['UTTERANCES'])
                            dialogue_parts.append(f"{speaker}: {utterances}")
                        
                        dialogue = ' '.join(dialogue_parts)
                        
                        # Add to dataset
                        dataset_splits[split_name]['dialogue'].append(dialogue)
                        dataset_splits[split_name]['summary'].append(summary)
                        
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
                    continue
    
    # Convert data to Hugging Face Dataset format
    dataset_dict = {}
    for split_name, data in dataset_splits.items():
        if len(data['dialogue']) > 0:
            dataset_dict[split_name] = Dataset.from_dict({
                'dialogue': data['dialogue'],
                'summary': data['summary']
            })
        else:
            print(f"Warning: {split_name} split is empty")
    
    return DatasetDict(dataset_dict)

def format_for_instruction_tuning(dataset_dict, instruction_template=None):
    """
    Format dataset for instruction tuning
    
    Args:
        dataset_dict: DatasetDict object
        instruction_template: Instruction template
        
    Returns:
        DatasetDict: Formatted dataset
    """
    
    if instruction_template is None:
        instruction_template = "Summarize the following D&D dialogue:\n\n{dialogue}"
    
    def format_example(example):
        # Clean dialogue data to ensure correct format
        dialogue = example['dialogue'].strip()
        summary = example['summary'].strip()
        
        # Create instruction
        instruction = instruction_template.format(dialogue=dialogue)
        
        # Use Mistral recommended format
        formatted_text = f"<s>[INST] {instruction} [/INST] {summary}</s>"
        
        return {
            'text': formatted_text,
            'dialogue': dialogue,
            'summary': summary
        }
    
    # Apply formatting to each split
    formatted_datasets = {}
    for split_name, dataset in dataset_dict.items():
        formatted_datasets[split_name] = dataset.map(format_example)
    
    return DatasetDict(formatted_datasets)

def filter_long_sequences(dataset_dict, max_length=2048):
    """
    Filter out sequences that exceed the maximum length
    
    Args:
        dataset_dict: DatasetDict object
        max_length: Maximum sequence length
        
    Returns:
        DatasetDict: Filtered dataset
    """
    filtered_datasets = {}
    for split_name, dataset in dataset_dict.items():
        # Filter out samples that exceed the maximum length
        filtered_dataset = dataset.filter(lambda example: len(example['text']) <= max_length)
        filtered_datasets[split_name] = filtered_dataset
        print(f"{split_name}: {len(dataset)} -> {len(filtered_dataset)} samples (filtered out {len(dataset) - len(filtered_dataset)} long sequences)")
    
    return DatasetDict(filtered_datasets)

# Usage example
print("Loading CRD3 aligned data...")
dataset_dict = load_aligned_data(chunk_size=2)

# Format for instruction tuning
formatted_datasets = format_for_instruction_tuning(dataset_dict)

# Filter long sequences (optional)
# formatted_datasets = filter_long_sequences(formatted_datasets, max_length=2048)

# Display dataset information
print("\nDataset Information:")
for split_name, dataset in formatted_datasets.items():
    print(f"{split_name}: {len(dataset)} examples")

# Display sample
print("\nSample from training set:")
print(formatted_datasets['train'][0]['text'][:300] + "...")

Loading CRD3 aligned data...
Processing train split...
Processing validation split...
Processing test split...


Map:   0%|          | 0/13960 [00:00<?, ? examples/s]

Map:   0%|          | 0/2109 [00:00<?, ? examples/s]

Map:   0%|          | 0/2500 [00:00<?, ? examples/s]


Dataset Information:
train: 13960 examples
validation: 2109 examples
test: 2500 examples

Sample from training set:
<s>[INST] Summarize the following D&D dialogue:

MATT: Hello everyone. My name is Matthew Mercer, voice actor and Dungeon Master for Critical Role on Geek & Sundry, where I take a bunch of other voice actors and run them through a fantastical fantasy adventure through the world of Dungeons & Dragons...


In [3]:
# 登录 huggingface
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
%pip install sentencepiece 

Defaulting to user installation because normal site-packages is not writeable
Looking in indexes: https://mirrors.aliyun.com/pypi/simple/
Note: you may need to restart the kernel to use updated packages.


In [16]:
# ====================================================================
# 第三步：模型微调 (QLoRA)
# ====================================================================
#
# 在这里，我们将使用QLoRA技术来微调一个基础模型。
#
# 1. 模型选择: 我们选择 `mistralai/Mistral-7B-Instruct-v0.2`。这是一个强大的指令微调模型。
# 2. 量化配置: 使用BitsAndBytesConfig进行4-bit量化，以大幅减少显存占用。
# 3. LoRA配置: 定义LoRA参数，如秩(r)、alpha和目标模块。

# --- 模型和分词器加载 ---
base_model_name = "microsoft/phi-2"
new_model_name = "phi2-dnd-storyteller" # 微调后模型的名称

# --- 4-bit量化配置 ---
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4", # 使用NF4类型进行量化
    bnb_4bit_compute_dtype=torch.bfloat16, # 计算时使用bfloat16以保持精度
    bnb_4bit_use_double_quant=False, # 不使用双重量化
)

# --- 加载基础模型 ---
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map="auto", # 自动将模型分配到可用的GPU上
    trust_remote_code=True,
)
model.config.use_cache = False # 在训练时禁用缓存
model.config.pretraining_tp = 1

# --- 加载分词器 ---
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token # 将pad token设置成eos token
tokenizer.padding_side = "right"

# --- LoRA配置 ---
peft_config = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=64, # LoRA的秩，是关键参数。64是一个比较均衡的选择。
    bias="none",
    task_type="CAUSAL_LM",
    # 通常针对模型的注意力层和线性层进行适配
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj"]
)

# --- 训练参数配置 ---
training_arguments = TrainingArguments(
    output_dir="./results", # 训练输出目录
    num_train_epochs=1, # 训练轮次。对于高质量数据，1-3轮即可。
    per_device_train_batch_size=4, # 每个GPU的batch size。根据显存大小调整。
    gradient_accumulation_steps=1, # 梯度累积步数
    optim="paged_adamw_32bit", # 使用分页优化器节省显存
    save_steps=50, # 每50步保存一次checkpoint
    logging_steps=10, # 每10步记录一次日志
    learning_rate=2e-4, # 学习率
    weight_decay=0.001,
    fp16=False,
    bf16=True, # 在4090上开启bf16以获得最佳性能
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant", # 学习率调度器
)



config.json:   0%|          | 0.00/735 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model-00001-of-00002.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/564M [00:00<?, ?B/s]

ValueError: Some modules are dispatched on the CPU or the disk. Make sure you have enough GPU RAM to fit the quantized model. If you want to dispatch the model on the CPU or the disk while keeping these modules in 32-bit, you need to set `llm_int8_enable_fp32_cpu_offload=True` and pass a custom `device_map` to `from_pretrained`. Check https://huggingface.co/docs/transformers/main/en/main_classes/quantization#offload-between-cpu-and-gpu for more details. 

In [None]:
# --- 初始化SFTTrainer ---
trainer = SFTTrainer(
    model=model,
    train_dataset=formatted_datasets['train'],
    peft_config=peft_config,
    # dataset_text_field="text",
    # max_seq_length=512,  # 限制序列长度以节省内存
    # tokenizer=tokenizer,
    args=training_arguments,
    # packing=False,
)

# --- 开始训练 ---
print("开始微调...")
trainer.train()
print("微调完成！")

# --- 保存微调后的适配器 ---
trainer.model.save_pretrained(new_model_name)
print(f"适配器已保存至 ./{new_model_name}")

Adding EOS to train dataset:   0%|          | 0/13960 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/13960 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/13960 [00:00<?, ? examples/s]

开始微调...


OutOfMemoryError: CUDA out of memory. Tried to allocate 512.00 MiB. GPU 0 has a total capacity of 23.99 GiB of which 0 bytes is free. Of the allocated memory 53.67 GiB is allocated by PyTorch, and 156.02 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:

# ====================================================================
# 第四步：模型测试与合并
# ====================================================================
#
# 训练完成后，我们需要测试一下模型的效果，并将其与基础模型合并，
# 得到一个完整的、可以直接部署的模型。

# --- 加载基础模型和微调后的LoRA适配器进行推理 ---
# 清理内存
del model
del trainer
torch.cuda.empty_cache()

# 重新加载基础模型和适配器
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="auto",
)
# 加载LoRA权重
model = PeftModel.from_pretrained(base_model, new_model_name)
# 合并LoRA权重到基础模型
model = model.merge_and_unload()

# 重新加载分词器
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# --- 使用Pipeline进行推理测试 ---
print("\n--- 测试微调后的模型 ---")
prompt = "[INST] 扮演地下城主，描述一个玩家在暴风雨的夜晚接近一座闹鬼的城堡。 [/INST]"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(prompt)
print(result[0]['generated_text'])

# --- 保存完整模型以备后续转换 ---
# 这一步会保存一个包含所有权重的完整模型文件夹，会比较大。
merged_model_path = "merged-mistral-7b-dnd"
model.save_pretrained(merged_model_path)
tokenizer.save_pretrained(merged_model_path)
print(f"完整模型已保存至 ./{merged_model_path}")


In [None]:
# ====================================================================
# 第五步：模型转换为浏览器可用格式 (ONNX)
# ====================================================================
#
# 为了让模型在Transformers.js中运行，我们需要将其转换为ONNX (Open Neural Network Exchange) 格式。
# 这需要安装 `optimum` 库。

%pip install optimum onnx onnxruntime

# --- 使用optimum进行转换 ---
# 注意：转换过程可能需要较长时间和较多内存。
from optimum.exporters.onnx import main_export

onnx_output_path = "onnx-mistral-7b-dnd"

try:
    main_export(
        model_name_or_path=merged_model_path,
        output=onnx_output_path,
        task="text-generation",
        fp16=True, # 使用FP16以减小模型大小
    )
    print(f"\n模型已成功转换为ONNX格式，并保存在 ./{onnx_output_path}")
except Exception as e:
    print(f"\nONNX转换失败: {e}")
    print("转换步骤可能需要手动调试，取决于库的版本。请参考Hugging Face Optimum文档。")

In [None]:
# ====================================================================
# 第六步：在浏览器中运行
# ====================================================================
#
# 转换完成后，您就有了一个可以在浏览器中使用的模型了。
# 您需要创建一个简单的网页来加载和使用这个模型。
#
# 以下是一个HTML文件的示例 (`index.html`)：

"""
<!DOCTYPE html>
<html>
<head>
    <title>D&D Story Generator</title>
</head>
<body>
    <h1>D&D Story Generator</h1>
    <textarea id="prompt" rows="4" cols="50" placeholder="输入你的D&D场景提示..."></textarea>
    <br>
    <button id="generate">生成故事</button>
    <h2>生成结果:</h2>
    <p id="output"></p>

    <script type="module">
        import { pipeline } from 'https://cdn.jsdelivr.net/npm/@xenova/transformers@2.16.0';

        const generateButton = document.getElementById('generate');
        const promptInput = document.getElementById('prompt');
        const output = document.getElementById('output');

        // 定义一个异步函数来加载和运行模型
        async function generate() {
            const prompt = "[INST] " + promptInput.value + " [/INST]";
            output.textContent = '正在加载模型并生成中...';

            try {
                // 加载你本地的ONNX模型文件夹
                // 你需要一个本地服务器来托管这些文件 (例如：python -m http.server)
                // 这里的路径 './onnx-mistral-7b-dnd' 需要与你服务器上的路径对应
                const generator = await pipeline('text-generation', './onnx-mistral-7b-dnd', {
                    // 如果模型转换时被量化了，需要在这里指定
                    // quantized: true 
                });

                const result = await generator(prompt, {
                    max_new_tokens: 150,
                    // 其他生成参数...
                });

                output.textContent = result[0].generated_text;

            } catch (error) {
                output.textContent = '加载或生成时出错: ' + error;
                console.error(error);
            }
        }

        generateButton.addEventListener('click', generate);
    </script>
</body>
</html>
"""

# 要运行这个HTML文件：
# 1. 确保 `onnx-mistral-7b-dnd` 文件夹和 `index.html` 在同一个目录下。
# 2. 在该目录下打开终端，运行一个简单的HTTP服务器：
#    python -m http.server
# 3. 在浏览器中打开 http://localhost:8000/
#
# 第一次运行时，浏览器会下载模型文件，可能需要一些时间。
# 模型运行的性能取决于用户的电脑硬件。

print("\n--- 所有步骤完成 ---")
print("下一步是创建HTML文件，并使用本地服务器进行测试。")