In [None]:
!pip install datasets
!pip install bitsandbytes
import os
import torch
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_dataset, Dataset
from torch.utils.data import DataLoader, random_split
from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
)
from peft import LoraConfig, get_peft_model
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
from transformers import get_cosine_schedule_with_warmup
from torch.cuda.amp import GradScaler

Collecting datasets
  Downloading datasets-3.4.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.4.0-py3-none-any.whl (487 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m487.4/487.4 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading x

In [None]:
# ✅ 允许 TensorFloat32，加速计算
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

# ✅ 解决 PyTorch 编译缓存问题，防止 `torch.compile()` 失败
torch._dynamo.config.cache_size_limit = 64

In [4]:
# 挂载谷歌网盘
from google.colab import drive
drive.mount('/content/drive')

data_path = '/content/drive/MyDrive/dataset/agriculture_qa_full.csv'

Mounted at /content/drive


In [5]:
class LLaMA3LoRATrainer:
    def __init__(self, model_name="meta-llama/Llama-3.2-1B-Instruct",
                 data_path="/content/drive/MyDrive/dataset/agriculture_qa_full.csv",
                 output_dir="/content/drive/MyDrive/llama3_lora_colab",
                 image_output_dir="/content/drive/MyDrive/image",
                 # !!!!!!!!!!!!!!!!!!!!!!!!!
                 # !!!!!!!!!!!!!!!!!!!!!!!!!
                 # ！！！！重要！！！！替换为你的 API 密钥
                 # access_token=""
                 ):
        """
        ✅ 初始化 LLaMA-3 训练器
        ✅ 80% 训练数据，20% 测试数据
        ✅ 生成 4 张训练图像
        """
        self.model_name = model_name
        self.data_path = data_path
        self.output_dir = output_dir
        self.image_output_dir = image_output_dir
        self.access_token = access_token

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"✅ 运行设备: {self.device}")

        os.makedirs(self.output_dir, exist_ok=True)
        os.makedirs(self.image_output_dir, exist_ok=True)

        self.tokenizer = AutoTokenizer.from_pretrained(self.model_name, token=self.access_token)
        self.tokenizer.pad_token = self.tokenizer.eos_token

        # ✅ 加载数据集 & 拆分成训练集和测试集
        self.load_and_split_dataset()

        # ✅ 加载模型
        self.load_model()

        # ✅ 应用 LoRA
        self.apply_lora()

        self.train_loss_history = []
        self.test_loss_history = []
        self.learning_rates = []

        print(f"✅ 数据划分完成：训练数据 {len(self.train_dataset)}，测试数据 {len(self.test_dataset)}")

    def load_and_split_dataset(self):
        """
        ✅ 读取数据集，并划分为 80% 训练集 + 20% 测试集
        ✅ 存储 `full_dataset.csv`, `train_data.csv`, `test_data.csv`
        """
        dataset = load_dataset("csv", data_files=self.data_path)["train"]
        dataset = dataset.select(range(min(len(dataset), 30000)))  # 限制最大数据集大小
        dataset = dataset.map(self.format_instruction)

        # ✅ 转换为 `Dataset`，确保 `shuffle=True` 可用
        dataset = Dataset.from_dict(dataset.to_dict())

        train_size = int(0.8 * len(dataset))  # 80% 训练
        test_size = len(dataset) - train_size  # 20% 测试

        self.train_dataset, self.test_dataset = random_split(dataset, [train_size, test_size])

        # ✅ 存储所有数据到 CSV
        pd.DataFrame(dataset.to_dict()).to_csv(f"{self.output_dir}/full_dataset.csv", index=False)
        pd.DataFrame(self.train_dataset.dataset.to_dict()).to_csv(f"{self.output_dir}/train_data.csv", index=False)
        pd.DataFrame(self.test_dataset.dataset.to_dict()).to_csv(f"{self.output_dir}/test_data.csv", index=False)
        print(f"✅ 数据集已保存到 {self.output_dir}")

    @staticmethod
    def format_instruction(example):
        return {"text": f"### 指令: {example['question']} \n### 回答: {example['answers']}"}

    def load_model(self):
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_use_double_quant=True,
            llm_int8_threshold=6.0,
            llm_int8_enable_fp32_cpu_offload=True
        )

        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            torch_dtype=torch.float16,
            quantization_config=bnb_config,
            device_map="auto",
            token=self.access_token
        )
        print("✅ 模型加载完成（启用 4-bit 量化）")

    def apply_lora(self):
        """
        ✅ 只训练 LoRA 层，冻结其他层，提高训练效率
        """
        lora_config = LoraConfig(
            r=32,
            lora_alpha=64,
            lora_dropout=0.1,
            bias="none",
            task_type="CAUSAL_LM"
        )
        self.model = get_peft_model(self.model, lora_config)
        self.model.print_trainable_parameters()

    def collate_fn(self, batch):
        texts = [example["text"] for example in batch]
        inputs = self.tokenizer(
            texts, return_tensors="pt", padding="longest", truncation=True, max_length=512, pad_to_multiple_of=8
        )
        inputs = {key: val.to(self.device) for key, val in inputs.items()}
        return inputs

    def evaluate(self, test_dataloader):
        """✅ 在测试集上评估模型性能"""
        self.model.eval()
        total_loss = 0
        num_batches = 0

        with torch.no_grad():
            for batch in test_dataloader:
                batch = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(**batch, labels=batch["input_ids"])
                loss = outputs.loss.item()

                total_loss += loss
                num_batches += 1

        avg_loss = total_loss / num_batches
        self.test_loss_history.append(avg_loss)  # ✅ 记录每个 epoch 的测试损失

        print(f"✅ 测试集平均损失: {avg_loss:.4f}")
        self.model.train()  # 重新切换回训练模式

    def save_model(self):
        """✅ 训练完成后保存 LoRA 适配层"""
        print("💾 正在保存 LoRA 适配层...")
        self.model.save_pretrained(self.output_dir)
        self.tokenizer.save_pretrained(self.output_dir)
        print(f"✅ LoRA 适配层已保存到 {self.output_dir}")

    def train(self, num_epochs=10, batch_size=16, learning_rate=1e-4):
        optimizer = AdamW(self.model.parameters(), lr=learning_rate)

        train_dataloader = DataLoader(self.train_dataset, batch_size=batch_size, shuffle=True, collate_fn=self.collate_fn, drop_last=False)
        test_dataloader = DataLoader(self.test_dataset, batch_size=batch_size, shuffle=False, collate_fn=self.collate_fn, drop_last=False)

        num_training_steps = len(train_dataloader) * num_epochs
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(0.15 * num_training_steps),
            num_training_steps=num_training_steps
        )

        scaler = GradScaler()  # ✅ 启用 GradScaler
        self.model.train()

        for epoch in range(num_epochs):
            tqdm.write(f"🔄 Epoch {epoch + 1}/{num_epochs}")
            for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}", unit="batch"):
                batch = {k: v.to(self.device) for k, v in batch.items()}
                with torch.cuda.amp.autocast():
                    outputs = self.model(**batch, labels=batch["input_ids"], use_cache=False)
                    loss = outputs.loss

                scaler.scale(loss).backward()
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), max_norm=1.0)  # ✅ 限制梯度最大值

                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

                self.train_loss_history.append(loss.item())
                self.learning_rates.append(optimizer.param_groups[0]["lr"])

            self.evaluate(test_dataloader)
            self.plot_training_metrics()

        print("✅ 训练完成")
        self.save_model()



    def plot_training_metrics(self):
        """
        ✅ 绘制 4 张训练图，并保存到 Google Drive
        """
        # 1、损失函数
        plt.figure()
        plt.plot(self.train_loss_history, label="Training Loss", color="red")
        plt.title("Training Loss Curve")
        plt.xlabel("Steps")
        plt.ylabel("Loss")
        plt.legend()
        plt.savefig(f"{self.image_output_dir}/training_loss.png")
        plt.close()

        # 2、学习率
        plt.figure()
        plt.plot(self.learning_rates, label="Learning Rate", color="blue")
        plt.title("Learning Rate Curve")
        plt.xlabel("Steps")
        plt.ylabel("Learning Rate")
        plt.legend()
        plt.savefig(f"{self.image_output_dir}/learning_rate.png")
        plt.close()

        # 3、过拟合检测
        plt.figure()
        plt.plot(self.train_loss_history, label="Train Loss", color="red")
        plt.plot(
            np.linspace(0, len(self.train_loss_history), len(self.test_loss_history)),
            self.test_loss_history, label="Test Loss", color="green"
        )  # ✅ 确保 x 轴数据点匹配
        plt.title("Overfitting Detection")
        plt.xlabel("Steps")
        plt.ylabel("Loss")
        plt.legend()
        plt.savefig(f"{self.image_output_dir}/overfitting_detection.png")
        plt.close()

        # 4、梯度更新幅度
        plt.figure()
        gradient_magnitudes = [abs(loss) for loss in self.train_loss_history]  # 计算梯度幅度
        plt.plot(gradient_magnitudes, label="Gradient Update Magnitude", color="purple")
        plt.title("Gradient Update Magnitude Curve")
        plt.xlabel("Steps")
        plt.ylabel("Gradient Magnitude")
        plt.legend()
        plt.savefig(f"{self.image_output_dir}/gradient_magnitude.png")
        plt.close()

        print("✅ 训练图像已保存到 Google Drive")

In [None]:
if __name__ == "__main__":
    from google.colab import drive
    drive.mount('/content/drive')

    trainer = LLaMA3LoRATrainer()
    trainer.train(num_epochs=12, batch_size=16, learning_rate=1e-4)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
✅ 运行设备: cuda


tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/22615 [00:00<?, ? examples/s]

✅ 数据集已保存到 /content/drive/MyDrive/llama3_lora_colab


config.json:   0%|          | 0.00/877 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.47G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

  scaler = GradScaler()  # ✅ 启用 GradScaler


✅ 模型加载完成（启用 4-bit 量化）
trainable params: 3,407,872 || all params: 1,239,222,272 || trainable%: 0.2750
✅ 数据划分完成：训练数据 18092，测试数据 4523
🔄 Epoch 1/12


  with torch.cuda.amp.autocast():
Epoch 1: 100%|██████████| 1131/1131 [02:57<00:00,  6.36batch/s]


✅ 测试集平均损失: 0.8325
✅ 训练图像已保存到 Google Drive
🔄 Epoch 2/12


Epoch 2: 100%|██████████| 1131/1131 [02:57<00:00,  6.38batch/s]


✅ 测试集平均损失: 0.7629
✅ 训练图像已保存到 Google Drive
🔄 Epoch 3/12


Epoch 3: 100%|██████████| 1131/1131 [02:56<00:00,  6.40batch/s]


✅ 测试集平均损失: 0.7382
✅ 训练图像已保存到 Google Drive
🔄 Epoch 4/12


Epoch 4: 100%|██████████| 1131/1131 [02:57<00:00,  6.36batch/s]


✅ 测试集平均损失: 0.7220
✅ 训练图像已保存到 Google Drive
🔄 Epoch 5/12


Epoch 5: 100%|██████████| 1131/1131 [02:57<00:00,  6.36batch/s]


✅ 测试集平均损失: 0.7149
✅ 训练图像已保存到 Google Drive
🔄 Epoch 6/12


Epoch 6: 100%|██████████| 1131/1131 [02:57<00:00,  6.36batch/s]


✅ 测试集平均损失: 0.7096
✅ 训练图像已保存到 Google Drive
🔄 Epoch 7/12


Epoch 7: 100%|██████████| 1131/1131 [02:57<00:00,  6.36batch/s]


✅ 测试集平均损失: 0.7074
✅ 训练图像已保存到 Google Drive
🔄 Epoch 8/12


Epoch 8: 100%|██████████| 1131/1131 [02:57<00:00,  6.36batch/s]


✅ 测试集平均损失: 0.7059
✅ 训练图像已保存到 Google Drive
🔄 Epoch 9/12


Epoch 9: 100%|██████████| 1131/1131 [02:57<00:00,  6.36batch/s]


✅ 测试集平均损失: 0.7052
✅ 训练图像已保存到 Google Drive
🔄 Epoch 10/12


Epoch 10: 100%|██████████| 1131/1131 [02:57<00:00,  6.36batch/s]


✅ 测试集平均损失: 0.7050
✅ 训练图像已保存到 Google Drive
🔄 Epoch 11/12


Epoch 11: 100%|██████████| 1131/1131 [02:57<00:00,  6.38batch/s]


✅ 测试集平均损失: 0.7050
✅ 训练图像已保存到 Google Drive
🔄 Epoch 12/12


Epoch 12:  72%|███████▏  | 816/1131 [02:08<00:50,  6.21batch/s]