In [None]:
import os, json, pandas as pd, shutil
from pathlib import Path

# === Adjust these paths to match where the dataset is actually mounted ===
IMG_DIR  = Path("/kaggle/input/captions-filtered/dataset")        # directory holding the source images
CSV_PATH = Path("/kaggle/input/captions-filtered/captions_filtered.csv")  # needs columns: fname, caption
DATA_DIR = Path("/kaggle/working/dataset")                        # directory the training run reads from

DATA_DIR.mkdir(parents=True, exist_ok=True)

# Load the caption table.
df = pd.read_csv(CSV_PATH)

# Remember the true CSV size now: every step below only shrinks `df`, so the
# count cannot be reconstructed later from the filtered frames.
n_csv_rows = len(df)

# Required columns.
assert "fname" in df.columns,  "CSV 缺少列: fname"
assert "caption" in df.columns, "CSV 缺少列: caption"

# Cleaning: replace NaN, then strip surrounding whitespace.
# fillna must run BEFORE astype(str): casting first turns NaN into the literal
# text "nan", which would then be written out as a real caption instead of
# falling through to the default caption used when the caption is empty.
df["fname"]   = df["fname"].fillna("").astype(str).str.strip()
df["caption"] = df["caption"].fillna("").astype(str).str.strip()

# Keep only rows whose file name carries an accepted image extension
# (an empty file name has no suffix and is dropped here too).
allowed_ext = {".jpg", ".jpeg", ".png", ".webp"}
df["ext"] = df["fname"].apply(lambda x: Path(x).suffix.lower())
df = df[df["ext"].isin(allowed_ext)].copy()

# Resolve each file against IMG_DIR and keep only the images that exist on disk.
df["src"] = df["fname"].apply(lambda x: str(IMG_DIR / x))
df["exists"] = df["src"].apply(os.path.exists)
missing = df[~df["exists"]]
df = df[df["exists"]].copy()

# De-duplicate by file name, keeping the first occurrence.
df = df.drop_duplicates(subset=["fname"], keep="first")

print(f"原始 CSV 行数: {n_csv_rows}")
print(f"可用图片数  : {len(df)}")
print(f"缺失图片数  : {len(missing)}")
if not missing.empty:
    print("示例缺失：", missing["fname"].head(5).tolist())

# Stage every usable image into DATA_DIR and build one metadata entry per image.
records = []
for fname, src_path, raw_caption in zip(df["fname"], df["src"], df["caption"]):
    target = DATA_DIR / Path(fname).name
    # Nothing to copy when the source already is the destination path.
    if str(target) != src_path:
        shutil.copy2(src_path, target)
    # An empty caption falls back to the default prompt.
    text = raw_caption or "a <lolicon> League of Legends style icon"
    records.append({"file_name": target.name, "text": text})

# Fail early on an empty dataset rather than letting training start with 0 examples.
if len(records) == 0:
    raise RuntimeError("没有可用样本：请检查 IMG_DIR / CSV 列名 / 文件是否存在")

# One JSON object per line, written next to the images.
meta_path = DATA_DIR / "metadata.jsonl"
with meta_path.open("w", encoding="utf-8") as out_fh:
    out_fh.writelines(
        json.dumps(entry, ensure_ascii=False) + "\n" for entry in records
    )

print(f"\n已写入: {meta_path} 共 {len(records)} 条")
print("前 3 行预览：")
with meta_path.open("r", encoding="utf-8") as in_fh:
    # zip against range(3) stops after at most three lines.
    for _, preview_line in zip(range(3), in_fh):
        print(preview_line.rstrip())

# Pre-training self-check: every file named in the metadata should exist in
# DATA_DIR. Only the first 2000 entries are inspected.
bad = [
    entry["file_name"]
    for entry in records[:2000]
    if not (DATA_DIR / entry["file_name"]).exists()
]
print("\n抽样文件存在性检查 -> 异常数:", len(bad))
if bad:
    print("示例:", bad[:5])


In [None]:
import requests, pathlib

# Fetch the diffusers LoRA text-to-image training script.
# NOTE: this URL tracks the `main` branch, so its content can change between runs.
# A later cell in this notebook re-downloads the script from the v0.34.0 tag to
# the same path, which overwrites the copy saved here.
script_url = "https://raw.githubusercontent.com/huggingface/diffusers/main/examples/text_to_image/train_text_to_image_lora.py"
script_path = pathlib.Path("/kaggle/working/train_text_to_image_lora.py")

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() keeps an HTTP error page from being saved as the script.
response = requests.get(script_url, timeout=60)
response.raise_for_status()
script_path.write_bytes(response.content)

print("Saved:", script_path, "size:", script_path.stat().st_size)


In [None]:
# # -- Core scientific stack: avoid NumPy/scikit-learn ABI conflicts & stay compatible with common dependencies --
# !pip install -q --upgrade --force-reinstall \
#   numpy==1.26.4 scipy==1.11.4 scikit-learn==1.4.2

# # -- Training dependencies: keep in step with the script version (stay on diffusers==0.34.0) --
# !pip install -q --upgrade --force-reinstall \
#   diffusers==0.34.0 transformers==4.52.4 accelerate==0.33.0 \
#   peft==0.12.0 safetensors==0.4.5

# # -- Keep Transformers from importing its TF/JAX branches (optional: simply uninstall them) --
# !pip uninstall -y -q tensorflow keras || true


In [None]:
# 版本自检（重启后再跑一次更靠谱）
import torch, torchvision, diffusers, transformers, numpy, sklearn, scipy
# Print the installed version of each imported library, one per line.
# The labels are space-padded so the version column lines up.
for label, module in (
    ("torch       :", torch),
    ("torchvision :", torchvision),
    ("diffusers   :", diffusers),
    ("transformers:", transformers),
    ("numpy       :", numpy),
    ("sklearn     :", sklearn),
    ("scipy       :", scipy),
):
    print(label, module.__version__)

# Report whether torch.library exposes a `register_fake` attribute.
print("has register_fake:", hasattr(torch.library, "register_fake"))


In [None]:
import requests, pathlib, re

SCRIPT = pathlib.Path("/kaggle/working/train_text_to_image_lora.py")
URL = "https://raw.githubusercontent.com/huggingface/diffusers/v0.34.0/examples/text_to_image/train_text_to_image_lora.py"

# Download the training script from the v0.34.0 tag. Raise on HTTP errors so an
# error page is never written to disk in place of the script.
response = requests.get(URL, timeout=60)
response.raise_for_status()
SCRIPT.write_bytes(response.content)

# Belt and braces: replace any module-level check_min_version(...) call with a
# comment. The pattern is anchored to the start of a line so it only touches an
# actual call statement, never an occurrence inside a comment or another
# expression. The encoding is explicit so the round trip does not depend on the
# locale.
txt = SCRIPT.read_text(encoding="utf-8")
txt2 = re.sub(
    r"^check_min_version\(.*?\)\s*",
    "# (pinned out) check_min_version for v0.34.0\n",
    txt,
    flags=re.MULTILINE,
)
if txt2 != txt:
    SCRIPT.write_text(txt2, encoding="utf-8")

print("Saved:", SCRIPT, "size:", SCRIPT.stat().st_size)

# Verify against the whole file that no live (uncommented) call remains.
# Checking only a prefix of the file would pass regardless of what follows it,
# and a plain substring test would trip on the import line and on the
# replacement comment itself, both of which still contain the name.
live_call = re.search(r"^[ \t]*check_min_version\(", txt2, flags=re.MULTILINE)
print("Snippet ok? ", live_call is None)


In [None]:
import os, shlex, subprocess, pathlib

import importlib.util

# NOTE(review): confirm this repo id still resolves on the Hugging Face Hub.
BASE_MODEL = "runwayml/stable-diffusion-v1-5"
DATA_DIR   = "/kaggle/working/dataset"
OUTPUT_DIR = "/kaggle/working/lora_out"
pathlib.Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

# Environment for the training subprocess: a copy, so the kernel's own
# os.environ is left untouched.
env = os.environ.copy()
env["TRANSFORMERS_NO_TF"] = "1"
env["TRANSFORMERS_NO_JAX"] = "1"

# Only request hf_transfer downloads when the package is importable. With the
# flag set and the package missing, huggingface_hub raises on download instead
# of falling back, and nothing in this notebook installs hf_transfer.
# (Assumes `accelerate launch` runs in the same Python environment as this kernel.)
if importlib.util.find_spec("hf_transfer") is not None:
    env["HF_HUB_ENABLE_HF_TRANSFER"] = "1"
else:
    env.pop("HF_HUB_ENABLE_HF_TRANSFER", None)

# Build the command as an argv list so values containing spaces or shell
# metacharacters reach the script intact, with no string re-splitting.
cmd_args = [
    "accelerate", "launch", "/kaggle/working/train_text_to_image_lora.py",
    "--pretrained_model_name_or_path", BASE_MODEL,
    "--train_data_dir", DATA_DIR,
    "--resolution", "512",
    "--train_batch_size", "2",
    "--gradient_accumulation_steps", "4",
    "--mixed_precision", "fp16",
    "--rank", "8",
    "--learning_rate", "1e-4",
    "--lr_scheduler", "constant",
    "--lr_warmup_steps", "0",
    "--seed", "42",
    "--max_train_steps", "5000",
    "--checkpointing_steps", "1000",
    "--dataloader_num_workers", "2",
    "--output_dir", OUTPUT_DIR,
]
# Shell-quoted string form of the same command, kept under the original name
# `cmd` for display and copy-paste.
cmd = " ".join(shlex.quote(arg) for arg in cmd_args)

print("Launching training…")
# check=False: a failed run is reported through the exit code rather than raised.
ret = subprocess.run(cmd_args, env=env, check=False)
print("Exit code:", ret.returncode)


In [None]:
import shutil, os, pathlib
# Bundle the training output directory into a single zip archive.
src = "/kaggle/working/lora_out"
dst = "/kaggle/working/lora_export.zip"
if not pathlib.Path(src).exists():
    print("No lora_out to export.")
else:
    # make_archive appends the ".zip" suffix itself, so pass the name without it.
    archive_base = dst.replace(".zip", "")
    shutil.make_archive(archive_base, "zip", src)
    print("Exported:", dst)
