In [None]:

# 00: Environment and loader self-check (safe to re-run)
import sys, os, json, platform, subprocess, torch
import importlib, pkgutil

# (label shown in the report, importable module name).
# The two differ for SudachiPy: the distribution is called "SudachiPy" but the
# importable module is the lowercase `sudachipy`. Importing by the distribution
# name fails on case-sensitive filesystems and would report the package as
# MISSING even when it is installed.
KEY_PACKAGES = [
    ("datasets", "datasets"),
    ("transformers", "transformers"),
    ("peft", "peft"),
    ("bitsandbytes", "bitsandbytes"),
    ("fugashi", "fugashi"),
    ("SudachiPy", "sudachipy"),
    ("sudachidict_core", "sudachidict_core"),
    ("huggingface_hub", "huggingface_hub"),
]


def describe_package(label, module_name):
    """Build the report line for one package.

    Args:
        label: Name displayed in the report.
        module_name: Name handed to ``importlib.import_module``.

    Returns:
        ``"  <label>: <version>"`` when the import succeeds (``n/a`` when the
        module exposes no ``__version__``), otherwise
        ``"  <label>: MISSING -> <error>"``.
    """
    try:
        module = importlib.import_module(module_name)
        version = getattr(module, "__version__", "n/a")
    except Exception as e:
        # Deliberately broad: this is a diagnostic cell and any import-time
        # failure should be reported instead of aborting the report.
        return f"  {label}: MISSING -> {e}"
    return f"  {label}: {version}"


print("Python:", sys.version)
print("CUDA available:", torch.cuda.is_available())
print("GPU name:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
print("pip freeze (key pkgs):")
for label, module_name in KEY_PACKAGES:
    print(describe_package(label, module_name))


In [None]:

# 01: 安装缺失的日语分词依赖（安全可重复）
!pip -q install fugashi unidic-lite SudachiPy sudachidict-core sentencepiece > /dev/null
import fugashi, sudachipy
print("fugashi/SudachiPy installed OK.")


In [None]:

# 02: Make sure `src` is importable by registering the current directory on
# sys.path (avoids "No module named 'src'").
import sys, os

# Absolute form of the current working directory.
# NOTE(review): assumes the notebook is launched from the repository root.
repo_root = os.path.abspath(os.curdir)

# Only append when the entry is absent, so re-running the cell adds nothing.
if sys.path.count(repo_root) == 0:
    sys.path.append(repo_root)

print("sys.path ok:", repo_root in sys.path)


In [None]:

# 03: Check that the bundled parquet loader works, and print its commit
from src.data.load_jglue import load_jnli, load_jsquad
from importlib import import_module

# Grab the loader module itself so its optional COMMIT attribute can be read.
lj = import_module("src.data.load_jglue")
print("JGLUE COMMIT:", getattr(lj, "COMMIT", "N/A"))

# Load both validation splits first, then report their sizes.
ds_jnli_val = load_jnli("validation")
ds_js_val = load_jsquad("validation")
print("JNLI val size:", len(ds_jnli_val))
print("JSQuAD val size:", len(ds_js_val))

# Both splits must have more than 1000 rows; the smaller one decides.
assert min(len(ds_jnli_val), len(ds_js_val)) > 1000, "Dataset sizes look too small—check network/HF 可用性"


In [None]:

# 04: 运行基线 sanity（不训练）
!python -m src.eval.jnli_eval --split validation
!python -m src.eval.jsquad_eval --split validation
print("Sanity baselines done.")
