<a href="https://colab.research.google.com/github/vcolombo/colab_fetch_kaggle/blob/handle_competitions_optimize_ram/KaggleImport.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === Configuration ===
# SOURCE_TYPE selects which branch at the bottom of this cell runs; any value
# other than the two listed here raises ValueError.
SOURCE_TYPE = "competition"   # "dataset" or "competition"
# Dataset handle passed to kagglehub.dataset_download (read only when SOURCE_TYPE == "dataset").
DATASET_PATH = "alonhaviv/the-maestro-dataset-v3-0-0"
# Competition slug passed to the Kaggle API download and used as the Drive
# sub-folder name (read only when SOURCE_TYPE == "competition").
COMPETITION_SLUG = "alaska2-image-steganalysis"
COPY_MODE = False             # For datasets: True=copy (persistent), False=symlink (non-persistent).
# NOTE: If COPY_MODE is False, dataset data will NOT persist after Colab runtime ends.
# (The Drive entry is then only a symlink into the local KaggleHub download location.)

# === Imports (safe pre-Kaggle) ===
from pathlib import Path
import shutil, glob, subprocess, os, json, stat
from google.colab import drive, files
import kagglehub
from requests.exceptions import HTTPError

# === Helpers ===
def safe_remove(path: Path):
    """Delete *path* if something is there; do nothing when it is absent.

    Real directories are removed recursively. Regular files and symlinks
    (dangling ones and links to directories included) are unlinked, so a
    link's target is never touched.
    """
    is_link = path.is_symlink()

    # Nothing on disk under this name (a dangling link still counts as present).
    if not (is_link or path.exists()):
        return

    # Links and non-directories are removed with unlink(); only a genuine
    # directory gets the recursive delete.
    if is_link or not path.is_dir():
        path.unlink()
        return

    shutil.rmtree(path)

def instruct_join(slug: str, username: str | None):
    print(
        "\nAccess denied (403). Fix:\n"
        f"  1) Open https://www.kaggle.com/competitions/{slug}\n"
        "  2) Click 'Join Competition' and accept the rules and DATA terms.\n"
        "  3) Complete phone/ID verification if prompted.\n"
        "  4) Account → Create New API Token, replace ~/.kaggle and ~/.config/kaggle kaggle.json (chmod 600).\n"
        f"  5) Ensure this notebook uses Kaggle user: {username}\n"
    )

# === Mount Google Drive ===
drive.mount("/content/drive")

# === Kaggle API key setup BEFORE importing Kaggle API ===
# Clear any environment-supplied credentials and point the config dir at
# /root/.kaggle — presumably so the kaggle.json handled below is the only
# credential source the Kaggle client picks up.
os.environ.pop("KAGGLE_USERNAME", None)
os.environ.pop("KAGGLE_KEY", None)
os.environ["KAGGLE_CONFIG_DIR"] = "/root/.kaggle"

# The same token is kept in two config directories.
cfg1 = Path("/root/.kaggle")
cfg1.mkdir(parents=True, exist_ok=True)
cfg2 = Path("/root/.config/kaggle")
cfg2.mkdir(parents=True, exist_ok=True)
kjson1 = cfg1 / "kaggle.json"
kjson2 = cfg2 / "kaggle.json"

if kjson1.exists() and kjson2.exists():
    print("kaggle.json already exists — skipping upload.")
else:
    print("Upload kaggle.json (Account → Create New API Token)")
    up = files.upload()
    if "kaggle.json" in up:
        data = up["kaggle.json"]
    elif len(up) == 1:
        # A token downloaded more than once is commonly saved under a
        # de-duplicated name such as "kaggle (1).json". Accept the single
        # uploaded file in that case, but only if it is a JSON object, so an
        # unrelated file is not installed as the credential.
        uploaded_name, data = next(iter(up.items()))
        try:
            parsed = json.loads(data)
        except ValueError:
            parsed = None
        if not isinstance(parsed, dict):
            raise RuntimeError(
                f"Uploaded file '{uploaded_name}' is not a valid kaggle.json."
            )
        del parsed  # do not keep the decoded credential around as a global
        print(f"Using uploaded file '{uploaded_name}' as kaggle.json.")
    else:
        raise RuntimeError("kaggle.json not provided.")

    kjson1.write_bytes(data)
    kjson2.write_bytes(data)
    for p in (kjson1, kjson2):
        os.chmod(p, stat.S_IRUSR | stat.S_IWUSR)  # 0600
    print("kaggle.json written to both paths and permissions set.")

# Identity check (no key printed)
# Best-effort: any failure while reading or parsing the file only produces a
# warning and leaves kaggle_user as None.
try:
    with kjson1.open() as token_file:
        kaggle_user = json.load(token_file).get("username")
    print("Using Kaggle user:", kaggle_user)
except Exception:
    kaggle_user = None
    print("Warning: could not read kaggle.json username")

# === Now import Kaggle API and authenticate ===
# Imported here rather than at the top of the cell so that the credential
# setup above has already run.
from kaggle.api.kaggle_api_extended import KaggleApi

api = KaggleApi()
api.authenticate()

# === Paths ===
# Everything fetched by this notebook lives under this Drive folder.
drive_root = Path("/content/drive/MyDrive") / "kaggle"
drive_root.mkdir(parents=True, exist_ok=True)

if SOURCE_TYPE == "dataset":
    # Download to local KaggleHub cache (ephemeral)
    src = Path(kagglehub.dataset_download(DATASET_PATH))

    # NOTE(review): the KaggleHub cache path appears to end in
    # ".../<owner>/<dataset>/versions/<n>", which makes src.name just the
    # version number. Naming the Drive folder after it lets unrelated datasets
    # collide (and safe_remove() below would delete the other one), so use the
    # dataset slug from the handle instead. Fall back to src.name only when
    # the handle has no "<owner>/<dataset>" shape.
    handle_parts = DATASET_PATH.strip("/").split("/")
    dataset_name = handle_parts[1] if len(handle_parts) > 1 else src.name
    dst = drive_root / dataset_name

    # Start from a clean destination: drops a stale copy or an old symlink.
    safe_remove(dst)

    if COPY_MODE:
        print(f"Copying dataset to: {dst}")
        # Trailing slashes make rsync copy the directory contents into dst.
        subprocess.run(["rsync", "-a", "--info=progress2", f"{src}/", f"{dst}/"], check=True)
    else:
        print(f"Creating symlink: {dst} -> {src}")
        dst.symlink_to(src, target_is_directory=True)

    print("Dataset ready at:", dst)
    if not COPY_MODE:
        # Only relevant in symlink mode; in copy mode the data is already on Drive.
        print("Note: Set COPY_MODE=True for Drive persistence.")

elif SOURCE_TYPE == "competition":
    # Download single ZIP to Drive, then system unzip (low RAM)
    dest = drive_root / COMPETITION_SLUG
    dest.mkdir(parents=True, exist_ok=True)

    try:
        print(f"Downloading competition: {COMPETITION_SLUG}")
        api.competition_download_files(COMPETITION_SLUG, path=str(dest), quiet=True)
    except HTTPError as e:
        # The response (and so the status code) may be missing on the exception.
        status = getattr(getattr(e, "response", None), "status_code", None)
        print("Download failed:", f"HTTP {status}" if status else repr(e))
        if status == 403:
            instruct_join(COMPETITION_SLUG, kaggle_user)
        # Stop with a non-zero status so the failure is not reported as success,
        # and keep the HTTP error attached as the cause.
        raise SystemExit(1) from e

    zips = glob.glob(str(dest / "*.zip"))
    if not zips:
        print("No ZIP files found after download.")
    for zp in zips:
        print(f"Unzipping: {Path(zp).name}")
        # -o overwrites existing files without prompting; -q keeps output short.
        subprocess.run(["unzip", "-q", "-o", zp, "-d", str(dest)], check=True)
        # Remove the archive once extracted so it does not stay on Drive.
        os.remove(zp)

    print("Competition files ready at:", dest)

else:
    raise ValueError("SOURCE_TYPE must be 'dataset' or 'competition'.")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Upload kaggle.json (Account → Create New API Token)


Saving kaggle.json to kaggle.json
kaggle.json written to both paths and permissions set.
Using Kaggle user: vincentcolombo
Downloading competition: alaska2-image-steganalysis
