In [1]:
# Parameter block (edit these before running anything else).
# Later cells read their settings from this dict instead of hard-coding them.
CONFIG = {
    "repo_url": "https://github.com/wppqywq/CLIP_text_encoder.git",  # GitHub repo to clone
    "repo_branch": "main",  # branch handed to `git clone --branch`
    "repo_dir": "/content/comp545_final",  # local clone location; also put on sys.path
    "drive_mount": "/content/drive",  # Google Drive mount point
    "data_root": "/content/drive/MyDrive/comp545_data", # upload visual_genome_raw.zip into this folder
    "output_root": "/content/drive/MyDrive/comp545_outputs",  # exported as VG_COLAB_OUTPUT_ROOT
    # Packages pip-installed by the dependency cell; an empty list skips the install.
    "packages": [
        "open-clip-torch",
        "torch",
        "torchvision",
        "pillow",
        "numpy",
        "matplotlib",
        "pandas",
    ],
    "smoke_limit": 1000,  # number of images for the quick smoke test
    "smoke_distill": (0.0,),  # distillation weights forwarded to the smoke run
    "smoke_output": "vg_smoke",  # output name forwarded to the smoke run
    "full_limit": None,  # None runs on the full dataset
    "full_distill": (0.0, 0.1),  # distillation weights forwarded to the full run
    "full_output": "vg_full",  # output name forwarded to the full run
    # The remaining keys are forwarded unchanged to AdapterExperimentConfig
    # by run_experiment().
    "adapter_steps": 300,
    "adapter_batch": 32,
    "chunk_words": 8,
    "chunk_stride": 4,
    "chunk_threshold": 12,
    "text_pooling": "attn",
}



In [2]:
# Install the packages listed in CONFIG (pip skips anything already satisfied).
required_packages = CONFIG["packages"]
if not required_packages:
    print("No extra packages listed.")
else:
    import subprocess
    import sys

    # Invoke pip through the kernel's own interpreter so the install lands in
    # the environment this notebook is actually running in.
    pip_install = [sys.executable, "-m", "pip", "install", "--quiet"]
    pip_install.extend(required_packages)
    subprocess.run(pip_install, check=True)


In [3]:
# Mount Google Drive and clone the repository if needed
import subprocess
from pathlib import Path

from google.colab import drive

MOUNT_POINT = Path(CONFIG["drive_mount"])
# Always call drive.mount(): it handles the "already mounted" case itself.
# The previous `is_dir()` guard skipped the mount whenever a plain local folder
# existed at this path (for instance one created by an earlier
# data_root/output_root mkdir), so data silently went to the VM disk instead of
# Drive.
drive.mount(str(MOUNT_POINT))

repo_dir = Path(CONFIG["repo_dir"]).resolve()
# Look for the .git folder rather than the bare directory: an empty directory
# (left by a failed clone or created by the path-setup cell) is not a clone.
if not (repo_dir / ".git").exists():
    repo_dir.parent.mkdir(parents=True, exist_ok=True)
    clone_cmd = [
        "git",
        "clone",
        "--branch",
        CONFIG["repo_branch"],
        "--single-branch",
        CONFIG["repo_url"],
        str(repo_dir),
    ]
    # check=True surfaces a failed clone here instead of as an import error later.
    subprocess.run(clone_cmd, check=True)
else:
    print(f"Repo already present at {repo_dir}")


Mounted at /content/drive


In [4]:
# Configure project paths and ensure modules import correctly
import os
import sys

from pathlib import Path

# Resolve the three working directories once; later cells reuse these names.
repo_root, data_root, output_root = (
    Path(CONFIG[key]).resolve() for key in ("repo_dir", "data_root", "output_root")
)

# Export the locations as environment variables
# (presumably consumed by resolve_paths("colab") -- TODO confirm).
exported_dirs = {
    "VG_COLAB_REPO_ROOT": repo_root,
    "VG_COLAB_DATA_ROOT": data_root,
    "VG_COLAB_OUTPUT_ROOT": output_root,
}
for env_name, directory in exported_dirs.items():
    os.environ[env_name] = str(directory)

# Create whatever is still missing so later cells can write without checks.
for directory in exported_dirs.values():
    directory.mkdir(parents=True, exist_ok=True)

# Put the clone at the front of sys.path so `import src...` resolves to it.
repo_entry = str(repo_root)
if repo_entry not in sys.path:
    sys.path.insert(0, repo_entry)

print("repo_root:", repo_root)
print("data_root:", data_root)
print("output_root:", output_root)


repo_root: /content/comp545_final
data_root: /content/drive/MyDrive/comp545_data
output_root: /content/drive/MyDrive/comp545_outputs


In [5]:
import zipfile
from pathlib import Path

# Unpack the raw Visual Genome bundle uploaded to Drive.
# Both paths derive from data_root (built from CONFIG["data_root"] in the
# path-setup cell) so that editing CONFIG really relocates the data; they were
# previously hard-coded absolute paths that ignored CONFIG.
zip_path = data_root / "visual_genome_raw.zip"
target_dir = data_root

# Ensure the target directory exists
target_dir.mkdir(parents=True, exist_ok=True)

if zip_path.is_file():
    print(f"Extracting {zip_path} to {target_dir}")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(target_dir)
    print("Extraction complete.")
else:
    # Report and carry on: the bundle may already have been unpacked by hand.
    print(f"Error: Zip file not found at {zip_path}")

Extracting /content/drive/MyDrive/comp545_data/visual_genome_raw.zip to /content/drive/MyDrive/comp545_data
Extraction complete.


In [None]:
# Unpack both image archives from the raw bundle into visual_genome/.
# Paths derive from data_root (CONFIG["data_root"]) instead of hard-coded
# absolute paths, and the two copy-pasted extraction stanzas are one loop.
# NOTE(review): this cell extracts into visual_genome/, whereas the optional
# helper cell further down extracts the same archives into
# visual_genome/images/ -- confirm which layout the data loaders expect.
target_dir = data_root / "visual_genome"
missing_archives = []
for zip_name in ("images.zip", "images2.zip"):
    zip_path = data_root / "visual_genome_raw" / zip_name
    if not zip_path.is_file():
        # Same reporting style as the raw-bundle cell, instead of a traceback.
        print(f"Error: Zip file not found at {zip_path}")
        missing_archives.append(zip_path)
        continue
    print(f"Extracting {zip_path} to {target_dir}")
    with zipfile.ZipFile(zip_path, 'r') as zf:
        zf.extractall(target_dir)

# Only claim success when every archive was actually unpacked.
if not missing_archives:
    print("Extraction complete.")

Extracting /content/drive/MyDrive/comp545_data/visual_genome_raw/images.zip to /content/drive/MyDrive/comp545_data/visual_genome


## Dataset Checklist
Ensure Drive contains the Visual Genome archives under `CONFIG['data_root']`:
```
visual_genome_raw/
  region_descriptions.json (or .zip)
  image_data.json (or .zip)
  images.zip
  images2.zip
visual_genome/
  images/VG_100K/
  images/VG_100K_2/
```
If you already processed the splits elsewhere, copy `visual_genome/visual_genome_splits.json` here. Otherwise, run the cells below to extract the images and prepare everything from scratch.


In [None]:
# Optional helper: unzip image archives from Drive into the expected directory
import zipfile

# Both archives unpack into the same images directory under data_root.
raw_archives_dir = data_root / "visual_genome_raw"
images_dir = data_root / "visual_genome" / "images"

for zip_name in ("images.zip", "images2.zip"):
    archive_path = raw_archives_dir / zip_name
    if archive_path.is_file():
        images_dir.mkdir(parents=True, exist_ok=True)
        with zipfile.ZipFile(archive_path, "r") as zf:
            print(f"Extracting {zip_name} -> {images_dir}")
            zf.extractall(images_dir)
    else:
        # A missing archive is reported and skipped rather than treated as fatal.
        print(f"Archive not found: {archive_path}")
print("Done extracting image archives.")


In [None]:
# Optional data preparation: download, process, and verify Visual Genome.
# Leave RUN_PROCESS as False to skip; set it to True when the splits are needed.
RUN_PROCESS = False
# Keyword arguments handed to VisualGenomeProcessConfig.
PROCESS_CONFIG = {
    "max_images": 5000,
    "max_regions_per_image": 6,
    "min_region_words": 3,
    "validation_ratio": 0.1,
    "test_ratio": 0.1,
    "seed": 42,
}

if not RUN_PROCESS:
    print("Skipping data preparation. Toggle RUN_PROCESS=True if required.")
else:
    # Project imports stay inside this branch so a skipped run never touches them.
    from src.config.runtime import resolve_paths
    from src.data.visual_genome import (
        VisualGenomeProcessConfig,
        download_visual_genome,
        process_visual_genome,
        verify_visual_genome,
    )

    paths = resolve_paths("colab")
    vg_process_config = VisualGenomeProcessConfig(**PROCESS_CONFIG)
    download_visual_genome(paths, include_images=True, force=False)
    splits_path = process_visual_genome(paths, vg_process_config)
    print("Processed splits saved to:", splits_path)
    verify_visual_genome(paths)


In [None]:
# Helper utilities for running experiments and plotting metrics
from typing import Iterable

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display

from src.config.runtime import resolve_paths
from src.training.vg_adapter import AdapterExperimentConfig, run_visual_genome_adapter


def run_experiment(limit, distill_weights: Iterable[float], output_name: str):
    """Run one Visual Genome adapter experiment and tabulate its summary.

    Args:
        limit: Forwarded as ``limit_images``; the config cell uses ``None``
            for the full dataset.
        distill_weights: Converted to a tuple and forwarded as
            ``distill_weights``.
        output_name: Forwarded as ``output_name``.

    Returns:
        A ``(results, summary_df)`` pair: the mapping returned by
        ``run_visual_genome_adapter`` and a DataFrame built from its
        ``"summary"`` entry.
    """
    experiment_config = AdapterExperimentConfig(
        output_name=output_name,
        limit_images=limit,
        distill_weights=tuple(distill_weights),
        # The remaining knobs come straight from the notebook-wide CONFIG.
        **{
            key: CONFIG[key]
            for key in (
                "adapter_steps",
                "adapter_batch",
                "chunk_words",
                "chunk_stride",
                "chunk_threshold",
                "text_pooling",
            )
        },
    )
    run_output = run_visual_genome_adapter(resolve_paths("colab"), experiment_config)
    # .get() tolerates a result without a metrics path; "summary" is required.
    print("Metrics stored at:", run_output.get("metrics_path"))
    return run_output, pd.DataFrame(run_output["summary"])


def _collect_value_columns(summary_df: pd.DataFrame):
    base_cols = ["baseline", "chunk_baseline"]
    adapter_cols = sorted(col for col in summary_df.columns if col.startswith("adapter_"))
    return [col for col in base_cols + adapter_cols if col in summary_df]


def plot_summary(summary_df: pd.DataFrame, title: str):
    """Draw one recall-per-metric line chart per split, stacked vertically.

    Args:
        summary_df: Frame with ``metric`` and ``split`` columns plus the value
            columns selected by ``_collect_value_columns``.
        title: Figure-level title.
    """
    long_df = summary_df.melt(
        id_vars=["metric", "split"],
        value_vars=_collect_value_columns(summary_df),
        var_name="variant",
        value_name="recall",
    )
    split_names = sorted(long_df["split"].unique())
    n_panels = len(split_names)
    # squeeze=False always yields a 2-D array, so a single split needs no
    # special-casing; column 0 holds the stacked panels.
    fig, axes_grid = plt.subplots(
        n_panels, 1, figsize=(8, 4 * n_panels), sharex=True, squeeze=False
    )
    panels = axes_grid[:, 0]
    for ax, split_name in zip(panels, split_names):
        per_split = long_df[long_df["split"] == split_name]
        recall_table = per_split.pivot(index="metric", columns="variant", values="recall")
        # One line per variant, metrics along the x-axis.
        for variant in recall_table.columns:
            ax.plot(recall_table.index, recall_table[variant], marker="o", label=variant)
        ax.set_title(f"{split_name} split")
        ax.set_ylabel("Recall (%)")
        ax.grid(alpha=0.3)
    panels[-1].set_xlabel("Metric")
    panels[0].legend()
    fig.suptitle(title)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()


def plot_distribution(summary_df: pd.DataFrame, title: str):
    """Overlay a 10-bin histogram of every value column on a single axes.

    Args:
        summary_df: Frame holding the columns selected by
            ``_collect_value_columns``.
        title: Axes title.
    """
    fig, ax = plt.subplots(figsize=(8, 4))
    for column in _collect_value_columns(summary_df):
        # Partial transparency keeps overlapping histograms readable.
        ax.hist(summary_df[column], bins=10, alpha=0.4, label=column)
    ax.set_title(title)
    ax.set_xlabel("Recall (%)")
    ax.set_ylabel("Frequency")
    ax.legend()
    ax.grid(alpha=0.3)
    plt.show()



## 3. Smoke Test
Run a smaller experiment to confirm everything is wired correctly. This keeps runtime manageable before committing to the full dataset.


In [None]:
# Quick wiring check on the smoke-sized settings from CONFIG.
smoke_results, smoke_summary = run_experiment(
    CONFIG["smoke_limit"],
    CONFIG["smoke_distill"],
    CONFIG["smoke_output"],
)
print("Smoke test metrics head:", smoke_summary.head(), sep="\n")


In [None]:
# Smoke run visuals: per-split recall curves, then the recall histogram.
plot_summary(smoke_summary, "Smoke Test Recall Curves")
plot_distribution(smoke_summary, "Smoke Test Recall Distribution")


## 4. Full Experiment
Once the smoke test looks good, launch the full dataset run below. This may take significantly longer.


In [None]:
# Full run using the full-dataset settings from CONFIG.
full_results, full_summary = run_experiment(
    CONFIG["full_limit"],
    CONFIG["full_distill"],
    CONFIG["full_output"],
)
print("Full run metrics head:", full_summary.head(), sep="\n")


In [None]:
# Full run visuals: per-split recall curves, then the recall histogram.
plot_summary(full_summary, "Full Experiment Recall Curves")
plot_distribution(full_summary, "Full Experiment Recall Distribution")


## 5. Compare Smoke vs Full
The cell below merges both runs (if available) to compare adapter performance.


In [None]:
def merge_runs(smoke_df: pd.DataFrame, full_df: pd.DataFrame) -> pd.DataFrame:
    """Stack the smoke and full summaries, tagging each row with its run.

    Neither input is modified. The result has a ``run`` column holding
    ``"smoke"`` or ``"full"`` and a fresh 0..n-1 index.
    """
    tagged_runs = [
        smoke_df.assign(run="smoke"),  # assign() returns a new frame
        full_df.assign(run="full"),
    ]
    return pd.concat(tagged_runs, ignore_index=True)

if "smoke_summary" not in globals() or "full_summary" not in globals():
    print("Both smoke_summary and full_summary must be available to compare.")
else:
    combined = merge_runs(smoke_summary, full_summary)
    display(combined.head())
    # Compare only the value columns that the smoke run produced.
    value_cols = _collect_value_columns(smoke_summary)
    positions = list(range(len(value_cols)))
    width = 0.35
    for split in sorted(combined["split"].unique()):
        subset = combined[combined["split"] == split]
        fig, ax = plt.subplots(figsize=(10, 4))
        # Mean of every value column for each run within this split.
        run_means = {
            run_name: [
                float(subset[subset["run"] == run_name][col].mean()) for col in value_cols
            ]
            for run_name in ("smoke", "full")
        }
        # Side-by-side bars: smoke shifted left of each tick, full shifted right.
        ax.bar([p - width / 2 for p in positions], run_means["smoke"], width=width, label="smoke")
        ax.bar([p + width / 2 for p in positions], run_means["full"], width=width, label="full")
        ax.set_xticks(positions)
        ax.set_xticklabels(value_cols, rotation=45)
        ax.set_ylabel("Mean Recall (%)")
        ax.set_title(f"Smoke vs Full comparison ({split} split)")
        ax.grid(alpha=0.3, axis="y")
        ax.legend()
        plt.tight_layout()
        plt.show()


## 6. Wrap Up
- Push code changes back to GitHub once satisfied.
- Large data stays on Drive; processed results land under `CONFIG['output_root']`.
- Adjust the configuration cell at the top to try different adapters, chunking strategies, or distillation weights.
