In [1]:
# Mount Google Drive at /content/drive so the dataset folder referenced by
# ROOT (which lives under /content/drive/MyDrive) becomes readable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Dataset root on the mounted Drive. test_directory_structure() builds the
# json_truncated/, json/, img/ and pointcloud/ paths directly under this folder.
ROOT = "/content/drive/MyDrive/Omni-CAD-subset-complete"

In [7]:
!git clone https://github.com/veoery/CMU16825_Final_project.git

fatal: destination path 'CMU16825_Final_project' already exists and is not an empty directory.


In [None]:
!cd /content/CMU16825_Final_project
!git pull

In [13]:
import os
import glob
import json
from pathlib import Path
import sys

# Add the project root to sys.path so local modules (the tests below import
# `cad_mllm`) can be found. Guarded so that re-running this cell does not keep
# appending duplicate entries.
PROJECT_ROOT = "/content/CMU16825_Final_project"
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Test 1: Check directory structure
def test_directory_structure():
    """Verify your data directories exist."""
    print("=" * 60)
    print("TEST 1: Checking directory structure")
    print("=" * 60)

    required_dirs = {
        "truncated_json": f"{ROOT}/json_truncated",
        "full_json": f"{ROOT}/json",
        "images": f"{ROOT}/img",
        "point_clouds": f"{ROOT}/pointcloud",
    }

    for name, path in required_dirs.items():
        if os.path.isdir(path):
            num_files = len(os.listdir(path))
            print(f"✓ {name}: {path} ({num_files} folders)")
        else:
            print(f"✗ {name}: {path} (NOT FOUND)")

    return required_dirs


# Test 2: List sample files
def test_list_files(required_dirs):
    """List sample files from each directory."""
    print("\n" + "=" * 60)
    print("TEST 2: Listing sample files")
    print("=" * 60)

    for name, path in required_dirs.items():
        print(f"\n{name} (top-level entries):")
        top_level_entries = sorted(os.listdir(path))
        for entry in top_level_entries[:3]:
            print(f"  - {entry} {'(dir)' if os.path.isdir(os.path.join(path, entry)) else '(file)'}")

        # Now, if it's a JSON directory, pick one of the top-level directories and list some actual json files inside it
        if name == "truncated_json" or name == "full_json":
            if top_level_entries:
                sample_sub_dir_name = top_level_entries[0]
                sample_sub_dir_path = os.path.join(path, sample_sub_dir_name)
                if os.path.isdir(sample_sub_dir_path):
                    print(f"  - (First 3 .json files in {sample_sub_dir_name}):")
                    json_files_in_subdir = []
                    # Use iglob to be efficient, only take the first 3
                    for json_file in glob.iglob(os.path.join(sample_sub_dir_path, '**', '*.json'), recursive=True):
                        json_files_in_subdir.append(json_file)
                        if len(json_files_in_subdir) >= 3:
                            break
                    for f in sorted(json_files_in_subdir):
                        print(f"    - {Path(f).name}")
                else:
                    print(f"    (No subdirectories to list JSON files in {sample_sub_dir_name})")
        elif top_level_entries: # For images/pointclouds, list first 3 files if they are directly in the root
            actual_files = sorted([f for f in glob.iglob(os.path.join(path, '*')) if os.path.isfile(f)])
            if actual_files:
                print(f"  - (First 3 files in {name}):")
                for f in actual_files[:3]:
                    print(f"    - {Path(f).name}")


# Test 3: Load sample data
def test_load_sample_data(required_dirs):
    """Try loading sample JSON files."""
    print("\n" + "=" * 60)
    print("TEST 3: Loading sample JSON data")
    print("=" * 60)

    sample_file = None
    truncated_dir = required_dirs["truncated_json"]
    top_level_entries = sorted(os.listdir(truncated_dir))

    if top_level_entries:
        sample_sub_dir_name = top_level_entries[0]
        sample_sub_dir_path = os.path.join(truncated_dir, sample_sub_dir_name)
        if os.path.isdir(sample_sub_dir_path):
            # Find the first JSON file in the sample subdirectory
            json_files_in_subdir = glob.glob(os.path.join(sample_sub_dir_path, '**', '*.json'), recursive=True)
            if json_files_in_subdir:
                sample_file = sorted(json_files_in_subdir)[0]

    if sample_file:
        try:
            with open(sample_file, 'r') as f:
                data = json.load(f)
            print(f"✓ Successfully loaded: {Path(sample_file).name}")
            print(f"  Data type: {type(data)}")
            if isinstance(data, dict):
                print(f"  Keys: {list(data.keys())[:5]}")
            elif isinstance(data, list):
                print(f"  Length: {len(data)}")
        except Exception as e:
            print(f"✗ Error loading {sample_file}: {e}")
    else:
        print("No sample JSON file found to load for testing.")


# Test 4: Create path lists
def test_create_path_lists(required_dirs):
    """Create path lists like in your training script."""
    print("\n" + "=" * 60)
    print("TEST 4: Creating path lists")
    print("=" * 60)

    truncated_dir = required_dirs["truncated_json"]
    full_dir = required_dirs["full_json"]

    # truncated_paths = []
    # full_paths = []

    truncated_paths = sorted(glob.glob(truncated_dir + "/**/*.json", recursive=True))
    full_paths = []
    for tr in truncated_paths:
        base = os.path.basename(tr).split("_tr_")[0] + ".json"
        # Reconstruct the full path preserving the subdirectory structure
        rel_dir = os.path.dirname(tr).replace(truncated_dir,"").lstrip("/")
        full_path = os.path.join(full_dir, rel_dir, base) if rel_dir else os.path.join(full_dir, base)
        full_paths.append(full_path)


    # top_level_entries = sorted(os.listdir(truncated_dir))
    # if top_level_entries:
    #     sample_sub_dir_name = top_level_entries[0]
    #     sample_sub_dir_path = os.path.join(truncated_dir, sample_sub_dir_name)
    #     if os.path.isdir(sample_sub_dir_path):
    #         # Find a few JSON files in the sample subdirectory
    #         sample_json_files = sorted(glob.glob(os.path.join(sample_sub_dir_path, '**', '*.json'), recursive=True))[:3]
    #         for tr_path in sample_json_files:
    #             relative_path = os.path.relpath(tr_path, truncated_dir)
    #             full_path = os.path.join(full_dir, relative_path)
    #             exists = os.path.exists(full_path)
    #             status = "✓" if exists else "✗"
    #             print(f"{status} {Path(tr_path).name} -> {Path(full_path).name}")
    #             truncated_paths.append(tr_path)
    #             full_paths.append(full_path)

    if not truncated_paths:
        print("No truncated JSON files found in sample subdirectory to create path lists.")
    else:
        print(f"Created path lists for {len(truncated_paths)} sample files.")

    return truncated_paths, full_paths


# Test 5: Test dataloader import (only in Colab with dependencies)
def test_dataloader_import():
    """Check whether ``get_autocomplete_dataloader`` imports from ``cad_mllm.data``.

    Returns:
        bool: ``True`` when the import succeeds; ``False`` when it raises
        ``ImportError`` (the error is printed, not propagated).
    """
    divider = "=" * 60
    print("\n" + divider)
    print("TEST 5: Testing dataloader import")
    print(divider)

    try:
        from cad_mllm.data import get_autocomplete_dataloader
    except ImportError as e:
        print(f"⚠ Could not import (expected if dependencies not installed): {e}")
        return False

    print("✓ Successfully imported get_autocomplete_dataloader")
    return True


# Test 6: Test encoders (only if models are loaded)
def test_encoders():
    """Check whether the two encoder classes import from ``cad_mllm.encoders``.

    Returns:
        bool: ``True`` when both ``ImageEncoder`` and
        ``MichelangeloPointEncoder`` import; ``False`` when the import raises
        ``ImportError`` (the error is printed, not propagated).
    """
    divider = "=" * 60
    print("\n" + divider)
    print("TEST 6: Testing encoder imports")
    print(divider)

    try:
        from cad_mllm.encoders import ImageEncoder, MichelangeloPointEncoder
    except ImportError as e:
        print(f"⚠ Could not import encoders: {e}")
        return False

    print("✓ Successfully imported ImageEncoder")
    print("✓ Successfully imported MichelangeloPointEncoder")
    return True


def main():
    """Run every check in order and print a summary of the import results.

    The directory, listing, loading and path-list checks run first; the
    resulting path lists are computed but not used further here. The summary
    depends only on whether both import checks returned ``True``: if so, a
    usage example is printed, otherwise a warning about failed imports.
    """
    rule = "=" * 60

    print("\n" + rule)
    print("CAD-MLLM DATALOADER TEST SCRIPT")
    print(rule)

    # Data checks: structure -> listing -> sample load -> path lists.
    required_dirs = test_directory_structure()
    test_list_files(required_dirs)
    test_load_sample_data(required_dirs)
    truncated_paths, full_paths = test_create_path_lists(required_dirs)

    # Import checks.
    dataloader_ok = test_dataloader_import()
    encoders_ok = test_encoders()

    print("\n" + rule)
    print("TEST SUMMARY")
    print(rule)

    # Failure path first, so the long usage example is not nested.
    if not dataloader_ok or not encoders_ok:
        print("⚠ Some imports failed, but the code structure should be correct")
        print("  This may be due to missing dependencies")
        print(rule)
        return

    print("✓ All imports successful!")
    print("\nYou can now use the dataloader like this:")
    print("""
from cad_mllm.data import get_autocomplete_dataloader
from cad_mllm.encoders import ImageEncoder, MichelangeloPointEncoder

# Initialize encoders (optional, can pass None)
image_encoder = ImageEncoder(freeze=True)

# Create dataloader
train_loader = get_autocomplete_dataloader(
    truncated_paths,
    full_paths,
    image_dir="/path/to/images",
    pc_dir="/path/to/point_clouds",
    tokenizer=tokenizer,
    image_encoder=image_encoder,
    batch_size=4
)

# Training loop
for batch in train_loader:
    batch = {k: (v.to(device) if torch.is_tensor(v) else v) for k, v in batch.items()}
    outputs = model(**batch)
    loss = outputs.loss
    loss.backward()
        """)
    print(rule)


# Run the full suite when this code executes as the top-level module. The
# captured output below this cell shows the guard is satisfied in the notebook.
if __name__ == "__main__":
    main()


CAD-MLLM DATALOADER TEST SCRIPT
TEST 1: Checking directory structure
✓ truncated_json: /content/drive/MyDrive/Omni-CAD-subset-complete/json_truncated (100 folders)
✓ full_json: /content/drive/MyDrive/Omni-CAD-subset-complete/json (100 folders)
✓ images: /content/drive/MyDrive/Omni-CAD-subset-complete/img (100 folders)
✓ point_clouds: /content/drive/MyDrive/Omni-CAD-subset-complete/pointcloud (100 folders)

TEST 2: Listing sample files

truncated_json (top-level entries):
  - 0000 (dir)
  - 0001 (dir)
  - 0002 (dir)
  - (First 3 .json files in 0000):
    - 00000347_00005_tr_03.json
    - 00000669_00019_tr_03.json
    - 00002009_00012_tr_01.json

full_json (top-level entries):
  - 0000 (dir)
  - 0001 (dir)
  - 0002 (dir)
  - (First 3 .json files in 0000):
    - 00007265_00005.json
    - 00007282_00001.json
    - 00009435_00002.json

images (top-level entries):
  - 0000 (dir)
  - 0001 (dir)
  - 0002 (dir)

point_clouds (top-level entries):
  - 0000 (dir)
  - 0001 (dir)
  - 0002 (dir)

TE