In [4]:
# Extract a single PT file
import tarfile

# Iterate the TarFile itself rather than calling getmembers(): iteration reads
# member headers lazily, so the loop can stop at the first .pt entry without
# first scanning the entire archive.
with tarfile.open("./dataspace/polyhaven_tiny/cubediff_train.tar", "r") as tar:
    for member in tar:
        # Only regular files are useful here; skip directories or links whose
        # name merely happens to end in ".pt".
        if member.isfile() and member.name.endswith('.pt'):
            tar.extract(member, path="/tmp")
            print(f"Extracted {member.name} to /tmp")
            break
    else:
        # Reached only when the loop never hit `break`, i.e. nothing matched.
        print("No .pt file found in the archive")

Extracted quarry_01_puresky.quarry_01_puresky.pt to /tmp


In [5]:
# Try to load it with different methods
import torch
import numpy as np
import pickle

file_path = "/tmp/quarry_01_puresky.quarry_01_puresky.pt"  # Use the actual extracted file name


def describe_loaded(obj):
    """Return the "<type>, shape: <shape>" summary printed for each loader.

    Objects that have no ``shape`` attribute are reported as 'no shape'.
    """
    shape = obj.shape if hasattr(obj, 'shape') else 'no shape'
    return f"{type(obj)}, shape: {shape}"


# Each loader stores its result under its own name so that a later attempt
# cannot overwrite the object an earlier attempt loaded successfully.
# NOTE(review): torch.load, np.load(allow_pickle=True) and pickle.load can all
# unpickle arbitrary objects -- only point this cell at files you trust.

# Try torch.load
try:
    # map_location="cpu" keeps this probe usable on machines without a GPU,
    # regardless of the device the object was saved from.
    data = torch.load(file_path, map_location="cpu")
    print(f"Torch.load succeeded: {describe_loaded(data)}")
except Exception as e:
    print(f"Torch.load failed: {e}")

# Try numpy load
try:
    numpy_data = np.load(file_path, allow_pickle=True)
    try:
        print(f"Numpy load succeeded: {describe_loaded(numpy_data)}")
    finally:
        # For zip archives np.load returns an NpzFile that keeps the file
        # open; release it. Plain arrays have no close() and are left alone.
        close_numpy_data = getattr(numpy_data, "close", None)
        if callable(close_numpy_data):
            close_numpy_data()
except Exception as e:
    print(f"Numpy load failed: {e}")

# Try pickle
try:
    with open(file_path, 'rb') as f:
        pickle_data = pickle.load(f)
    print(f"Pickle succeeded: {describe_loaded(pickle_data)}")
except Exception as e:
    print(f"Pickle failed: {e}")

Torch.load succeeded: <class 'torch.Tensor'>, shape: torch.Size([6, 4, 64, 64])
Numpy load succeeded: <class 'numpy.lib.npyio.NpzFile'>, shape: no shape
Pickle failed: A load persistent id instruction was encountered,
but no persistent_load function was specified.


In [3]:
import os
import tarfile


def _base_name(member_name):
    """Return the member's file name with directories and the last extension removed."""
    return os.path.splitext(os.path.basename(member_name))[0]


# Verify the tar structure: count .pt and .txt members and how many share a base name.
# The archive only has to stay open while the member names are listed;
# everything afterwards works on plain strings.
with tarfile.open("./dataspace/polyhaven_tiny/cubediff_train.tar", "r") as tar:
    files = tar.getnames()

pt_files = [name for name in files if name.endswith('.pt')]
txt_files = [name for name in files if name.endswith('.txt')]

print(f"Total files: {len(files)}")
print(f"PT files: {len(pt_files)}")
print(f"TXT files: {len(txt_files)}")

# Check for matching base names
pt_bases = list(map(_base_name, pt_files))
txt_bases = list(map(_base_name, txt_files))

matched = set(pt_bases) & set(txt_bases)
print(f"Matched pairs: {len(matched)} out of {len(pt_files)} PT files")

Total files: 1302
PT files: 651
TXT files: 651
Matched pairs: 651 out of 651 PT files
