# Convert MATLAB Datasets to Python Format

This notebook converts MATLAB `.mat` files from the OUTPUT folder to Python NumPy format compatible with existing datasets.

**Usage:**
1. Set the input folder path (where the `.mat` files are located)
2. Set the output folder path (where converted datasets will be saved)
3. Optionally set a prefix for the converted dataset folder names (e.g. `test_` for test datasets) and the complex precision used for eigenvectors
4. Run all cells

The converted datasets will have the same format as the older datasets in the `data/` folder.


In [1]:
# Import required libraries
import numpy as np
import scipy.io as sio
import h5py
from pathlib import Path
from typing import Dict, Any, Tuple
import sys

# Import local modules
try:
    from mat73_loader import load_matlab_v73
except ImportError:
    try:
        sys.path.append(str(Path.cwd() / '2d-dispersion-py'))
        from mat73_loader import load_matlab_v73
    except ImportError:
        print("WARNING: mat73_loader not found. Make sure it's in the same directory or in 2d-dispersion-py/")
        raise


## Configuration

Set your input and output paths here:


In [None]:
# ============================================================================
# CONFIGURATION - Modify these paths as needed
# ============================================================================
# Every name defined in this cell (input_path, output_path, dataset_prefix,
# complex_precision) is read by the conversion and verification cells below.

# Path to folder containing .mat files (input).
# Only files matching '*.mat' directly inside this folder are converted.
input_folder = r"D:\Research\NO-2D-Metamaterials\OUTPUT\train dataset 8"

# Path to folder where converted datasets will be saved (output).
# One sub-folder per .mat file is created inside it.
output_folder = r"D:\Research\NO-2D-Metamaterials\data\train"

# Prefix prepended to each converted dataset folder name
# (e.g. "test_" to mark test datasets, "train_8_" for this training batch).
# Set to empty string "" or None if no prefix is needed.
dataset_prefix = "train_8_"

# Complex dtype for eigenvectors (options: 'complex128' or 'complex64').
# Any other value is silently treated as 'complex128' by convert_mat_file.
complex_precision = 'complex128'

# ============================================================================

# Convert to Path objects
input_path = Path(input_folder)
output_path = Path(output_folder)

# Echo the effective settings so they are recorded in the cell output
print(f"Input folder: {input_path}")
print(f"Output folder: {output_path}")
print(f"Dataset prefix: '{dataset_prefix}'")
print(f"Complex precision: {complex_precision}")
print()

# Validate input path.
# NOTE(review): exists() is also true for a regular file; is_dir() would be a
# stricter check - confirm whether that is wanted.
if not input_path.exists():
    raise FileNotFoundError(f"Input folder does not exist: {input_path}")

# Create output folder (and any missing parents) if it doesn't exist
output_path.mkdir(parents=True, exist_ok=True)
print(f"Output folder ready: {output_path}")


Input folder: D:\Research\NO-2D-Metamaterials\OUTPUT\train dataset 8
Output folder: D:\Research\NO-2D-Metamaterials\data_251117\train
Dataset prefix: 'train_8_'
Complex precision: complex128

Output folder ready: D:\Research\NO-2D-Metamaterials\data_251117\train


In [2]:
## Conversion Functions

def extract_scalar(val: Any) -> float:
    """Return ``val`` as a Python float.

    Anything that is not a NumPy array is passed straight to ``float()``.
    For arrays, a 0-d array contributes its single item and any other array
    contributes its first element in flattened (row-major) order.

    Raises:
        IndexError: if ``val`` is an empty array.
        TypeError / ValueError: if the selected value cannot be converted
            by ``float()``.
    """
    if not isinstance(val, np.ndarray):
        return float(val)

    # 0-d arrays cannot be indexed positionally, so use item() for them.
    first = val.item() if val.ndim == 0 else val.flatten()[0]
    return float(first)


def parse_const(const_raw: Dict) -> Dict[str, Any]:
    """Parse the MATLAB ``const`` structure into a plain Python dictionary.

    Args:
        const_raw: Either a mapping of field name to value, or a NumPy
            structured array (the form ``scipy.io.loadmat`` uses for MATLAB
            structs). For a structured array only its first record is read.

    Returns:
        A dict with one entry per field, converted as follows:
          * string arrays (dtype kind ``'U'`` or ``'S'``) -> one ``str`` with
            all elements concatenated, NUL characters removed and surrounding
            whitespace stripped;
          * other arrays with exactly one element -> ``float`` via
            ``extract_scalar``;
          * every other value (larger arrays, non-array objects) -> unchanged.
    """
    # Structured arrays have no .items(); turn the first record into a mapping.
    if isinstance(const_raw, np.ndarray) and const_raw.dtype.names:
        if const_raw.size == 0:
            return {}
        record = const_raw.reshape(-1)[0]
        const_raw = {name: record[name] for name in const_raw.dtype.names}

    const = {}

    for key, val in const_raw.items():
        if not isinstance(val, np.ndarray):
            const[key] = val
        elif val.dtype.kind in ('U', 'S'):
            # Checked before the size test: a one-element string array such
            # as array(['hello']) must not be sent through float().
            # Elements here are already str/bytes, so chr() must not be used.
            pieces = (
                item.decode('utf-8', errors='replace') if isinstance(item, bytes) else str(item)
                for item in val.flatten()
            )
            const[key] = ''.join(pieces).replace('\x00', '').strip()
        elif val.size == 1:
            const[key] = extract_scalar(val)
        else:
            const[key] = val

    return const


def embed_2const_wavelet(wavevector_x: np.ndarray, wavevector_y: np.ndarray, 
                         size: int = 32) -> np.ndarray:
    """Encode each (kx, ky) wavevector as a ``size`` x ``size`` spatial pattern.

    Pattern ``i`` is ``sin(2*pi*kx_i*X) * cos(2*pi*ky_i*Y)`` evaluated on a
    regular grid over [0, 1] x [0, 1] (endpoints included).

    Args:
        wavevector_x: Sequence of x-components; its length sets the number of
            patterns.
        wavevector_y: Sequence of y-components, indexed in step with
            ``wavevector_x``.
        size: Number of grid points along each axis.

    Returns:
        float32 array of shape ``(len(wavevector_x), size, size)``.
    """
    n_vectors = len(wavevector_x)

    # Both axes share the same sample points, so one linspace feeds the grid.
    axis = np.linspace(0, 1, size)
    grid_x, grid_y = np.meshgrid(axis, axis)

    encoded = np.zeros((n_vectors, size, size), dtype=np.float32)
    for idx in range(n_vectors):
        kx, ky = wavevector_x[idx], wavevector_y[idx]
        encoded[idx] = np.sin(2 * np.pi * kx * grid_x) * np.cos(2 * np.pi * ky * grid_y)

    return encoded


def embed_integer_wavelet(bands: np.ndarray, size: int = 32) -> np.ndarray:
    """Encode each band index as a ``size`` x ``size`` spatial pattern.

    The pattern for band ``b`` is ``sin(b*pi*X) * cos(b*pi*Y)`` evaluated on a
    regular grid over [0, 1] x [0, 1] (endpoints included).

    Args:
        bands: Sequence of band indices; one pattern is produced per entry.
        size: Number of grid points along each axis.

    Returns:
        float32 array of shape ``(len(bands), size, size)``.
    """
    axis = np.linspace(0, 1, size)
    grid_x, grid_y = np.meshgrid(axis, axis)

    encoded = np.zeros((len(bands), size, size), dtype=np.float32)
    # Each `plane` is a view into `encoded`, so writing to it fills the result.
    for plane, band in zip(encoded, bands):
        plane[...] = np.sin(band * np.pi * grid_x) * np.cos(band * np.pi * grid_y)

    return encoded


def reshape_eigenvectors_to_spatial(eigenvector_data: np.ndarray, N_pix: int) -> Tuple[np.ndarray, np.ndarray]:
    """Reshape eigenvector data from DOF format to spatial format and split into x/y components.

    The last axis is read as interleaved components: even entries form the
    first (x) component and odd entries the second (y) component. Each
    component is laid out row-major on an ``N_nodes`` x ``N_nodes`` grid, of
    which the leading ``N_pix`` x ``N_pix`` corner is kept.

    Args:
        eigenvector_data: Array of shape ``(N_struct, N_eig, N_wv, N_dof)``
            with ``N_dof == 2 * N_nodes**2``.
        N_pix: Side length of the returned spatial grids; must not exceed
            ``N_nodes``.

    Returns:
        ``(eigvec_x, eigvec_y)``, each a new C-contiguous array of shape
        ``(N_struct, N_wv, N_eig, N_pix, N_pix)`` with the input's dtype.

    Raises:
        ValueError: if the input is not 4-D, if ``N_dof`` is not twice a
            perfect square, or if ``N_pix`` is negative or larger than
            ``N_nodes``.
    """
    N_struct, N_eig, N_wv, N_dof = eigenvector_data.shape

    # Round before checking so a floating-point sqrt can never truncate a
    # perfect square to the integer below it.
    N_nodes = int(round(np.sqrt(N_dof / 2)))
    if 2 * N_nodes * N_nodes != N_dof:
        raise ValueError(
            f"N_dof={N_dof} is not 2 * N_nodes**2 for any integer N_nodes; "
            "cannot reshape eigenvectors to a square grid"
        )
    # Without this check a 1x1 node grid would be silently broadcast over the
    # whole N_pix x N_pix output.
    if not 0 <= N_pix <= N_nodes:
        raise ValueError(
            f"N_pix={N_pix} must be between 0 and N_nodes={N_nodes}"
        )

    # (N_struct, N_eig, N_wv, N_dof) -> (N_struct, N_wv, N_eig, N_dof), then
    # split the DOF axis into (row, col, component). Entry [r, c, k] is DOF
    # 2 * (r * N_nodes + c) + k, i.e. the same element the per-vector
    # dof[k::2].reshape(N_nodes, N_nodes)[r, c] indexing selects.
    nodal = eigenvector_data.transpose(0, 2, 1, 3).reshape(
        N_struct, N_wv, N_eig, N_nodes, N_nodes, 2
    )

    # copy() yields independent, C-contiguous outputs like freshly allocated arrays.
    eigvec_x = nodal[:, :, :, :N_pix, :N_pix, 0].copy()
    eigvec_y = nodal[:, :, :, :N_pix, :N_pix, 1].copy()

    return eigvec_x, eigvec_y


def load_mat_file(mat_path: Path) -> Dict[str, Any]:
    """Read a MATLAB ``.mat`` file and return its variables.

    ``scipy.io.loadmat`` is tried first (with ``squeeze_me=False``) and its
    ``__``-prefixed metadata entries are dropped. If it raises
    ``NotImplementedError``, the file is read with ``load_matlab_v73``
    instead and that loader's result is returned as-is.
    """
    path_text = str(mat_path)

    try:
        loaded = sio.loadmat(path_text, squeeze_me=False)
    except NotImplementedError:
        return load_matlab_v73(path_text, verbose=False)

    return {name: value for name, value in loaded.items() if not name.startswith('__')}


def convert_mat_file(mat_path: Path, output_dir: Path, complex_precision: str = 'complex128', 
                     dataset_prefix: str = "") -> Path:
    """Convert a single MATLAB file to Python format.

    Loads ``mat_path`` and writes one ``.npy`` file per quantity into
    ``output_dir / f"{dataset_prefix}{mat_path.stem}"`` (created if missing):

      * ``designs.npy`` (float16) and ``design_params.npy`` (float64,
        ``[N_eig, N_pix]``) - always written;
      * ``wavevectors.npy`` and ``waveforms.npy`` (float16) - only if the file
        contains ``WAVEVECTOR_DATA``;
      * ``eigenvalue_data.npy`` and ``bands_fft.npy`` (float16) - only if the
        file contains ``EIGENVALUE_DATA``;
      * ``eigenvector_data_x.npy`` / ``eigenvector_data_y.npy`` (complex) -
        only if the file contains ``EIGENVECTOR_DATA``.

    Args:
        mat_path: Path of the ``.mat`` file to convert.
        output_dir: Parent folder for the per-file output sub-folder.
        complex_precision: ``'complex128'``/``'128'`` or ``'complex64'``/``'64'``
            (case-insensitive). Any other string silently falls back to
            complex128.
        dataset_prefix: Text prepended to the output sub-folder name; an empty
            string adds nothing.

    Returns:
        The output sub-folder the files were written to.

    Note:
        ``data['designs']`` is read unconditionally, so a file without a
        ``designs`` variable fails at that lookup.
    """
    # Determine complex dtype
    if complex_precision.lower() in ('128', 'complex128'):
        eigen_complex_dtype = np.complex128
    elif complex_precision.lower() in ('64', 'complex64'):
        eigen_complex_dtype = np.complex64
    else:
        # Unrecognized value: fall back to full precision without warning
        eigen_complex_dtype = np.complex128
    
    # Load dataset
    data = load_mat_file(mat_path)
    
    # Extract const; N_pix / N_eig default to 32 / 6 when not provided
    if 'const' in data:
        const = parse_const(data['const'])
        N_pix = int(extract_scalar(const.get('N_pix', 32)))
        N_eig = int(extract_scalar(const.get('N_eig', 6)))
    else:
        const = {}
        N_pix = 32
        N_eig = 6
    
    # Extract designs
    designs = data['designs']
    
    # Handle different dimension orders; only the first pane (index 0) is kept
    if designs.ndim == 4:
        if designs.shape[1] == 3:  # (N_struct, 3, N_pix, N_pix)
            designs_first_pane = designs[:, 0, :, :]  # (N_struct, N_pix, N_pix)
        else:
            # presumably (N_pix, N_pix, 3, N_struct): take pane 0 on axis 2 and
            # move the last axis to the front - TODO confirm against the loader
            designs_first_pane = designs[:, :, 0, :].transpose(2, 0, 1)
    else:
        designs_first_pane = designs
    
    # NOTE(review): N_struct is not used again within this function
    N_struct = designs_first_pane.shape[0]
    
    # Create output subdirectory for this file
    file_stem = mat_path.stem
    if dataset_prefix:
        file_stem = f"{dataset_prefix}{file_stem}"
    file_output_dir = output_dir / file_stem
    file_output_dir.mkdir(exist_ok=True, parents=True)
    
    # Save designs (float16)
    np.save(file_output_dir / 'designs.npy', designs_first_pane.astype(np.float16))
    
    # Save design_params (float64), shape (2,): [N_eig, N_pix]
    # NOTE(review): the saved verification output in this notebook reports the
    # older dataset's design_params as shape (1, 6) - confirm downstream
    # loaders accept this 2-element layout.
    design_params = np.array([N_eig, N_pix], dtype=np.float64)
    np.save(file_output_dir / 'design_params.npy', design_params)
    
    # Extract wavevector data
    if 'WAVEVECTOR_DATA' in data:
        wavevector_data = data['WAVEVECTOR_DATA']
        
        # Handle different dimension orders: (N_struct, 2, N_wv) -> (N_struct, N_wv, 2)
        if wavevector_data.shape[1] == 2 and wavevector_data.shape[2] != 2:
            wavevector_data = wavevector_data.transpose(0, 2, 1)  # (N_struct, N_wv, 2)
        
        # NOTE(review): N_wv is not used again within this function
        N_wv = wavevector_data.shape[1]
        
        # Save wavevectors (float16)
        np.save(file_output_dir / 'wavevectors.npy', wavevector_data.astype(np.float16))
        
        # Compute waveforms from first structure's wavevectors
        # (assumes all structures share the same wavevector set - TODO confirm)
        waveforms = embed_2const_wavelet(
            wavevector_data[0, :, 0],
            wavevector_data[0, :, 1],
            size=N_pix
        )
        np.save(file_output_dir / 'waveforms.npy', waveforms.astype(np.float16))
    
    # Extract eigenvalue data
    if 'EIGENVALUE_DATA' in data:
        eigenvalue_data = data['EIGENVALUE_DATA']
        
        # Handle dimension order: (N_struct, N_eig, N_wv) -> (N_struct, N_wv, N_eig)
        if eigenvalue_data.shape[1] == N_eig and eigenvalue_data.shape[2] != N_eig:
            eigenvalue_data = eigenvalue_data.transpose(0, 2, 1)  # (N_struct, N_wv, N_eig)
        
        # Save eigenvalue data (float16)
        # NOTE(review): float16 overflows above ~65504 and keeps ~3 significant
        # digits - confirm the eigenvalue magnitudes fit.
        np.save(file_output_dir / 'eigenvalue_data.npy', eigenvalue_data.astype(np.float16))
        
        # Compute bands_fft for band indices 1..N_eig
        bands = np.arange(1, N_eig + 1)
        bands_fft = embed_integer_wavelet(bands, size=N_pix)
        np.save(file_output_dir / 'bands_fft.npy', bands_fft.astype(np.float16))
    
    # Extract eigenvector data
    if 'EIGENVECTOR_DATA' in data:
        eigenvector_data = data['EIGENVECTOR_DATA']
        
        # Handle structured dtype (real/imag): combine the fields into one complex array
        if eigenvector_data.dtype.names and 'real' in eigenvector_data.dtype.names:
            eigenvector_data = eigenvector_data['real'] + 1j * eigenvector_data['imag']
        
        # Bring the data to (N_struct, N_eig, N_wv, N_dof), the layout that
        # reshape_eigenvectors_to_spatial unpacks; axes 1 and 2 are swapped
        # only when N_eig is not already on axis 1.
        if eigenvector_data.shape[1] == N_eig and eigenvector_data.shape[2] != N_eig:
            pass  # Already in (N_struct, N_eig, N_wv, N_dof) format
        elif eigenvector_data.shape[1] != N_eig:
            eigenvector_data = eigenvector_data.transpose(0, 2, 1, 3)
        
        # Reshape to spatial format: two (N_struct, N_wv, N_eig, N_pix, N_pix) arrays
        eigvec_x, eigvec_y = reshape_eigenvectors_to_spatial(eigenvector_data, N_pix)
        
        # Save eigenvector components
        np.save(file_output_dir / 'eigenvector_data_x.npy', eigvec_x.astype(eigen_complex_dtype))
        np.save(file_output_dir / 'eigenvector_data_y.npy', eigvec_y.astype(eigen_complex_dtype))
    
    return file_output_dir


In [32]:
## Run Conversion
# Batch-convert every `.mat` file found directly inside the input folder.
# A failure on one file is reported and recorded; the remaining files still run.

import traceback

# sorted() orders the paths lexicographically (e.g. `_10` comes before `_2`).
mat_files = sorted(input_path.glob('*.mat'))

if mat_files:
    rule = "=" * 80

    print(f"Found {len(mat_files)} .mat files to convert")
    print(rule)

    converted_folders = []  # names of output sub-folders written successfully
    errors = []             # (file name, error text) for each failed file

    for i, mat_file in enumerate(mat_files, 1):
        print(f"[{i}/{len(mat_files)}] Converting: {mat_file.name}")
        try:
            output_subdir = convert_mat_file(
                mat_file,
                output_path,
                complex_precision=complex_precision,
                dataset_prefix=dataset_prefix or ""  # None means "no prefix"
            )
            converted_folders.append(output_subdir.name)
            print(f"    [OK] Saved to: {output_subdir}")
        except Exception as e:
            # Deliberately broad: keep going and show the full traceback.
            error_msg = f"    ERROR converting {mat_file.name}: {e}"
            print(error_msg)
            errors.append((mat_file.name, str(e)))
            traceback.print_exc()

    # Summary
    print("\n" + rule)
    print("Conversion Summary")
    print(rule)
    print(f"Input folder: {input_path}")
    print(f"Output folder: {output_path}")
    print(f"Files converted: {len(converted_folders)}/{len(mat_files)}")

    if converted_folders:
        print(f"\nSuccessfully converted datasets ({len(converted_folders)}):")
        for folder_name in converted_folders:
            print(f"  - {folder_name}")

    if errors:
        print(f"\nErrors ({len(errors)}):")
        for filename, error in errors:
            print(f"  - {filename}: {error}")

    print(rule)
else:
    print(f"ERROR: No .mat files found in {input_path}")


Found 40 .mat files to convert
[1/40] Converting: out_binarized_1.mat
    [OK] Saved to: D:\Research\NO-2D-Metamaterials\data_251117\train\train_8_out_binarized_1
[2/40] Converting: out_binarized_10.mat
    [OK] Saved to: D:\Research\NO-2D-Metamaterials\data_251117\train\train_8_out_binarized_10
[3/40] Converting: out_binarized_11.mat
    [OK] Saved to: D:\Research\NO-2D-Metamaterials\data_251117\train\train_8_out_binarized_11
[4/40] Converting: out_binarized_12.mat
    [OK] Saved to: D:\Research\NO-2D-Metamaterials\data_251117\train\train_8_out_binarized_12
[5/40] Converting: out_binarized_13.mat
    [OK] Saved to: D:\Research\NO-2D-Metamaterials\data_251117\train\train_8_out_binarized_13
[6/40] Converting: out_binarized_14.mat
    [OK] Saved to: D:\Research\NO-2D-Metamaterials\data_251117\train\train_8_out_binarized_14
[7/40] Converting: out_binarized_15.mat
    [OK] Saved to: D:\Research\NO-2D-Metamaterials\data_251117\train\train_8_out_binarized_15
[8/40] Converting: out_binarized_

In [33]:
# Verify a converted dataset (use first one if available)

def _peek_npy(path):
    """Return ``(shape, dtype)`` of a ``.npy`` file without loading its data.

    The file is opened memory-mapped (``mmap_mode='r'``), so only its header
    is needed to answer; the eigenvector files can be very large and reading
    them in full just to print their shape is wasteful. The mapping is local
    to this function and is dropped when it returns, so no handle is kept on
    the file.
    """
    mapped = np.load(path, mmap_mode='r')
    return mapped.shape, mapped.dtype


if 'converted_folders' in locals() and converted_folders:
    verify_folder = output_path / converted_folders[0]
    print(f"Verifying dataset: {verify_folder.name}")
    print("-" * 80)
    
    # Check all expected files exist
    expected_files = [
        'designs.npy',
        'design_params.npy',
        'wavevectors.npy',
        'waveforms.npy',
        'eigenvalue_data.npy',
        'eigenvector_data_x.npy',
        'eigenvector_data_y.npy',
        'bands_fft.npy'
    ]
    
    print("Files present:")
    for fname in expected_files:
        fpath = verify_folder / fname
        if fpath.exists():
            arr_shape, arr_dtype = _peek_npy(fpath)
            print(f"  ✓ {fname:25s} shape={str(arr_shape):20s} dtype={arr_dtype}")
        else:
            print(f"  ✗ {fname:25s} MISSING")
    
    # Compare with old dataset format
    print("\n" + "-" * 80)
    print("Format comparison with old dataset:")
    old_dataset_path = Path(r"D:\Research\NO-2D-Metamaterials\data\set_b1_1200n")
    if old_dataset_path.exists():
        print(f"Comparing with: {old_dataset_path.name}")
        for fname in expected_files:
            new_fpath = verify_folder / fname
            old_fpath = old_dataset_path / fname
            if new_fpath.exists() and old_fpath.exists():
                new_shape, new_dtype = _peek_npy(new_fpath)
                old_shape, old_dtype = _peek_npy(old_fpath)
                # The leading axis (number of structures) may differ between
                # datasets, so only the trailing dimensions are compared.
                shape_match = new_shape[1:] == old_shape[1:] if len(new_shape) == len(old_shape) else False
                dtype_match = new_dtype == old_dtype
                # design_params is exempt from the shape check: only its dtype is compared.
                status = "✓" if (shape_match or fname in ['design_params.npy']) and dtype_match else "?"
                print(f"  {status} {fname:25s} new={new_shape}, old={old_shape}")
    else:
        print("Old dataset not found for comparison")
else:
    print("No datasets converted yet. Run the conversion cell first.")


Verifying dataset: train_8_out_binarized_1
--------------------------------------------------------------------------------
Files present:
  ✓ designs.npy               shape=(150, 32, 32)        dtype=float16
  ✓ design_params.npy         shape=(2,)                 dtype=float64
  ✓ wavevectors.npy           shape=(150, 91, 2)         dtype=float16
  ✓ waveforms.npy             shape=(91, 32, 32)         dtype=float16
  ✓ eigenvalue_data.npy       shape=(150, 91, 6)         dtype=float16
  ✓ eigenvector_data_x.npy    shape=(150, 91, 6, 32, 32) dtype=complex128
  ✓ eigenvector_data_y.npy    shape=(150, 91, 6, 32, 32) dtype=complex128
  ✓ bands_fft.npy             shape=(6, 32, 32)          dtype=float16

--------------------------------------------------------------------------------
Format comparison with old dataset:
Comparing with: set_b1_1200n
  ✓ designs.npy               new=(150, 32, 32), old=(1200, 32, 32)
  ✓ design_params.npy         new=(2,), old=(1, 6)
  ? wavevectors.npy  