In [1]:
import os 
import pandas as pd
import numpy as np
from simsopt.field import Current
from simsopt.geo import SurfaceRZFourier
from simsopt._core import load
from pathlib import Path
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor
from tqdm import tqdm

2025-06-27 23:21:25.730297: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-27 23:21:25.744913: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-27 23:21:25.749072: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Directory that is scanned for input ``*.json`` files by datasets_to_tfrecords.
data_dir = Path('quasr_simsopt_files')
# Directory that receives the generated ``surfaces_chunk_XXX.tfrecord`` files.
output_surface_dir = Path('surface_tfrecords')

# Fixed number of rows every surface is stored with in process_surface: shorter
# coefficient vectors are zero-padded (mask = 0), longer ones are truncated.
MAX_COEFS = 441
# Divisor used to normalise a surface's nfp in process_surface.
# NOTE(review): presumably the largest nfp present in the dataset - confirm.
MAX_NFP = 5

In [3]:
def _bytes_feature(value):
    """Wrap one bytes value in a ``tf.train.Feature`` backed by a single-element BytesList."""
    wrapped = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=wrapped)

def _float_feature(value):
    """Flatten ``value`` (via its ``.flatten()`` method) into a FloatList-backed ``tf.train.Feature``."""
    flat_values = value.flatten()
    wrapped = tf.train.FloatList(value=flat_values)
    return tf.train.Feature(float_list=wrapped)

def _int_feature(value):
    """Wrap an iterable of integers in a ``tf.train.Feature`` backed by an Int64List."""
    wrapped = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=wrapped)

def serialize_surface(id: str, surface: np.ndarray, surface_mask: np.ndarray):
    """
    Serialise one surface sample into a ``tf.train.Example`` byte string.

    Args:
        id: Identifier of the sample; stored UTF-8 encoded under the key 'ID'.
        surface: Array of surface data; flattened and stored as floats under
            'surface_data'.
        surface_mask: Integer mask; stored as int64 values under 'surface_mask'.

    Returns:
        The serialised example, as produced by ``SerializeToString()``.
    """
    features = tf.train.Features(feature={
        'ID': _bytes_feature(id.encode('utf-8')),
        'surface_data': _float_feature(surface),
        'surface_mask': _int_feature(surface_mask),
    })
    return tf.train.Example(features=features).SerializeToString()

In [4]:
def _encode_surface_modes(x, m, n, nfp, max_coefs, max_nfp):
    """
    Pack a coefficient vector and its mode metadata into fixed-size arrays.

    Args:
        x: 1-D coefficient vector.
        m, n: 1-D arrays of mode numbers, indexed in parallel with ``x``.
        nfp: Number of field periods of the surface.
        max_coefs: Number of rows in the output; extra coefficients are
            dropped and missing rows are left as zeros.
        max_nfp: Divisor used to normalise ``nfp``.

    Returns:
        ``(surface_set, surface_mask)`` where ``surface_set`` is a float32
        array of shape ``(max_coefs, 5)`` with columns
        ``[m_norm, n_norm, is_cos, coeff_value, nfp_norm]`` and
        ``surface_mask`` is an int64 array of shape ``(max_coefs,)`` that is 1
        for filled rows and 0 for padding.
    """
    x = np.asarray(x)
    m = np.asarray(m)
    n = np.asarray(n)

    # Normalise mode indices; ``or 1`` avoids dividing by zero when all are 0.
    max_m = np.max(np.abs(m)) or 1
    max_n = np.max(np.abs(n)) or 1

    # Type flag per coefficient: the first half (rounded up) is flagged 1
    # (R_cos), the remainder 0 (Z_sin).
    # NOTE(review): this layout assumes a stellarator-symmetric surface whose
    # DOFs are ordered [R_cos..., Z_sin...] - confirm for the dataset in use.
    num_modes = (len(m) + 1) // 2
    is_cos = np.concatenate([
        np.ones(num_modes, dtype=np.float32),       # R_cos
        np.zeros(num_modes - 1, dtype=np.float32),  # Z_sin
    ])

    surface_set = np.zeros((max_coefs, 5), dtype=np.float32)
    surface_mask = np.zeros((max_coefs,), dtype=np.int64)

    # Fill only the rows that carry real data; the rest stay zero-padded.
    filled = min(len(x), max_coefs)
    surface_set[:filled, 0] = m[:filled] / max_m
    surface_set[:filled, 1] = n[:filled] / max_n
    surface_set[:filled, 2] = is_cos[:filled]
    surface_set[:filled, 3] = x[:filled]
    surface_set[:filled, 4] = float(nfp) / max_nfp
    surface_mask[:filled] = 1

    return surface_set, surface_mask


def process_surface(file_path):
    """
    Load one simsopt JSON file and serialise its last surface as a tf.train.Example.

    The surface is converted with ``to_RZFourier()`` and its coefficient
    vector ``x`` plus mode metadata are packed into a ``[MAX_COEFS, 5]`` array
    with columns ``[m_norm, n_norm, is_cos, coeff_value, nfp_norm]`` and an
    accompanying ``[MAX_COEFS]`` 0/1 mask (1 = real row, 0 = padding).

    Args:
        file_path: Path (or path string) of the ``.json`` file to load. The
            sample ID is the last 7 characters of the file name without its
            extension.

    Returns:
        The serialised example bytes, or ``None`` if anything failed (the
        error is printed so a single bad file does not abort a batch run).
    """
    try:
        # Taken from the file name only, so directory characters can never
        # leak into the ID when the name is short.
        surface_id = Path(file_path).stem[-7:]

        surfaces, _coils = load(str(file_path))
        # The last surface in the file is used (treated as the outer surface).
        rz_surface = surfaces[-1].to_RZFourier()  # the bottleneck in speed

        x = rz_surface.x  # the coeff vector
        num_coefs = len(x)
        if num_coefs > MAX_COEFS:
            # Make the truncation visible instead of printing a bare number.
            print(
                f"Warning: {file_path} has {num_coefs} coefficients "
                f"(> MAX_COEFS={MAX_COEFS}); extra coefficients are dropped"
            )

        surface_set, surface_mask = _encode_surface_modes(
            x, rz_surface.m, rz_surface.n, rz_surface.nfp, MAX_COEFS, MAX_NFP
        )
        return serialize_surface(surface_id, surface_set, surface_mask)

    except Exception as e:
        # Deliberate best-effort: report and let the caller filter out None.
        print(f"Failed on {file_path}: {e}")
        return None

In [None]:
def write_tfrecord_chunk(serialized_examples, output_path):
    """
    Write serialised examples to a single TFRecord file.

    Args:
        serialized_examples: Iterable of serialised example records; falsy
            entries (e.g. ``None``) are skipped.
        output_path: Destination file; converted to ``str`` for the writer.
    """
    target = str(output_path)
    with tf.io.TFRecordWriter(target) as writer:
        for record in filter(None, serialized_examples):
            writer.write(record)

def datasets_to_tfrecords(directory: Path, output_surface_dir: Path,
                          chunk_size=10000, num_workers=64):
    """
    Convert every ``*.json`` file in ``directory`` into chunked TFRecord files.

    Files are processed in sorted order so that the contents of each chunk are
    reproducible between runs (``Path.glob`` itself gives no ordering
    guarantee). Each chunk is processed by a pool of worker processes running
    ``process_surface``; files that fail (``None`` results) are dropped, and
    the rest are written to
    ``output_surface_dir / "surfaces_chunk_XXX.tfrecord"``.

    Args:
        directory: Directory that is searched (non-recursively) for ``*.json``.
        output_surface_dir: Output directory; created if missing.
        chunk_size: Maximum number of input files per TFRecord file (>= 1).
        num_workers: ``max_workers`` of the per-chunk ProcessPoolExecutor.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")

    # Sorted so chunk membership does not depend on filesystem listing order.
    files = sorted(directory.glob("*.json"))
    total_files = len(files)
    output_surface_dir.mkdir(parents=True, exist_ok=True)

    if total_files == 0:
        print(f"No .json files found in {directory}; nothing to convert.")
        return

    for start in range(0, total_files, chunk_size):
        chunk_index = start // chunk_size
        chunk_files = files[start:start + chunk_size]

        # A fresh pool per chunk, as before; all results are collected before
        # the pool is shut down.
        with ProcessPoolExecutor(max_workers=num_workers) as executor:
            serialized_examples = list(tqdm(
                executor.map(process_surface, chunk_files),
                total=len(chunk_files),
                desc=f"Chunk {chunk_index:03d}"
            ))

        # process_surface returns None for files it could not convert.
        serialized_surfaces = [ex for ex in serialized_examples if ex is not None]

        output_surface_path = output_surface_dir / f"surfaces_chunk_{chunk_index:03d}.tfrecord"

        write_tfrecord_chunk(serialized_surfaces, output_surface_path)
        print(f"✅ Saved {len(serialized_surfaces)} surface samples to {output_surface_dir}")

In [None]:
# Run the conversion: every *.json file in data_dir is processed and written
# as chunked TFRecord files under output_surface_dir (default chunking/workers).
datasets_to_tfrecords(directory=data_dir, output_surface_dir=output_surface_dir)

Chunk 000: 100%|██████████| 3000/3000 [15:56<00:00,  3.14it/s]

✅ Saved 3000 surface samples to surface_tfrecords



