In [1]:
# Imports for the QUASR download and TFRecord conversion pipeline
import gzip
import json
from io import BytesIO
import requests
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed, ProcessPoolExecutor
import time
import numpy as np
from simsopt.field import Current
from simsopt.geo import SurfaceRZFourier
from simsopt._core import load
from pathlib import Path
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
import tensorflow as tf
import shutil

2025-06-30 13:03:26.498665: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-06-30 13:03:26.524165: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-06-30 13:03:26.532326: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [None]:
# Pipeline configuration.
# SIMSOPT_DIR is the directory the download loop creates, fills and deletes; data_dir is the
# same directory as a Path for the TFRecord conversion. data_dir is derived from SIMSOPT_DIR
# so the two can never point at different places.
SIMSOPT_DIR = 'quasr_simsopt_files'
LOG_CSV = "quasr_log.csv"
data_dir = Path(SIMSOPT_DIR)
output_coil_dir = Path('coil_tfrecords')
output_surface_dir = Path('surface_tfrecords')
os.makedirs(SIMSOPT_DIR, exist_ok=True)

MAX_COILS = 6              # maximum number of coils stored per device
FEATURES_PER_COIL = 100    # width of each coil row
NUM_THREAD_WORKERS = 128   # thread pool size for downloads
NUM_PROCESS_WORKERS = 64   # process pool size for TFRecord conversion
CHUNK_SIZE = 10000         # devices per download/conversion chunk
MAX_RETRIES = 3            # download attempts per file
RETRY_DELAY = 2  # seconds

# NOTE(review): process_surface reads MAX_COEFS and MAX_NFP, but neither is assigned in this
# cell. If no other cell defines them, every surface conversion fails with a NameError that
# is caught and reported as "Failed on ...". Define both here once the values are confirmed.

In [None]:
# Download the gzipped QUASR device database and load it into a DataFrame.
url = "https://quasr.flatironinstitute.org/database.json.gz"
print('Downloading device database...')
# A timeout keeps a stalled connection from hanging the notebook indefinitely;
# 30 s matches the per-device downloader.
r = requests.get(url, timeout=30)
r.raise_for_status()

# The response body is gzip-compressed JSON; decompress in memory and parse.
with gzip.open(BytesIO(r.content), 'rt', encoding='utf-8') as f:
    data = json.load(f)

# The parsed JSON object is passed as keyword arguments to the DataFrame constructor.
df = pd.DataFrame(**data)
print(f"Loaded {len(df)} devices.")

# Keep a local copy of the full, unfiltered table.
df.to_hdf('QUASR_Stellarators.h5', key = 'full_dataset')

In [None]:
# Apply filters to select matching devices.
# Criteria currently disabled (kept for reference):
#   (df["max_elongation"] <= 10)
#   (df["aspect_ratio"] >= 4) & (df["aspect_ratio"] <= 10)
selection_mask = (
    df["Nfourier_coil"].eq(16)
    & df['qs_error'].ge(-4)
    & df["nc_per_hp"].between(1, 6)   # inclusive on both ends
    & df["nfp"].between(1, 5)         # inclusive on both ends
)
filtered = df.loc[selection_mask].copy()

print(f"{len(filtered)} devices match your criteria.")

314309 devices match your criteria.


In [None]:
def simsopt_url(device_id):
    """Return the QUASR URL of the simsopt JSON file for a device.

    Args:
        device_id: Device ID as a string. It is left-padded with zeros to seven
            characters; the first four characters of the padded ID form the
            sub-directory on the server.

    Returns:
        The full URL as a string.
    """
    padded_id = device_id.zfill(7)
    folder = padded_id[:4]
    return (
        "https://quasr.flatironinstitute.org/simsopt_serials/"
        f"{folder}/serial{padded_id}.json"
    )

In [None]:
# Cell 6: Robust download with retries
def download_with_retries(url: str, path: str) -> bool:
    """Download ``url`` to ``path``, making up to MAX_RETRIES attempts.

    The body is first written to ``<path>.part`` and then moved into place, so a
    file at ``path`` is always a complete download. Callers that test
    ``os.path.exists(path)`` therefore never pick up a half-written file.

    Args:
        url: Address to fetch.
        path: Destination file path.

    Returns:
        True once a response with status 200 has been saved, False if every
        attempt failed. Failures are printed, not raised.
    """
    tmp_path = f"{path}.part"
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, timeout=30)
            if r.status_code == 200:
                with open(tmp_path, 'wb') as f:
                    f.write(r.content)
                # Atomic rename: the final name only ever refers to a fully written file.
                os.replace(tmp_path, path)
                return True
            print(f"{url} returned status {r.status_code} (attempt {attempt})")
        except Exception as e:
            # Best effort: report the error and move on to the next attempt.
            print(f"Error on {url} (attempt {attempt}): {e}")
        # Only wait when another attempt follows; sleeping after the last one is wasted time.
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_DELAY)
    return False

In [None]:
# Prepare the log and the list of device IDs still to download.
# Every ID already present in the log (whatever its status) is skipped.
log_df = (
    pd.read_csv(LOG_CSV, dtype=str)
    if os.path.exists(LOG_CSV)
    else pd.DataFrame(columns=["ID", 'simsopt_url', "status"])
)

processed = set(log_df["ID"])
# This is where you change which df you want the device ids from.
device_ids = [dev for dev in map(str, filtered["ID"]) if dev not in processed]
chunks = [
    device_ids[start:start + CHUNK_SIZE]
    for start in range(0, len(device_ids), CHUNK_SIZE)
]
print(f"{len(device_ids)} devices to download in {len(chunks)} chunks.")

3000 devices to download in 1 chunks.


In [None]:
def process_device(dev_id):
    """Make sure the simsopt JSON of one device is on disk and return its log record.

    The file is stored as ``input_<zero-padded id>.json`` inside SIMSOPT_DIR. If it
    already exists no download is attempted.

    Args:
        dev_id: Device ID as a string.

    Returns:
        Dict with keys "ID", "simsopt_url" and "status" ("success" or "failed").
    """
    padded_id = dev_id.zfill(7)
    source_url = simsopt_url(dev_id)
    target = os.path.join(SIMSOPT_DIR, f"input_{padded_id}.json")

    if os.path.exists(target):
        available = True
    else:
        available = download_with_retries(source_url, target)

    return {
        "ID": dev_id,
        'simsopt_url': source_url,
        "status": "success" if available else 'failed',
    }

In [None]:
def _bytes_feature(value):
    """Wrap a single bytes value in a tf.train.Feature holding a one-element BytesList."""
    payload = tf.train.BytesList(value=[value])
    return tf.train.Feature(bytes_list=payload)

def _float_feature(value):
    """Wrap the flattened contents of an array in a tf.train.Feature holding a FloatList."""
    flat_values = value.flatten()
    return tf.train.Feature(float_list=tf.train.FloatList(value=flat_values))

def _int_feature(value):
    """Wrap a sequence of integers in a tf.train.Feature holding an Int64List."""
    payload = tf.train.Int64List(value=value)
    return tf.train.Feature(int64_list=payload)

def serialize_coil(id: str, coils: np.ndarray, coil_mask: np.ndarray):
    """Serialize one device's coil data as a tf.train.Example.

    Args:
        id: Device identifier; stored UTF-8 encoded under 'ID'.
        coils: Array stored flattened under 'coil_data' (described by the
            original author as shape (N+1, D), dtype float32).
        coil_mask: Integer sequence stored under 'coil_mask'.

    Returns:
        The serialized Example.
    """
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'ID': _bytes_feature(id.encode('utf-8')),
                'coil_data': _float_feature(coils),
                'coil_mask': _int_feature(coil_mask),
            }
        )
    )
    return example.SerializeToString()
    
def serialize_surface(id: str, surface: np.ndarray, surface_mask: np.ndarray):
    """Serialize one device's surface data as a tf.train.Example.

    Args:
        id: Device identifier; stored UTF-8 encoded under 'ID'.
        surface: Array stored flattened under 'surface_data'.
        surface_mask: Integer sequence stored under 'surface_mask'.

    Returns:
        The serialized Example.
    """
    example = tf.train.Example(
        features=tf.train.Features(
            feature={
                'ID': _bytes_feature(id.encode('utf-8')),
                'surface_data': _float_feature(surface),
                'surface_mask': _int_feature(surface_mask),
            }
        )
    )
    return example.SerializeToString()

In [None]:
def process_coils(file_path):
    """Convert one simsopt JSON file into a serialized coil example.

    The stored array has shape (MAX_COILS + 1, FEATURES_PER_COIL), dtype float32:
      * row i (i < num_coils): the coil's current value followed by the last
        FEATURES_PER_COIL - 1 entries of ``coils[i].x``;
      * remaining coil rows: zeros (flagged 0 in the mask);
      * last row: log10 of the first coil's current scale, repeated across the row.

    Args:
        file_path: Path of a file named like ``input_<7-char id>.json``; the last
            seven characters of the file stem are used as the device ID.

    Returns:
        Serialized tf.train.Example, or None if loading or conversion fails
        (the error is printed).
    """
    try:
        device_id = Path(file_path).stem[-7:]
        surfaces, coils = load(str(file_path))
        nfp = surfaces[-1].nfp

        # Keep only the first len(coils) / (2 * nfp) coils, capped at MAX_COILS.
        # NOTE(review): presumably these are the unique coils of one half field
        # period and the rest are symmetry copies — confirm against the QUASR format.
        num_coils = min(len(coils) // (nfp * 2), MAX_COILS)

        # One column holds the current; the rest of the row holds coil parameters.
        # Derived from FEATURES_PER_COIL so the slice cannot drift from the row width.
        n_params = FEATURES_PER_COIL - 1

        coil_array = np.zeros((MAX_COILS + 1, FEATURES_PER_COIL), dtype=np.float32)
        for i in range(num_coils):
            # NOTE(review): assumes the trailing n_params entries of coil.x are the
            # curve coefficients — verify the dof ordering in simsopt.
            params = coils[i].x[-n_params:]
            curr = np.array(coils[i].current.current_to_scale.current)
            coil_array[i] = np.append(curr, params)

        # Context token: log10 of the current scale broadcast over the last row.
        log_scaler = np.log10(coils[0].current.scale)
        coil_array[-1] = np.full((1, FEATURES_PER_COIL), log_scaler, dtype=np.float32)

        # 1 for rows that hold a real coil, 0 for padding rows.
        coil_mask = (np.arange(MAX_COILS) < num_coils).astype(np.int64)

        return serialize_coil(device_id, coil_array, coil_mask)

    except Exception as e:
        print(f"Failed on {file_path}: {e}")
        return None

In [None]:
def process_surface(file_path):
    """Convert the last surface of a simsopt JSON file into a serialized example.

    The surface is converted with ``to_RZFourier()`` and its coefficient vector
    ``x`` is stored as an array of shape (MAX_COEFS, 5), dtype float32, with columns

        [m / max|m|, n / max|n|, is_cos, coefficient value, nfp / MAX_NFP]

    Rows beyond the number of coefficients stay zero and are flagged 0 in the
    mask. If there are more than MAX_COEFS coefficients, the extras are dropped
    and a message is printed.

    MAX_COEFS and MAX_NFP are read from module scope.

    Args:
        file_path: Path of a file named like ``input_<7-char id>.json``; the last
            seven characters of the file stem are used as the device ID.

    Returns:
        Serialized tf.train.Example, or None if loading or conversion fails
        (the error is printed).
    """
    try:
        device_id = Path(file_path).stem[-7:]
        surfaces, _ = load(str(file_path))
        s = surfaces[-1].to_RZFourier()  # the bottleneck in speed

        x = s.x  # coefficient vector
        m = s.m  # mode numbers
        n = s.n

        num_coefs = len(x)
        if num_coefs > MAX_COEFS:
            print(f"{file_path}: {num_coefs} coefficients exceed MAX_COEFS={MAX_COEFS}; truncating")

        # Normalise mode indices; fall back to 1 to avoid dividing by zero.
        max_m = np.max(np.abs(m)) or 1
        max_n = np.max(np.abs(n)) or 1

        # Type flags: the first num_modes entries are marked cosine (R_cos), the
        # remaining num_modes - 1 sine (Z_sin).
        # NOTE(review): assumes a stellarator-symmetric dof layout with len(x) == len(m) — confirm.
        num_modes = (len(m) + 1) // 2
        is_cos = np.concatenate([
            np.ones(num_modes, dtype=bool),
            np.zeros(num_modes - 1, dtype=bool),
        ])

        count = min(num_coefs, MAX_COEFS)
        surface_set = np.zeros((MAX_COEFS, 5), dtype=np.float32)
        surface_set[:count, 0] = m[:count] / max_m
        surface_set[:count, 1] = n[:count] / max_n
        surface_set[:count, 2] = is_cos[:count]
        surface_set[:count, 3] = x[:count]
        surface_set[:count, 4] = float(s.nfp) / MAX_NFP

        surface_mask = np.zeros((MAX_COEFS,), dtype=np.int64)
        surface_mask[:count] = 1

        return serialize_surface(device_id, surface_set, surface_mask)

    except Exception as e:
        print(f"Failed on {file_path}: {e}")
        return None

In [None]:
def write_tfrecord_chunk(serialized_examples, output_path):
    """Write every non-empty serialized example to a TFRecord file at ``output_path``.

    Falsy entries (None or empty bytes) are skipped.
    """
    with tf.io.TFRecordWriter(str(output_path)) as writer:
        for record in filter(None, serialized_examples):
            writer.write(record)

def datasets_to_tfrecords(directory: Path, output_dir: Path, _type, idx,
                               chunk_size=CHUNK_SIZE, num_workers=NUM_PROCESS_WORKERS):
    """Convert every ``*.json`` file in ``directory`` into one TFRecord file.

    Args:
        directory: Folder containing the simsopt JSON files.
        output_dir: Folder for the TFRecord file; created if missing.
        _type: 'coils' (uses process_coils) or 'surfaces' (uses process_surface).
        idx: Chunk number, used in the output file name and the progress bar.
        chunk_size: Unused; kept so existing calls stay valid.
        num_workers: Number of worker processes.

    Raises:
        ValueError: If ``_type`` is neither 'coils' nor 'surfaces'.
    """
    processors = {'coils': process_coils, 'surfaces': process_surface}
    if _type not in processors:
        # Previously this fell through and failed later on an unbound local.
        raise ValueError(f"Unknown _type {_type!r}; expected 'coils' or 'surfaces'")
    process_file = processors[_type]

    # Sorted so the record order is reproducible across runs.
    files = sorted(directory.glob("*.json"))
    output_dir.mkdir(parents=True, exist_ok=True)

    with ProcessPoolExecutor(max_workers=num_workers) as executor:
        serialized_examples = list(tqdm(
            executor.map(process_file, files),
            total=len(files),
            desc=f"Chunk {idx:03d}"
        ))

    # Workers return None for files they could not convert.
    serialized_examples = [ex for ex in serialized_examples if ex is not None]

    output_path = output_dir / f"{_type}_chunk_{idx:03d}.tfrecord"

    write_tfrecord_chunk(serialized_examples, output_path)
    print(f"✅ Saved {len(serialized_examples)} {_type} samples to {output_dir}")


In [None]:
# For each chunk: download its files, convert them to coil and surface TFRecords,
# append the results to the log, then delete the downloaded files.
# NOTE(review): idx restarts at 1 on every run, so resuming after an interrupted run
# writes to the same *_chunk_001.tfrecord names and overwrites earlier output — confirm
# whether that is intended.
for idx, chunk in enumerate(chunks, start=1):
    print(f"\n=== Chunk {idx}/{len(chunks)}: {len(chunk)} devices ===")
    results = []

    # The directory is removed at the end of each iteration, so recreate it here.
    os.makedirs(SIMSOPT_DIR, exist_ok=True)

    with ThreadPoolExecutor(max_workers=NUM_THREAD_WORKERS) as executor:
        futures = {executor.submit(process_device, dev): dev for dev in chunk}
        for fut in tqdm(as_completed(futures), total=len(futures), desc=f"Chunk {idx}"):
            results.append(fut.result())

    datasets_to_tfrecords(directory=data_dir, output_dir=output_coil_dir, _type='coils', idx=idx)
    datasets_to_tfrecords(directory=data_dir, output_dir=output_surface_dir, _type='surfaces', idx=idx)

    # Persist the log after every chunk so an interrupted run can skip finished devices.
    log_df = pd.concat([log_df, pd.DataFrame(results)], ignore_index=True)
    log_df.to_csv(LOG_CSV, index=False)
    success = sum(r["status"] == "success" for r in results)
    print(f"Chunk {idx} completed: {success}/{len(results)} successful.")

    # Free disk space before the next chunk; a failed cleanup is reported but not fatal.
    try:
        shutil.rmtree(SIMSOPT_DIR)
    except OSError as e:
        print(f'Error deleting directory: {e}')