In [1]:
# Cell 3: Imports for downloading and loading the QUASR device database
import gzip
import json
from io import BytesIO
import os
import requests
import pandas as pd
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import numpy as np

In [None]:
# --- Output locations ---
# VMEC_DIR = "quasr_vmec_files"
SIMSOPT_DIR = 'quasr_simsopt_files'   # directory that receives the downloaded SIMSOPT JSON files
LOG_CSV = "quasr_log.csv"             # CSV file recording the per-device download status

# --- Download tuning ---
NUM_WORKERS = 128    # max_workers for the download thread pool
CHUNK_SIZE = 10000   # number of device IDs per chunk

MAX_RETRIES = 3      # attempts per URL before a download is reported as failed
RETRY_DELAY = 2  # seconds to wait before retrying a failed download

# Create the output directory up front (a no-op when it already exists).
# os.makedirs(VMEC_DIR, exist_ok=True)
os.makedirs(SIMSOPT_DIR, exist_ok=True)

# Fetch the gzip-compressed JSON device database and load it into a DataFrame.
url = "https://quasr.flatironinstitute.org/database.json.gz"
print('Downloading device database...')
# The timeout bounds how long we wait on a silent server (connect / between
# received bytes), not the total transfer time, so a stalled connection
# raises instead of hanging the notebook forever.
r = requests.get(url, timeout=60)
r.raise_for_status()

# Decompress in memory and parse the JSON payload as UTF-8 text.
with gzip.open(BytesIO(r.content), 'rt', encoding='utf-8') as f:
    data = json.load(f)

# The parsed JSON object is unpacked as keyword arguments to the DataFrame
# constructor -- presumably it carries keys such as columns/index/data;
# TODO confirm against the published database schema.
df = pd.DataFrame(**data)
print(f"Loaded {len(df)} devices.")

In [None]:
# Store the full device table in an HDF5 file under the 'full_dataset' key.
df.to_hdf('QUASR_Stellarators.h5', key='full_dataset')

# Uncomment to read previously stored tables back instead:
# df = pd.read_hdf('QUASR_Stellarators.h5', key = 'full_dataset')
# df_sampled = pd.read_hdf('QUASR_Stellarators.h5', key = 'general_filter_sample_3000')

In [4]:
# Cell 4: Apply filters to select matching devices
# Every active condition must hold for a row to be kept; the range checks
# are inclusive on both ends. `.copy()` detaches the result from `df`.
filtered = df.loc[
    (df["Nfourier_coil"] == 16)
    & (df["qs_error"] >= -4)
    # & (df["max_elongation"] <= 10)
    # & df["aspect_ratio"].between(4, 10)
    & df["nc_per_hp"].between(1, 6)
    & df["nfp"].between(1, 5)
].copy()

print(f"{len(filtered)} devices match your criteria.")

314309 devices match your criteria.


In [None]:
# Sampling for a smaller download set
# n_points = 3000
# indices = np.linspace(0, len(filtered)-1, n_points, dtype=int)
# df_sampled = filtered.iloc[indices]
# # df_sampled.to_hdf('QUASR_Stellarators.h5', key = 'general_filter_sample_3000')

In [None]:
# Cell 5: Define download helper functions
# def vmec_url(device_id):
#     '''
#     THIS WORKS
#     '''
#     pid = device_id.zfill(7)
#     return f"https://quasr.flatironinstitute.org/nml/{pid[:4]}/input.{pid}"

def simsopt_url(device_id):
    """Build the download URL of a device's SIMSOPT serial JSON file.

    Args:
        device_id: Device identifier, as a string or an integer. It is
            converted to text and left-padded with zeros to 7 characters.

    Returns:
        The URL string; the first four characters of the padded ID select
        the sub-directory and the full padded ID names the file.
    """
    # str() lets integer IDs (e.g. straight from a DataFrame column) work too;
    # for string input it is a no-op.
    pid = str(device_id).zfill(7)
    return f"https://quasr.flatironinstitute.org/simsopt_serials/{pid[:4]}/serial{pid}.json"


In [None]:
# Cell 6: Robust download with retries
def download_with_retries(url: str, path: str) -> bool:
    """Download `url` to `path`, retrying up to MAX_RETRIES times.

    The response body is first written to a temporary ``<path>.part`` file
    and then moved into place, so `path` only ever exists with complete
    content. This matters because callers treat the mere existence of
    `path` as proof of a finished download.

    Args:
        url: Address to fetch with an HTTP GET (30 second timeout).
        path: Destination file path.

    Returns:
        True once a response with status 200 has been saved to `path`;
        False if every attempt failed (non-200 status or an exception).
    """
    tmp_path = f"{path}.part"
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            r = requests.get(url, timeout=30)
            if r.status_code == 200:
                with open(tmp_path, 'wb') as f:
                    f.write(r.content)
                # Replace in one step so an interrupted write never leaves a
                # truncated file at the final location.
                os.replace(tmp_path, path)
                return True
            else:
                print(f"{url} returned status {r.status_code} (attempt {attempt})")
        except Exception as e:
            # Best effort: report the problem and fall through to the retry.
            print(f"Error on {url} (attempt {attempt}): {e}")
        # Only pause when another attempt follows; sleeping after the final
        # failure would just delay reporting it.
        if attempt < MAX_RETRIES:
            time.sleep(RETRY_DELAY)
    return False

In [None]:
# Cell 7: Prepare log and list of device IDs to download
# Resume from an existing log when there is one; otherwise start empty.
if os.path.exists(LOG_CSV):
    log_df = pd.read_csv(LOG_CSV, dtype=str)
else:
    log_df = pd.DataFrame(columns=["ID", 'simsopt_url', "status"])

# Only successful downloads count as done. Rows with any other status are
# dropped from the in-memory log so those devices are attempted again and
# logged afresh, instead of being skipped forever.
log_df = log_df[log_df["status"] == "success"].reset_index(drop=True)

processed = set(log_df["ID"])
# Change `filtered` here to draw the device IDs from a different DataFrame.
device_ids = [str(d) for d in filtered["ID"] if str(d) not in processed]
# Split the pending IDs into consecutive slices of at most CHUNK_SIZE.
chunks = [device_ids[i:i+CHUNK_SIZE] for i in range(0, len(device_ids), CHUNK_SIZE)]
print(f"{len(device_ids)} devices to download in {len(chunks)} chunks.")

3000 devices to download in 1 chunks.


In [None]:
# Cell 8: Chunked parallel download with retries
def process_device(dev_id):
    """Ensure the SIMSOPT file for one device exists locally.

    Skips the download when the target file is already present in
    SIMSOPT_DIR; otherwise fetches it with download_with_retries.

    Args:
        dev_id: Device identifier as a string.

    Returns:
        A log record: dict with keys "ID", 'simsopt_url' and "status"
        ("success" or 'failed').
    """
    pid = dev_id.zfill(7)
    # vmec_path = os.path.join(VMEC_DIR, f"input.{pid}")
    simsopt_path = os.path.join(SIMSOPT_DIR, f"input_{pid}.json")
    # vmec_ok = os.path.exists(vmec_path) or download_with_retries(vmec_url(dev_id), vmec_path)
    simsopt_ok = os.path.exists(simsopt_path) or download_with_retries(simsopt_url(dev_id), simsopt_path)
    status = "success" if simsopt_ok else 'failed'
    return {
        "ID": dev_id,
        # "vmec_url": vmec_url(dev_id),
        'simsopt_url': simsopt_url(dev_id),
        "status": status
    }


for idx, chunk in enumerate(chunks, start=1):
    print(f"\n=== Chunk {idx}/{len(chunks)}: {len(chunk)} devices ===")
    results = []

    with ThreadPoolExecutor(max_workers=NUM_WORKERS) as executor:
        # Map each future back to its device ID so a crashed task can still
        # be attributed and logged.
        futures = {executor.submit(process_device, dev): dev for dev in chunk}
        for fut in tqdm(as_completed(futures), total=len(futures), desc=f"Chunk {idx}"):
            dev_id = futures[fut]
            try:
                results.append(fut.result())
            except Exception as e:
                # An unexpected error in one task must not discard the
                # results of the whole chunk; record it as a failure.
                print(f"Unexpected error for device {dev_id}: {e}")
                results.append({
                    "ID": dev_id,
                    'simsopt_url': simsopt_url(dev_id),
                    "status": 'failed'
                })

    # Persist the log after every chunk so progress survives an interruption.
    log_df = pd.concat([log_df, pd.DataFrame(results)], ignore_index=True)
    log_df.to_csv(LOG_CSV, index=False)
    success = sum(r["status"] == "success" for r in results)
    print(f"Chunk {idx} completed: {success}/{len(results)} successful.")


=== Chunk 1/1: 3000 devices ===


Chunk 1: 100%|██████████| 3000/3000 [02:13<00:00, 22.56it/s]

Chunk 1 completed: 3000/3000 successful.





In [None]:
# Final report: per-status totals followed by the output location.
print("\nDownload process complete.")
status_counts = log_df["status"].value_counts()
print(status_counts)
# print(f"VMEC files in: {VMEC_DIR}")
print(f'SIMSOPt files in: {SIMSOPT_DIR}')
# The last rows of the log are displayed as the cell's output.
log_df.tail()


Download process complete.
status
success    3000
Name: count, dtype: int64
VMEC files in: quasr_vmec_files
SIMSOPt files in: quasr_simsopt_files


Unnamed: 0,ID,vmec_url,simsopt_url,status
2995,2622447,https://quasr.flatironinstitute.org/nml/2622/i...,https://quasr.flatironinstitute.org/simsopt_se...,success
2996,2655591,https://quasr.flatironinstitute.org/nml/2655/i...,https://quasr.flatironinstitute.org/simsopt_se...,success
2997,2657814,https://quasr.flatironinstitute.org/nml/2657/i...,https://quasr.flatironinstitute.org/simsopt_se...,success
2998,2545118,https://quasr.flatironinstitute.org/nml/2545/i...,https://quasr.flatironinstitute.org/simsopt_se...,success
2999,2533590,https://quasr.flatironinstitute.org/nml/2533/i...,https://quasr.flatironinstitute.org/simsopt_se...,success
