# Prepare Feynman Dataset

1. Download `Feynman_without_units.tar.gz` from https://space.mit.edu/home/tegmark/aifeynman.html.
2. Once obtained, place the archive in your `TORCH_DATA_DIR` directory (the current working directory is used if `TORCH_DATA_DIR` is not set). **DO NOT EXTRACT THE ARCHIVE**.
3. Run this notebook to generate `feynman.hdf5`.

In [None]:
import tarfile
import numpy as np
import h5py
import tqdm
from pathlib import Path
from os import environ

In [None]:
# Directory holding the raw archive: taken from TORCH_DATA_DIR, falling back
# to the current working directory when the variable is not set.
data_directory = Path(environ.get("TORCH_DATA_DIR", "."))
data_directory = data_directory.expanduser().resolve()

# The archive is read in place (it is never extracted to disk).
unprocessed_dataset = data_directory.joinpath("Feynman_without_units.tar.gz")

### 2. Only use some Filenames
We ignore equations with fewer than two variables, since they are trivial:
a KAN can solve them with a single B-spline.

In [None]:
# Archive member names of the equations to keep, grouped by volume prefix
# (I / II / III). The overall order is kept unchanged because the files are
# processed, and randomly subsampled, in exactly this sequence.
_volume_i = [
    "I.6.2",
    "I.6.2b",
    "I.8.14",
    "I.9.18",
    "I.11.19",
    "I.12.11",
    "I.13.12",
    "I.15.3x",
    "I.15.3t",
    "I.16.6",
    "I.18.4",
    "I.26.2",
    "I.27.6",
    "I.29.16",
    "I.30.3",
    "I.30.5",
    "I.32.17",
    "I.37.4",
    "I.40.1",
    "I.44.4",
    "I.50.26",
]
_volume_ii = [
    "II.2.42",
    "II.6.15a",
    "II.11.7",
    "II.11.27",
    "II.11.28",
    "II.34.29b",
    "II.35.18",
    "II.36.38",
    "II.38.3",
]
_volume_iii = [
    "III.9.52",
    "III.10.19",
    "III.15.27",
    "III.17.37",
]

filenames = [*_volume_i, *_volume_ii, *_volume_iii]

### 3. Load and save the data
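We load each equation's table from the archive, keep a random subsample of at most 100,000 rows, and save it in a format that is easier to work with: one HDF5 group per equation, with the input columns stored as `x` and the last column stored as `y`.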

In [None]:
# Maximum number of rows kept per equation; files with fewer rows are kept
# whole (the slice below simply returns everything available).
n_samples = 100_000
# Seed the global NumPy RNG once so the subsamples are reproducible.
np.random.seed(0)

# Write under a temporary name and only move the file into place at the end,
# once every equation has been written.
feynman_filename = data_directory / "feynman_tmp.hdf5"

# The archive is opened first so that a missing or unreadable archive fails
# before an (empty) temporary HDF5 file is created.
with tarfile.open(unprocessed_dataset.as_posix(), "r:gz") as tar, h5py.File(
    feynman_filename, "w"
) as f:
    for filename in tqdm.tqdm(filenames):
        member = tar.getmember("Feynman_without_units/" + filename)
        # Read the whitespace-separated table straight from the archive and
        # close the extracted stream as soon as it has been parsed.
        with tar.extractfile(member) as raw:
            matrix = np.loadtxt(raw)
        # Random subset of row indices, at most n_samples of them.
        indices = np.random.permutation(matrix.shape[0])[:n_samples]
        group = f.create_group(filename)
        # All columns except the last are stored as inputs "x" ...
        group.create_dataset(
            "x", dtype="float32", data=matrix[indices, :-1], compression="gzip"
        )
        # ... and the last column is stored as the target "y".
        group.create_dataset(
            "y", dtype="float32", data=matrix[indices, -1], compression="gzip"
        )

# `replace` overwrites an existing feynman.hdf5 on every platform, whereas
# `rename` raises FileExistsError on Windows when the notebook is re-run.
feynman_filename.replace(data_directory / "feynman.hdf5")