In [18]:
import subprocess
import warnings

# Get the Zenodo access token from 1Password through the `op` command-line tool.
_op_result = subprocess.run(
    ["op", "item", "get", "Zenodo", "--fields", "credential"],
    capture_output=True,
)
ACCESS_TOKEN = _op_result.stdout.decode().strip()

# A failed `op` call (not signed in, item missing, ...) leaves ACCESS_TOKEN empty.
# Surface that here, together with the tool's own error text, instead of letting
# later API requests fail for a reason that is hard to trace back to this cell.
# Only stderr is reported; the token itself is never echoed.
if _op_result.returncode != 0 or not ACCESS_TOKEN:
    warnings.warn(
        "Could not read the Zenodo access token from 1Password "
        f"(exit code {_op_result.returncode}): {_op_result.stderr.decode().strip()}"
    )

In [37]:
import requests
# Request the metadata of a single Zenodo record from the REST API.
record_id = "3164691"
repository_url = f"https://zenodo.org/api/records/{record_id}"
# The token is sent as the `access_token` query parameter.
response = requests.get(repository_url, params={'access_token': ACCESS_TOKEN})
print(response.status_code)
# The body is parsed as JSON regardless of the status code printed above;
# `data` is the dict that the following cell iterates over.
data = response.json()

200
{'created': '2019-05-22T19:44:25.250309+00:00', 'modified': '2020-01-24T19:24:53.854409+00:00', 'id': 3164691, 'conceptrecid': '2658763', 'doi': '10.5281/zenodo.3164691', 'conceptdoi': '10.5281/zenodo.2658763', 'doi_url': 'https://doi.org/10.5281/zenodo.3164691', 'metadata': {'title': 'Pythia8 Quark and Gluon Jets for Energy Flow', 'doi': '10.5281/zenodo.3164691', 'publication_date': '2019-05-02', 'description': '<p>Two&nbsp;datasets of quark and gluon jets generated with Pythia 8, one with all kinematically realizable quark jets and one that excludes charm and bottom quark jets (at the level of the hard process). The one without c and b jets was originally used in <a href="https://arxiv.org/abs/1810.05165">Energy Flow Networks: Deep Sets for Particle Jets</a>. Generation parameters are listed below:</p>\n\n<ul>\n\t<li>Pythia 8.226 (without bc jets), Pythia 8.235 (with bc jets),&nbsp;<span class="math-tex">\\(\\sqrt{s}=14\\,\\text{TeV} \\)</span></li>\n\t<li>Quarks&nbsp;from&nbsp;W

In [38]:
# Show every top-level field of the record metadata as one "key: value" line.
for key, value in data.items():
    print(key, value, sep=": ")

created: 2019-05-22T19:44:25.250309+00:00
modified: 2020-01-24T19:24:53.854409+00:00
id: 3164691
conceptrecid: 2658763
doi: 10.5281/zenodo.3164691
conceptdoi: 10.5281/zenodo.2658763
doi_url: https://doi.org/10.5281/zenodo.3164691
metadata: {'title': 'Pythia8 Quark and Gluon Jets for Energy Flow', 'doi': '10.5281/zenodo.3164691', 'publication_date': '2019-05-02', 'description': '<p>Two&nbsp;datasets of quark and gluon jets generated with Pythia 8, one with all kinematically realizable quark jets and one that excludes charm and bottom quark jets (at the level of the hard process). The one without c and b jets was originally used in <a href="https://arxiv.org/abs/1810.05165">Energy Flow Networks: Deep Sets for Particle Jets</a>. Generation parameters are listed below:</p>\n\n<ul>\n\t<li>Pythia 8.226 (without bc jets), Pythia 8.235 (with bc jets),&nbsp;<span class="math-tex">\\(\\sqrt{s}=14\\,\\text{TeV} \\)</span></li>\n\t<li>Quarks&nbsp;from&nbsp;WeakBosonAndParton:qg2gmZq, gluons from&n

In [39]:
import requests
from pathlib import Path
from tqdm import tqdm
import hashlib
import json
import datetime

def compute_md5(file_path):
    """
    Return the MD5 digest of a file as the string ``"md5:<hexdigest>"``.

    The file is opened in binary mode and consumed in 4096-byte pieces, so it
    is never held in memory as a whole.
    """
    digest = hashlib.md5()
    with open(file_path, "rb") as stream:
        while True:
            block = stream.read(4096)
            if not block:
                # An empty read marks the end of the file.
                break
            digest.update(block)
    return "md5:{}".format(digest.hexdigest())

def _download_file(url, local_filename, pbar):
    """Stream ``url`` into ``local_filename``, advancing ``pbar`` by the bytes written."""
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(local_filename, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
                pbar.update(len(chunk))


def download_zenodo_record(record_id, save_dir=".", access_token=ACCESS_TOKEN, do_not_save=False, max_files=-1):
    """
    Fetch the metadata of a Zenodo record and, optionally, download its files.

    Parameters
    ----------
    record_id : int or str
        Identifier of the record; interpolated into the API URL.
    save_dir : str or Path
        Directory receiving ``<record_id>.json`` (the record metadata plus a
        ``downloaded_on`` date) and the downloaded files. It is created,
        including parents, when missing.
    access_token : str
        Sent as the ``access_token`` query parameter of the metadata request.
        The default is the notebook-level token captured when this function
        was defined.
    do_not_save : bool
        When true, nothing is written to disk: only the file names of the
        record are printed.
    max_files : int
        When positive, only the first ``max_files`` entries of the record's
        file list are considered. Entries skipped because a valid local copy
        exists still count toward this limit. Non-positive means no limit.

    Returns
    -------
    None
        Progress and failures of the metadata request are reported via
        ``print``.

    Raises
    ------
    requests.HTTPError
        If a file download answers with an error status.
    """
    repository_url = f"https://zenodo.org/api/records/{record_id}"
    # Use the caller-supplied token rather than the module-level constant,
    # so that passing `access_token=` actually takes effect.
    response = requests.get(repository_url, params={'access_token': access_token})
    if response.status_code != 200:
        print(f"Failed to get repository info: {response.status_code}")
        print(response.text)
        return

    data = response.json()
    files = data["files"]
    print(f"{record_id} has {len(files)} files.")

    if do_not_save:
        print(", ".join([file_info["key"] for file_info in files]))
        return

    # Create the target directory before anything is written into it;
    # the metadata file below would otherwise fail for a new directory.
    target_dir = Path(save_dir)
    target_dir.mkdir(parents=True, exist_ok=True)

    # add download date to the metadata
    today = datetime.datetime.now().strftime("%Y-%m-%d")
    data["downloaded_on"] = today

    # save metadata for the record.
    out_metaname = target_dir / f"{record_id}.json"
    with open(out_metaname, "w") as f:
        json.dump(data, f)

    print("Saving files...")
    if max_files > 0:
        print(f"Downloading at most {max_files} files...")

    for idx, file_info in enumerate(files):
        if max_files > 0 and idx >= max_files:
            break
        filename = file_info["key"]
        file_download_url = file_info["links"]["self"]
        file_size = file_info["size"]
        outname = target_dir / filename

        if outname.exists():
            # Reuse the local copy only if its checksum matches the record's.
            current_md5 = compute_md5(outname)
            if current_md5 == file_info["checksum"]:
                print(f"File {filename} already exists. Skipping.")
                continue
            print(f"File {filename} already exists, but the checksum is different. Re-downloading.")
            print(f"{current_md5}, {file_info['checksum']}")

        with tqdm(total=file_size, unit='B', unit_scale=True, unit_divisor=1024, desc=filename) as pbar:
            _download_file(file_download_url, outname, pbar)

In [40]:
# Try the downloader on record 3164691, writing into the current directory;
# max_files=1 restricts the run to the first entry of the record's file list.
download_zenodo_record(3164691, save_dir=".", do_not_save=False, max_files=1)

3164691 has 40 files.
Saving files...
Downloading at most 1 files...
File QG_jets_4.npz already exists. Skipping.


In [42]:
from dataclasses import dataclass

@dataclass
class ZenodoDataset:
    """
    One entry in a catalogue of datasets hosted on Zenodo.

    Fields are positional in the order: record_id, title, save_dir.
    """

    # Numeric Zenodo record identifier.
    record_id: int
    # Human-readable title of the dataset.
    title: str
    # Short directory name for the dataset -- presumably the `save_dir` handed
    # to the downloader; TODO confirm against the code that consumes it.
    save_dir: str


In [43]:
# Catalogue of high-energy-physics datasets on Zenodo: record id, title, and a
# short directory name for each. Keyword arguments keep the three fields easy
# to tell apart; no line-continuation backslashes are needed inside the brackets.
HEP_datasets_in_Zenodo = [
    ZenodoDataset(
        record_id=3164691,
        title="Pythia8 Quark and Gluon Jets for Energy Flow",
        save_dir="quark_gluon",
    ),
    ZenodoDataset(
        record_id=2603256,
        title="Top Quark Tagging Reference Dataset",
        save_dir="top_tagging",
    ),
    ZenodoDataset(
        record_id=6619768,
        title="JetClass: A Large-Scale Dataset for Deep Learning in Jet Physics",
        save_dir="jet_class",
    ),
    ZenodoDataset(
        record_id=10246934,
        title="Herwig dataset for HadML particle GAN training",
        save_dir="herwig",
    ),
    ZenodoDataset(
        record_id=8370883,
        title="MicroBooNE BNB Inclusive Overlay Sample (No Wire Info)",
        save_dir="microboone_nowire",
    ),
    ZenodoDataset(
        record_id=8137810,
        title="Long-lived particles (LLP) in a monitored-drift-tube like detector",
        save_dir="llp_in_mdt",
    ),
    ZenodoDataset(
        record_id=6975118,
        title="JetNet 30",
        save_dir="jetnet30",
    ),
    ZenodoDataset(
        record_id=6975117,
        title="JetNet 150",
        save_dir="jetnet150",
    ),
    ZenodoDataset(
        record_id=6047873,
        title="Particle-based Fast Jet Simulation at the LHC with Variational Autoencoders: generator-level and reconstruction-level jets dataset",
        save_dir="full_jet_dataset",
    ),
    ZenodoDataset(
        record_id=6812533,
        title="Large Radius Tracking Events in pp collisions with a Generic detector",
        save_dir="lrt_acts_generic",
    ),
    ZenodoDataset(
        record_id=3981290,
        title="Supervised jet clustering reference data",
        save_dir="supervised_jet_clustering",
    ),
]