In [1]:
pip install git+https://github.com/Open-Reaction-Database/ord-schema.git

Collecting git+https://github.com/Open-Reaction-Database/ord-schema.git
  Cloning https://github.com/Open-Reaction-Database/ord-schema.git to /private/var/folders/g5/gfzgn6710bl4fnkqby736jyh0000gn/T/pip-req-build-lyipf20n
  Running command git clone --filter=blob:none --quiet https://github.com/Open-Reaction-Database/ord-schema.git /private/var/folders/g5/gfzgn6710bl4fnkqby736jyh0000gn/T/pip-req-build-lyipf20n
  Resolved https://github.com/Open-Reaction-Database/ord-schema.git to commit b2b10d44a3a631165d376386d1c385336c64e733
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Note: you may need to restart the kernel to use updated packages.


In [2]:
from ord_schema.message_helpers import load_message, write_message
from ord_schema.proto import dataset_pb2, reaction_pb2
import pandas as pd
import csv

In [3]:
# Load the binary ORD file "test-full.pb.gz" as a dataset_pb2.Dataset message;
# the resulting `dataset` object is what the later cells read from.
dataset = load_message("test-full.pb.gz", dataset_pb2.Dataset)

In [4]:
# Save the loaded dataset again as "test-full.pbtxt", a human-readable text
# version of the same ORD file.
write_message(dataset, "test-full.pbtxt")

In [7]:
from ord_schema.proto import reaction_pb2  # adapte selon ton install
import csv

def get_identifier_value(identifiers, id_type_code):
    """Return the value of the first identifier whose ``type`` equals ``id_type_code``.

    Args:
        identifiers: Iterable of objects exposing ``type`` and ``value`` attributes.
        id_type_code: Type code to look for.

    Returns:
        The ``value`` of the first matching identifier, or an empty string when
        none of the identifiers has the requested type.
    """
    matching_values = (
        entry.value for entry in identifiers if entry.type == id_type_code
    )
    return next(matching_values, "")

# Identifier type code that get_smiles_from_components treats as a SMILES entry.
SMILES_TYPE_CODE = 2


def get_smiles_from_components(components):
    """Collect the SMILES identifier values of ``components`` into one string.

    Args:
        components: Iterable of component objects, or None. Components that have
            no ``identifiers`` attribute are skipped.

    Returns:
        The values of all identifiers whose ``type`` equals SMILES_TYPE_CODE,
        joined with ";" in iteration order. An empty string is returned when
        ``components`` is None or no identifier matches.
    """
    if components is None:
        return ""
    collected = [
        getattr(ident, "value", "")
        for component in components
        if hasattr(component, "identifiers")
        for ident in component.identifiers
        if getattr(ident, "type", None) == SMILES_TYPE_CODE
    ]
    # Joining an empty list already yields "", so no separate empty check is needed.
    return ";".join(collected)



# Export one CSV row per reaction of `dataset` to "test-full.csv".
with open("test-full.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    # Header row; every data row written below follows the same column order.
    writer.writerow([
        "reaction_id",
        "reaction_index_CUSTOM",
        "reaction_type",
        "inputs_metal_and_ligand_SMILES",
        "inputs_aryl_halide_SMILES",
        "inputs_amine_SMILES",
        "inputs_solvent_SMILES",
        "inputs_base_SMILES",
        "temperature_Celsius",
        "yield_percent"
    ])

    yield_type_code = 3  # measurement type code treated as YIELD

    for reaction in dataset.reactions:
        reaction_id = getattr(reaction, "reaction_id", "")

        # Reaction-level identifiers, looked up by numeric type code:
        # code 1 fills "reaction_index_CUSTOM", code 5 fills "reaction_type".
        reaction_index_custom = get_identifier_value(reaction.identifiers, 1)
        reaction_type = get_identifier_value(reaction.identifiers, 5)

        # Inputs: normalise each input key (strip + lower-case) so it can be
        # matched against the expected names. Inputs with any other name are
        # ignored, and expected inputs that are absent stay as "".
        expected_keys = {
            "metal and ligand": "",
            "aryl halide": "",
            "amine": "",
            "solvent": "",
            "base": ""
        }

        for key, val in reaction.inputs.items():
            norm_key = key.strip().lower()
            if norm_key in expected_keys:
                expected_keys[norm_key] = get_smiles_from_components(val.components)

        # Temperature: the truthiness of a protobuf sub-message does not say
        # whether the field was set, so presence is tested with HasField().
        # Without this, a reaction that has no setpoint would be exported with
        # the default value 0.0 instead of an empty cell.
        # NOTE(review): the setpoint units are not checked although the column
        # is labelled Celsius — confirm the dataset only stores Celsius values.
        temp_c = ""
        if (reaction.HasField("conditions")
                and reaction.conditions.HasField("temperature")
                and reaction.conditions.temperature.HasField("setpoint")):
            temp_c = reaction.conditions.temperature.setpoint.value

        # Yield: taken from the yield measurements of desired products. If
        # several measurements qualify, the last one encountered is kept.
        yield_percent = ""
        for outcome in reaction.outcomes:
            for product in outcome.products:
                if not getattr(product, "is_desired_product", False):
                    continue
                for meas in product.measurements:
                    # Skip yield measurements that carry no percentage, rather
                    # than recording the default 0.0 for them.
                    if meas.type == yield_type_code and meas.HasField("percentage"):
                        # The stored value is written as-is (no rescaling),
                        # only rounded to two decimals.
                        # NOTE(review): presumably already in percent — confirm.
                        yield_percent = round(meas.percentage.value, 2)

        writer.writerow([
            reaction_id,
            reaction_index_custom,
            reaction_type,
            expected_keys["metal and ligand"],
            expected_keys["aryl halide"],
            expected_keys["amine"],
            expected_keys["solvent"],
            expected_keys["base"],
            temp_c,
            yield_percent
        ])


In [8]:
# Read the exported CSV back into a DataFrame to check what was written.
df = pd.read_csv("test-full.csv")
print(df.head())  # show the first 5 rows


                            reaction_id  reaction_index_CUSTOM reaction_type  \
0  ord-b55ea2ff1e2541c1b7de514f4de20a52                    NaN      Nano C-N   
1  ord-052b833de20b411eb46a675ca214d41a                    NaN      Nano C-N   
2  ord-d80c9edbb6694b28a867ea2e66efdf7b                    NaN      Nano C-N   
3  ord-c37e351d57ce449e89e22c421b5ae1d1                    NaN      Nano C-N   
4  ord-a0c79d7eb6b54659bfb1bdee6c9172a3                    NaN      Nano C-N   

   inputs_metal_and_ligand_SMILES  \
0                             NaN   
1                             NaN   
2                             NaN   
3                             NaN   
4                             NaN   

                           inputs_aryl_halide_SMILES  inputs_amine_SMILES  \
0  c1cc2c(cc1Cl)[C@@](OC(=O)N2)(C#CC3CC3)C(F)(F)F...                  NaN   
1  COC(=O)CC1CCc2cc(cc3c2n1c(=O)c(=O)[nH]3)Br;CS(...                  NaN   
2  c1cc2c(cc1Cl)[C@@](OC(=O)N2)(C#CC3CC3)C(F)(F)F...             