# Collecting data from ThermoML xml files

In [None]:
import os
from ThermoML import ThermoMLBuilder
import polars as pl
import numpy as np
from importlib import reload
import glob
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt

Command used to generate `thermoml_schema.py` from the ThermoML XSD schema:

In [None]:
# pyxbgen -u ThermoML/ThermoML.xsd -m ThermoML/thermoml_schema

In [None]:
def RetrieveFiles():
    """Return the absolute paths of the ThermoML XML files found on disk.

    Searches for files matching ``ThermoML/*/*.xml`` relative to the current
    working directory (the reference files taken from
    https://trc.nist.gov/ThermoML/Browse are expected to live there) and
    prefixes each match with the working directory.

    Parameters
    ----------
    None

    Outputs
    ----------
    files: list(str)
        sorted list of paths, one per reference file; empty when nothing
        matches the pattern
    """
    base_dir = os.getcwd()
    # glob returns matches in arbitrary, filesystem-dependent order; sorting
    # keeps the result (and anything indexed from it) reproducible.
    matches = sorted(glob.glob('ThermoML/*/*.xml'))
    return [os.path.join(base_dir, match) for match in matches]


## Raw data

Collecting data from all files

In [None]:
# Collect the ThermoML XML file paths, then evaluate the first one as a quick
# check (raises IndexError if no file was found).
files = RetrieveFiles() 
files[0]

In [None]:
# Build the dataset from the collected file paths.
# NOTE(review): 'data' and 'raw' look like output-location arguments — confirm
# against ThermoMLBuilder.build_dataset.
data = ThermoMLBuilder.build_dataset(files,'data','raw')

In [None]:
# Load the raw 'dataden' parquet file with polars.
df = pl.read_parquet('../ePC-SAFT/data/thermoml/raw/dataden.parquet')

In [None]:
def mw(inchi):
    """Return the exact molecular weight of the molecule described by an InChI.

    The InChI is parsed with RDKit (explicit hydrogens kept), hydrogens are
    added, and ``CalcExactMolWt`` is evaluated on the resulting molecule.

    NOTE(review): ``CalcExactMolWt`` is RDKit's *exact* molecular weight, which
    differs from the average molecular weight — confirm this is the quantity
    wanted downstream.

    Parameters
    ----------
    inchi: str
        InChI identifier of the molecule

    Outputs
    ----------
    mol_weight: float
        exact molecular weight, or ``0.0`` when ``inchi`` is empty/``None``,
        cannot be parsed, or RDKit fails while processing it
    """
    # Sentinel for "could not compute"; a float so that every call returns
    # the same type as a successful computation.
    failed = 0.0

    if not inchi:
        return failed

    try:
        mol = Chem.MolFromInchi(inchi, removeHs=False)
        if mol is None:
            # RDKit reports an unparsable InChI by returning None.
            return failed
        return CalcExactMolWt(Chem.AddHs(mol))
    except Exception:
        # Deliberately best-effort: any RDKit failure maps to the sentinel,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        return failed

In [None]:
# Apply mw() to every entry of the 'inchi1' column and name the result 'mw1'.
# NOTE(review): newer polars releases rename Series.apply to map_elements —
# confirm against the installed version.
mw1 = df['inchi1'].apply(mw).rename('mw1')

In [None]:
# Attach the 'mw1' Series to the DataFrame as a column.
df = df.with_columns(mw1)

In [None]:
# Apply mw() to every entry of the 'inchi2' column and name the result 'mw2'.
# NOTE(review): newer polars releases rename Series.apply to map_elements —
# confirm against the installed version.
mw2 = df['inchi2'].apply(mw).rename('mw2')

In [None]:
# Attach the 'mw2' Series to the DataFrame as a column.
df = df.with_columns(mw2)

In [None]:
# Persist the DataFrame to the same parquet path it was read from, overwriting
# that file.
df.write_parquet('../ePC-SAFT/data/thermoml/raw/dataden.parquet')