# Collecting data from ThermoML XML files

In [1]:
import os
from ThermoML import ThermoMLBuilder
import polars as pl
import numpy as np
from importlib import reload
import glob
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt

Command used to generate `thermoml_schema.py` from `ThermoML/ThermoML.xsd` with `pyxbgen`:

In [3]:
# pyxbgen -u ThermoML/ThermoML.xsd -m ThermoML/thermoml_schema

In [4]:
def RetrieveFiles(pattern='ThermoML/*/*.xml'):
    """Outputs a sorted list of absolute paths of the ThermoML XML files found below the current working directory
    (files taken from https://trc.nist.gov/ThermoML/Browse)

    Parameters
    ----------
    pattern: str
        glob pattern, relative to the current working directory, selecting the
        files to collect. The default matches one sub-directory level below
        `ThermoML/`

    Outputs
    ----------
    files: list(str)
        list of paths for each matching file, sorted so the order is the same
        on every run (glob itself returns matches in arbitrary order). Empty
        when nothing matches
    """

    # Resolve against the working directory once; `base_dir` also avoids
    # shadowing the builtin `dir`.
    base_dir = os.getcwd()
    files = sorted(os.path.join(base_dir, path)
                   for path in glob.glob(pattern))
    return files


## Raw data

Collecting data from all files

In [5]:
# Collect the path of every ThermoML XML file below the working directory.
files = RetrieveFiles() 
# Show the first path as a quick check that files were found.
files[0]

'/home/wildsonbbl/documents/code/themoml/ThermoML/10.1016/j.fluid.2019.06.015.xml'

In [9]:
# Build the dataset from every collected XML file.
# NOTE(review): 'data' and 'raw' are presumably output location components --
# confirm against ThermoMLBuilder.build_dataset.
# Long-running: the recorded run over 11,923 files took about 3 h 22 min.
data = ThermoMLBuilder.build_dataset(files,'data','raw')

files: 100%|██████████| 11923/11923 [3:21:53<00:00,  1.02s/it]   


In [2]:
# Load the raw dataset from parquet.
# NOTE(review): this path points into a sibling project ('../ePC-SAFT/...'),
# not the 'data'/'raw' arguments passed to build_dataset -- confirm this file
# is the output of that step.
df = pl.read_parquet('../ePC-SAFT/data/thermoml/raw/dataden.parquet')

In [9]:
def mw(inchi):
    """Outputs the molecular weight computed by RDKit for an InChI string

    Parameters
    ----------
    inchi: str
        InChI identifier of the molecule

    Outputs
    ----------
    mol_weight: float
        result of `CalcExactMolWt` on the molecule with explicit hydrogens
        added, or 0.0 when the InChI is missing, empty or cannot be processed
        by RDKit
    """
    # NOTE(review): CalcExactMolWt is RDKit's "exact" (isotope-specific) mass,
    # which differs slightly from the average molecular weight -- confirm this
    # is the quantity wanted downstream.

    # Missing or empty identifiers cannot be parsed; skip RDKit entirely.
    if not inchi:
        return 0.0

    try:
        mol = Chem.MolFromInchi(inchi, removeHs=False)
        # RDKit signals a parse failure by returning None, not by raising.
        if mol is None:
            return 0.0
        mol_weight = CalcExactMolWt(Chem.AddHs(mol))
    except Exception:
        # Best-effort: any RDKit failure maps to the 0.0 sentinel, but
        # KeyboardInterrupt/SystemExit still propagate so a long run can be
        # interrupted.
        mol_weight = 0.0

    return mol_weight

In [12]:
# Molecular weight of component 1, one value per row of `inchi1`.
# The dtype is pinned to Float64: without `return_dtype` polars infers it from
# the first value returned, so an integer fallback on the first row would give
# the whole column an integer dtype.
mw1 = df['inchi1'].apply(mw, return_dtype=pl.Float64).rename('mw1')








[17:30:09] Explicit valence for atom # 29 S, 8, is greater than permitted
[17:30:09] ERROR: Explicit valence for atom # 29 S, 8, is greater than permitted

[17:30:09] Explicit valence for atom # 29 S, 8, is greater than permitted
[17:30:09] ERROR: Explicit valence for atom # 29 S, 8, is greater than permitted

[17:30:09] Explicit valence for atom # 29 S, 8, is greater than permitted
[17:30:09] ERROR: Explicit valence for atom # 29 S, 8, is greater than permitted

[17:30:09] Explicit valence for atom # 29 S, 8, is greater than permitted
[17:30:09] ERROR: Explicit valence for atom # 29 S, 8, is greater than permitted
















































































































[17:31:35] Explicit valence for atom # 23 S, 8, is greater than permitted
[17:31:35] ERROR: Explicit valence for atom # 23 S, 8, is greater than permitted

[17:31:35] Explicit valence for atom # 23 S, 8, is greater than permitted
[17:31:35] ERROR: Explicit v

In [28]:
# Attach the component-1 molecular weights as column 'mw1' (the series' name).
df = df.with_columns(mw1)

In [29]:
# Molecular weight of component 2, one value per row of `inchi2`.
# The dtype is pinned to Float64: without `return_dtype` polars infers it from
# the first value returned, so an integer fallback on the first row would give
# the whole column an integer dtype.
mw2 = df['inchi2'].apply(mw, return_dtype=pl.Float64).rename('mw2')

[17:46:07] Explicit valence for atom # 29 S, 8, is greater than permitted
[17:46:07] ERROR: Explicit valence for atom # 29 S, 8, is greater than permitted

[17:46:07] Explicit valence for atom # 29 S, 8, is greater than permitted
[17:46:07] ERROR: Explicit valence for atom # 29 S, 8, is greater than permitted

[17:46:07] Explicit valence for atom # 29 S, 8, is greater than permitted
[17:46:07] ERROR: Explicit valence for atom # 29 S, 8, is greater than permitted

[17:46:07] Explicit valence for atom # 29 S, 8, is greater than permitted
[17:46:07] ERROR: Explicit valence for atom # 29 S, 8, is greater than permitted

[17:47:28] Explicit valence for atom # 23 S, 8, is greater than permitted
[17:47:28] ERROR: Explicit valence for atom # 23 S, 8, is greater than permitted

[17:47:28] Explicit valence for atom # 23 S, 8, is greater than permitted
[17:47:28] ERROR: Explicit valence for atom # 23 S, 8, is greater than permitted

[17:47:28] Explicit valence for atom # 23 S, 8, is greater than 

In [31]:
# Attach the component-2 molecular weights as column 'mw2' (the series' name).
df = df.with_columns(mw2)

In [55]:
# Save the frame to a new file ('dataden2.parquet') so the 'dataden.parquet'
# input read earlier is not overwritten.
df.write_parquet('../ePC-SAFT/data/thermoml/raw/dataden2.parquet')