# Collecting data from ThermoML XML files

In [1]:
import os
from ThermoML import MyThermoMLBuilder
import polars as pl
import numpy as np
from importlib import reload

## Phase Equilibria Composition

Collecting data from files with phase equilibria composition data

In [None]:
# Resolve every entry listed in binaryPE.txt to '<cwd>/ThermoML/<entry>.xml'.
base_dir = os.getcwd()  # named base_dir so the builtin dir() is not shadowed
with open('binaryPE.txt', 'r') as f:
    # Iterate the handle directly (no readlines() copy) and skip blank lines,
    # which would otherwise produce a bogus '<cwd>/ThermoML/.xml' path.
    files = [os.path.join(base_dir, 'ThermoML', line.strip() + '.xml')
             for line in f if line.strip()]


In [None]:
# Display the first resolved path as a quick sanity check of the list above.
files[0]


In [None]:
# Build the phase-equilibria dataset, passing 'Mole fraction' as the requested
# property. build_dataset is reached through the imported MyThermoMLBuilder
# module: the bare name is never imported in this notebook, so calling it
# directly raises NameError on a fresh kernel.
# NOTE(review): the activity-coefficient section calls build_dataset(files)
# with a single argument — confirm the builder still accepts a property list.
properties = ['Mole fraction']  # avoids shadowing the builtin `property`
data, compound_dict = MyThermoMLBuilder.build_dataset(files, properties)


In [None]:
# Turn the textual missing-value markers 'None', 'nan' and 'NaN' into np.nan.
# The result is re-bound to `data` instead of mutating with inplace=True, so
# the cell reads as an explicit transformation and stays chainable.
# NOTE(review): this is the pandas DataFrame.replace call shape, while the
# activity-coefficient section below treats `data` as a polars frame —
# confirm which frame type build_dataset returns for this section.
data = data.replace(['None', 'nan', "NaN"], np.nan)


In [None]:
# Write `data` to 'datasetPE.parquet' with the pyarrow engine, without the index.
# NOTE(review): to_parquet(engine=..., index=...) looks like the pandas writer;
# the section below uses polars' write_parquet instead — confirm the frame type.
data.to_parquet('datasetPE.parquet', engine='pyarrow', index=False)


In [None]:
# Write the compound table to 'compounddictPE.json' using orient='index'.
# NOTE(review): assumes compound_dict exposes a pandas-style to_json — confirm.
compound_dict.to_json('compounddictPE.json', orient='index')


## Activity coefficient

Collecting data from files with activity coefficient data at infinite dilution

In [14]:
# Re-import MyThermoMLBuilder so edits made to the module on disk take effect
# without restarting the kernel (development aid).
reload(MyThermoMLBuilder)

<module 'ThermoML.MyThermoMLBuilder' from '/home/wildsonbbl/documents/code/themoml/ThermoML/MyThermoMLBuilder.py'>

In [13]:
# Resolve every entry listed in binaryAC.txt to '<cwd>/ThermoML/<entry>.xml'.
base_dir = os.getcwd()  # named base_dir so the builtin dir() is not shadowed
with open('binaryAC.txt', 'r') as f:
    # Iterate the handle directly (no readlines() copy) and skip blank lines,
    # which would otherwise produce a bogus '<cwd>/ThermoML/.xml' path.
    files = [os.path.join(base_dir, 'ThermoML', line.strip() + '.xml')
             for line in f if line.strip()]
# Display the first resolved path as a quick sanity check.
files[0]


'/home/wildsonbbl/documents/code/themoml/ThermoML/10.1016/j.fluid.2014.11.020.xml'

In [15]:
# Run the builder over the listed ThermoML files; it returns a pair that is
# unpacked into the measurement table and the compound table.
data, compounds = MyThermoMLBuilder.build_dataset(files)

In [17]:
# Summary statistics of the compound table (count, null_count, min, max, ...).
compounds.describe()

describe,CommonName,StandardInChI
str,str,str
"""count""","""568""","""568"""
"""null_count""","""0""","""0"""
"""mean""",,
"""std""",,
"""min""","""(1-methylethen...","""InChI=1S/2C8H1..."
"""max""","""water""","""InChI=1S/Sb"""
"""median""",,


In [18]:
# Summary statistics for every column of the dataset; null_count shows how
# sparsely each property column is populated.
data.describe()

describe,filename,nDATA,c1,c2,phase_1,phase_2,"Pressure, kPa","Temperature, K",Mole fraction c1 phase_2,Activity coefficient c1 phase_2,Mole fraction c1 phase_1,Activity coefficient c1 phase_1,Mole fraction c2 phase_1,Activity coefficient c2 phase_1,Mole fraction c2 phase_2,Activity coefficient c2 phase_2,c3,Mass fraction c3 phase_2,Solvent: Mole fraction c1 phase_2,Activity coefficient c3 phase_2,"Henry's Law constant (mole fraction scale), kPa c1 phase_2",Mole fraction c3 phase_1,Solvent: Mole fraction c2 phase_2,Mole fraction c3 phase_2,Solvent: Mass fraction c3 phase_1,Solvent: Mole fraction c1 phase_1,"Molar enthalpy of solution, kJ/mol","Excess molar enthalpy (molar enthalpy of mixing), kJ/mol",Mass fraction c2 phase_2,"Vapor or sublimation pressure, kPa","Henry's Law constant (mole fraction scale), kPa c2 phase_2","Partial molar enthalpy, kJ/mol c1 phase_1","Molality, mol/kg c2 phase_1",Fugacity coefficient c1 phase_1,"Mass density, kg/m3","Solid-liquid equilibrium temperature, K","Solvent: Molality, mol/kg c2 phase_1",Mass fraction c1 phase_1,Mass fraction c2 phase_1,"Liquid-liquid equilibrium temperature, K","Amount concentration (molarity), mol/dm3 c1 phase_1","Molality, mol/kg c1 phase_1","Solvent: Molality, mol/kg c1 phase_1",Osmotic coefficient,"Boiling temperature at pressure P, K"
str,str,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""50794""",50794.0,"""50794""","""50794""","""50794""","""50794""",50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,"""50794""",50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0,50794.0
"""null_count""","""0""",0.0,"""0""","""1105""","""0""","""33194""",16069.0,837.0,45293.0,46946.0,41795.0,42312.0,27750.0,28874.0,41580.0,43863.0,"""49669""",50766.0,50622.0,50711.0,49777.0,50227.0,50706.0,50732.0,50724.0,50778.0,50614.0,50363.0,50498.0,49700.0,50749.0,50789.0,50712.0,50752.0,50671.0,50732.0,50769.0,50789.0,50456.0,50790.0,50792.0,50572.0,50770.0,50742.0,50666.0
"""mean""",,40.767492,,,,,115.462434,334.293608,0.074572,122.994064,0.032893,226.363701,0.02989,35.930159,0.104644,45.003828,,0.0,0.507116,5.668759,1447.097405,0.220485,0.614545,0.111603,0.008036,0.65,6.291344,0.775511,0.287421,113.680996,2920.933333,-5.164,0.269482,46.232619,932.305366,285.713065,1.05732,0.0,0.032492,320.7,3.8,1.152346,1.832,1.585635,410.382812
"""std""",,66.872127,,,,,151.63004,28.81499,0.218123,6771.156172,0.153074,4382.763294,0.14438,212.669155,0.263141,237.293283,,0.0,0.280558,9.802903,2397.648642,0.287364,0.158129,0.315385,0.004171,0.236643,5.874693,0.590498,0.23837,222.045947,2426.999779,5.115304,0.472293,101.746683,185.764125,19.626419,0.874678,0.0,0.021586,17.078251,2.149605,0.966872,0.603467,0.553293,22.564386
"""min""","""/home/wildsonb...",1.0,"""(1-methylethen...","""(chloromethyl)...","""Air at 1 atmos...","""Crystal""",0.45,249.84,0.0,0.0,0.0,0.022,0.0,0.0156,0.0,0.0373,"""1-pentene""",0.0,0.098,0.146,0.376,0.0,0.5,0.0,0.00248,0.3,-9.34,-0.02678,0.0,7.2e-10,65.0,-11.18,0.0,1.73,683.79,264.04,0.134,0.0,0.0019,303.2,2.28,0.0019,1.125,0.51,373.0
"""max""","""/home/wildsonb...",594.0,"""water""","""water""","""Liquid mixture...","""Liquid mixture...",2239.0,1200.0,1.0,420000.0,1.0,149008.0,1.0,9781.0,1.0,9813.0,"""water""",0.0,0.95,43.0,16000.0,1.0,0.95,0.9911,0.011123,1.0,25.3,2.767,0.814,1361.3,9300.0,2.26,1.71,411.58,1260.0,362.5,2.571,0.0,0.085,343.2,5.32,3.0,2.571,2.522,472.0
"""median""",,23.0,,,,,101.0,333.15,0.0,2.37,0.0,2.12,0.0,2.76,0.0,3.21,,0.0,0.5,2.22,720.0,0.1119,0.5,0.0,0.011123,0.65,6.462,0.6367,0.206,47.115,2250.0,-5.38,0.0598,11.48,880.76,281.245,0.563,0.0,0.0323,318.2,3.8,0.899,1.8,1.558,407.5


In [19]:
# Persist the dataset to 'datasetACdev.parquet' in the working directory.
data.write_parquet('datasetACdev.parquet')


In [20]:
# Persist the compound table to 'compoundsACdev.parquet' in the working directory.
compounds.write_parquet('compoundsACdev.parquet')


In [40]:
# Reload the compound table from disk, replacing the in-memory `compounds`.
# NOTE(review): this reads 'compoundsAC.parquet', whereas the cell above writes
# 'compoundsACdev.parquet' — confirm which file is intended. The execution
# count of this cell is also out of sequence with its neighbours, so verify
# the notebook still works under Restart & Run All.
compounds = pl.read_parquet('compoundsAC.parquet')


In [21]:
# Look up the row(s) whose CommonName is exactly 'choline chloride'.
compounds.filter(pl.col('CommonName') == 'choline chloride')


CommonName,StandardInChI
str,str
"""choline chlori...","""InChI=1S/C5H14..."
