# Collecting data from ThermoML XML files

In [None]:
import os
from ThermoML import ThermoMLBuilder
import polars as pl
import numpy as np
from importlib import reload

In [None]:
def RetrieveFiles(ref:str) -> list:
    """Return the XML file paths for the reference DOIs listed in ``ref``.

    The DOI lists are taken from https://trc.nist.gov/ThermoML/Browse.

    Parameters
    ----------
    ref: str
        path of the text file holding one reference DOI per line

    Outputs
    ----------
    files: list(str)
        one path per DOI, of the form ``<cwd>/ThermoML/<DOI>.xml``;
        blank lines in ``ref`` are skipped
    """
    # Paths are anchored at the directory the notebook is run from.
    base_dir = os.path.join(os.getcwd(), 'ThermoML')
    with open(ref, 'r', encoding='utf-8') as f:
        # Strip the newline and surrounding whitespace from every entry.
        dois = [line.strip() for line in f]
    # Drop empty entries (e.g. a trailing blank line), which would otherwise
    # become a bogus '<cwd>/ThermoML/.xml' path.
    return [os.path.join(base_dir, doi + '.xml') for doi in dois if doi]


## Activity coefficient

Collecting data from files with activity coefficient data

In [None]:
# Build the list of XML paths from the DOIs listed in ThermoML/binaryAC.txt.
files = RetrieveFiles('ThermoML/binaryAC.txt')
# Display the first path as a quick sanity check (raises IndexError if the list is empty).
files[0]

In [None]:
# NOTE(review): the arguments look like (input XML paths, output parquet file
# name, property key) — confirm against ThermoMLBuilder.build_dataset.
data = ThermoMLBuilder.build_dataset(files,'binaryAC.parquet','AC')

## Binary density

Collecting data from files with density data for binary systems

In [None]:
# Build the list of XML paths from the DOIs listed in ThermoML/binaryD.txt.
files = RetrieveFiles('ThermoML/binaryD.txt')
# Display the first path as a quick sanity check (raises IndexError if the list is empty).
files[0]

In [None]:
# NOTE(review): the arguments look like (input XML paths, output parquet file
# name, property key) — confirm against ThermoMLBuilder.build_dataset.
data = ThermoMLBuilder.build_dataset(files,'binaryD.parquet','density')

## Binary Vapor pressure

Collecting data from files with vapor pressure data for binary systems

In [None]:
# Build the list of XML paths from the DOIs listed in ThermoML/binaryVP.txt.
files = RetrieveFiles('ThermoML/binaryVP.txt')
# Display the first path as a quick sanity check (raises IndexError if the list is empty).
files[0]

In [None]:
# NOTE(review): the arguments look like (input XML paths, output parquet file
# name, property key) — confirm against ThermoMLBuilder.build_dataset.
data = ThermoMLBuilder.build_dataset(files,'binaryVP.parquet','VP')

## Pure Vapor pressure

Collecting data from files with vapor pressure data for pure components

In [37]:
# Build the list of XML paths from the DOIs listed in ThermoML/pureVP.txt.
files = RetrieveFiles('ThermoML/pureVP.txt')
# Display the first path as a quick sanity check (raises IndexError if the list is empty).
files[0]

'/home/wildsonbbl/documents/code/themoml/ThermoML/10.1021/je800300x.xml'

In [38]:
# NOTE(review): the arguments look like (input XML paths, output parquet file
# name, property key) — confirm against ThermoMLBuilder.build_dataset.
# The recorded run processed 1462 files in about 18 minutes.
data = ThermoMLBuilder.build_dataset(files,'pureVP.parquet','pVP')

files: 100%|██████████| 1462/1462 [18:12<00:00,  1.34it/s] 


## Pure Density

Collecting data from files with density data for pure components

In [43]:
# Build the list of XML paths from the DOIs listed in ThermoML/pureD.txt.
files = RetrieveFiles('ThermoML/pureD.txt')
# Display the first path as a quick sanity check (raises IndexError if the list is empty).
files[0]

'/home/wildsonbbl/documents/code/themoml/ThermoML/10.1021/acs.jced.6b00269.xml'

In [42]:
# NOTE(review): the arguments look like (input XML paths, output parquet file
# name, property key) — confirm against ThermoMLBuilder.build_dataset.
# The recorded run processed 3465 files in about 1 h 25 min.
data = ThermoMLBuilder.build_dataset(files,'pureD.parquet','pDensity')

files: 100%|██████████| 3465/3465 [1:24:34<00:00,  1.46s/it]  
