In [None]:
import pandas as pd
import os
import pubchempy as pcp
import requests
import re
import requests
import pandas as pd
import time
import requests
from bs4 import BeautifulSoup


# S1.1.4 Gathering Target Compound Properties

1. [Pulling Used CAS Numbers](#pulling-list-of-cas-numbers-with-corresponding-data)
2. [MW, BP, MP from PubChemPy](#helper-function-fetching-mw-bp-mp-from-pubchempy)
3. [Antoine + HVap from NIST](#helper-function-fetching-antoine-parameters-and-hvap-from-nist)
4. [Pulling all Parameters and Properties](#pulling-all-parameters-and-properties)
5. [Merging + Exporting](#merging--exporting)

#### Pulling List of CAS Numbers with Corresponding Data

In [14]:
def list_files_in_folder(folder_path):
    # List all files in the specified folder
    file_names = [f for f in os.listdir(folder_path) if os.path.isfile(os.path.join(folder_path, f))]
    return file_names

In [15]:
folder_path = r'../parsed_dataframes'
df = pd.DataFrame({'CAS' : list_files_in_folder(folder_path)})
df['CAS'] = df['CAS'].str.replace('.xlsx', '')
df.head()

Unnamed: 0,CAS
0,100-21-0
1,100-41-4
2,100-42-5
3,101-68-8
4,102-71-6


#### Helper Function: Fetching MW, BP, MP from PubChemPy

In [16]:
def fetch_properties(cas):
    try:
        compounds = pcp.get_compounds(cas, 'name')
        if not compounds:
            raise ValueError("No compound found")

        cid = compounds[0].cid
        mw = compounds[0].molecular_weight

        url = f"https://pubchem.ncbi.nlm.nih.gov/rest/pug_view/data/compound/{cid}/JSON/"
        r = requests.get(url)
        data = r.json()

        root_sections = data.get("Record", {}).get("Section", [])
        melting_point = extract_property(root_sections, "Melting Point")
        boiling_point = extract_property(root_sections, "Boiling Point")

        return {
            'CAS': cas,
            'MolecularWeight': mw,
            'BoilingPoint': boiling_point,
            'MeltingPoint': melting_point
        }

    except Exception as e:
        print(f"Error with CAS {cas}: {e}")
        return {
            'CAS': cas,
            'MolecularWeight': None,
            'BoilingPoint': None,
            'MeltingPoint': None
        }
    
def extract_property(data, target):
    """Recursively search for melting/boiling point in the JSON."""
    if isinstance(data, list):
        for item in data:
            result = extract_property(item, target)
            if result:
                return result
    elif isinstance(data, dict):
        # Check if this node has the property of interest
        heading = data.get("TOCHeading", "").lower()
        if target.lower() in heading:
            for info in data.get("Information", []):
                value = info.get("Value", {}).get("StringWithMarkup", [{}])[0].get("String")
                if value:
                    return value
        # Recursively check subsections
        if "Section" in data:
            return extract_property(data["Section"], target)
    return None

#### Helper Function: Fetching Antoine Parameters and Hvap from NIST


In [None]:
def parse_temperature(temp_str):
    if not isinstance(temp_str, str):
        return None

    temp_str = temp_str.strip()

    # Handle sublimes, "less than", etc.
    if any(keyword in temp_str.lower() for keyword in ["sublime", "less than", "decomposes", "no data"]):
        return None

    # Regex patterns
    f_pattern = r"(-?\d+(?:\.\d+)?)\s*°F"
    c_pattern = r"(-?\d+(?:\.\d+)?)\s*°C"
    f_range_pattern = r"(-?\d+(?:\.\d+)?)\s*(?:to|-)\s*(-?\d+(?:\.\d+)?)\s*°F"
    c_range_pattern = r"(-?\d+(?:\.\d+)?)\s*(?:to|-)\s*(-?\d+(?:\.\d+)?)\s*°C"

    # Fahrenheit range
    match = re.search(f_range_pattern, temp_str)
    if match:
        f1, f2 = float(match.group(1)), float(match.group(2))
        return round(( (f1 + f2) / 2 - 32) * 5/9, 2)

    # Celsius range
    match = re.search(c_range_pattern, temp_str)
    if match:
        c1, c2 = float(match.group(1)), float(match.group(2))
        return round((c1 + c2) / 2, 2)

    # Single °F
    match = re.search(f_pattern, temp_str)
    if match:
        f = float(match.group(1))
        return round((f - 32) * 5/9, 2)

    # Single °C
    match = re.search(c_pattern, temp_str)
    if match:
        return round(float(match.group(1)), 2)

    # If all fails
    return None



def get_nist_data(cas_number):


    base_url = f"https://webbook.nist.gov/cgi/cbook.cgi?ID={cas_number}&Units=SI"
    headers = {'User-Agent': 'Mozilla/5.0'}
    r = requests.get(base_url, headers=headers)
    if r.status_code != 200:
        return None

    soup = BeautifulSoup(r.text, 'html.parser')
    result = {'CAS': cas_number, 'A': None, 'B': None, 'C': None, 'Hvap_kJmol': None}

    try:
        vp_link_tag = soup.find('a', string=re.compile("Phase change", re.I))
        if vp_link_tag:
            vp_url = "https://webbook.nist.gov" + vp_link_tag.get('href')
            vp_page = requests.get(vp_url, headers=headers)
            vp_soup = BeautifulSoup(vp_page.text, 'html.parser')

            # Find the correct Antoine table using aria-label
            table = vp_soup.find('table', {'class': 'data', 'aria-label': 'Antoine Equation Parameters'})
            if table:
                rows = table.find_all('tr')
                if len(rows) >= 2:
                    data_cells = rows[1].find_all('td')
                    result['A'] = float(data_cells[1].text.strip())
                    result['B'] = float(data_cells[2].text.strip())
                    result['C'] = float(data_cells[3].text.strip())
    except Exception as e:
        print(f"[{cas_number}] Antoine error: {e}")

    # ------ Hvap ------
    try:
        phase_link_tag = soup.find('a', string=re.compile("Phase change", re.I))
        if phase_link_tag:
            props_url = "https://webbook.nist.gov" + phase_link_tag.get('href')
            props_page = requests.get(props_url, headers=headers)
            props_soup = BeautifulSoup(props_page.text, 'html.parser')

            table = props_soup.find('table')
            if table:
                for row in table.find_all('tr'):
                    cols = [td.text.strip() for td in row.find_all('td')]
                    if len(cols) >= 2 and "vap" in cols[0].lower():
                        try:
                            hvap = float(cols[1].split()[0])
                            result['Hvap_kJmol'] = hvap
                            break
                        except:
                            continue
    except Exception as e:
        print(f"[{cas_number}] Hvap error: {e}")

    return result

#### Pulling all Parameters and Properties

In [21]:
unique_cas = df['CAS'].dropna().unique()
property_data = [fetch_properties(cas) for cas in unique_cas]
properties_df = pd.DataFrame(property_data)


properties_df["MeltingPoint"] = properties_df["MeltingPoint"].apply(parse_temperature)
properties_df["BoilingPoint"] = properties_df["BoilingPoint"].apply(parse_temperature)
properties_df["MolecularWeight"] = pd.to_numeric(properties_df["MolecularWeight"], errors="coerce")

results = []
for cas in properties_df['CAS'].dropna().unique():
    data = get_nist_data(cas)
    if data:
        results.append(data)
    time.sleep(1.5)

params_df = pd.DataFrame(results)

Error with CAS 1330-20-7: No compound found
Error with CAS 25376-45-8: No compound found
[108-94-1] Antoine error: could not convert string to float: '4.1033 ± 0.00099'


#### Merging + Exporting

In [22]:
df = df.merge(properties_df, on='CAS', how='left')
df = df.merge(params_df, on='CAS', how='left')
df.head()

Unnamed: 0,CAS,MolecularWeight,BoilingPoint,MeltingPoint,A,B,C,Hvap_kJmol
0,100-21-0,166.13,,,,,,
1,100-41-4,106.16,136.22,-95.0,4.40536,1695.026,-23.698,41.0
2,100-42-5,104.15,145.56,-30.83,4.0593,1459.909,-59.551,43.93
3,101-68-8,250.25,196.39,37.22,2.41991,969.926,-253.28,
4,102-71-6,149.19,,21.61,7.19251,4543.902,24.749,105.9


In [24]:
df.to_excel("target_compound_properties.xlsx", index=False)