# Load extra data from PubChem to enhance the performance of the ML models

In [12]:
import json
import subprocess
import sys

def ensure_package(import_name: str, install_name: str | None = None) -> None:
    """Install a pip package if missing (best-effort)."""
    install_name = install_name or import_name
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'list', '--format=json'],
        check=True,
        capture_output=True,
        text=True,
    )
    installed = {pkg['name'].lower() for pkg in json.loads(result.stdout)}
    # Some packages have different import vs distribution names (e.g., rdkit-pypi -> rdkit)
    if import_name.lower() in installed or (install_name and install_name.lower() in installed):
        print(f'{import_name} already installed.')
        return
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', install_name])
    print(f'Installed {install_name} (import as {import_name}).')

# Make sure every dependency of this notebook is available.
# Each entry is (import name, pip distribution name or None for "same").
for import_name, install_name in (
    ('numpy', None),
    ('pandas', None),
    ('matplotlib', None),
    ('joblib', None),
    # rdkit is requested under the distribution name 'rdkit'; some
    # environments ship it as rdkit-pypi instead (import name is rdkit)
    ('rdkit', 'rdkit'),
):
    ensure_package(import_name, install_name)

numpy already installed.
pandas already installed.
matplotlib already installed.
joblib already installed.
rdkit already installed.


In [13]:
from __future__ import annotations

from pathlib import Path
from typing import Any, Callable, Dict, Iterable, Optional, Tuple

import multiprocessing as mp
import numpy as np
import pandas as pd

from joblib import Parallel, delayed

from rdkit import Chem, rdBase, RDLogger
from rdkit.Chem import AllChem, Crippen, Descriptors, Fragments, Lipinski, rdMolDescriptors, rdFingerprintGenerator
from rdkit.Chem.MACCSkeys import GenMACCSKeys
from rdkit.Chem.EState import AtomTypes as EAtomTypes

# MurckoScaffold is treated as optional: if the import fails for any reason
# the name is bound to None so the rest of the cell still runs.
# NOTE(review): `except Exception` also hides non-import failures — confirm
# that this breadth is intended.
try:
    from rdkit.Chem.Scaffolds import MurckoScaffold
except Exception:
    MurckoScaffold = None

# Disable RDKit's 'rdApp.*' log channels through both logging entry points.
RDLogger.DisableLog('rdApp.*')
rdBase.DisableLog('rdApp.*')

# Input locations, relative to the notebook's working directory.
DATA_DIR = Path('../../main-data')
# NOTE: despite the _DIR suffix this is the path of a CSV file, not a
# directory (it is passed to pd.read_csv in the next cell).
PUBCHEM_DIR = DATA_DIR / 'melting_point_results.csv'
TRAIN_PATH = DATA_DIR / 'train.csv'
TEST_PATH = DATA_DIR / 'test.csv'  # optional
# Output directory; created (including parents) as soon as this cell runs.
OUTPUT_DIR = Path('result/data')
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Featurization settings.
# NOTE(review): the code consuming these is not visible in this part of the
# notebook; presumably they configure Morgan/MACCS fingerprints and 3D
# descriptors — confirm against the featurization cells.
MORGAN_BITS = 512
MORGAN_RADIUS = 2
USE_MACCS = True
COMPUTE_3D = True
MAX_ITERS_3D = 0  # 0 = no optimization, >0 enables a short UFF optimize

In [14]:
import re

# Load the raw PubChem export
pubchem_data = pd.read_csv(PUBCHEM_DIR)

# The melting point column is the first one whose name contains "melt"
# (case-insensitive); mp_col stays None when no column qualifies.
mp_col = next(
    (col for col in pubchem_data.columns if 'melt' in col.lower()),
    None,
)

print(f"Melting point column: {mp_col}")

# Regex patterns
NUMBER_PATTERN = re.compile(r"[-+]?\d+(?:\.\d+)?")
RANGE_PATTERN = re.compile(r"([-+]?\d+(?:\.\d+)?)\s*[-–—]\s*([-+]?\d+(?:\.\d+)?)")
UNIT_PATTERN = re.compile(r"°\s*[CF]|º\s*[CF]|\bdeg(?:rees)?\s*(?:C|F)\b|Fahrenheit|Celsius", re.IGNORECASE)
# A value (single number, "low-high" or "low to high" range) that is directly
# followed by its own unit. The lookbehind stops a match from starting in the
# middle of a longer number.
VALUE_WITH_UNIT_PATTERN = re.compile(
    r"(?<![\d.])(?P<low>[-+]?\d+(?:\.\d+)?)"
    r"(?:\s*(?:[-–—]|\bto\b)\s*(?P<high>[-+]?\d+(?:\.\d+)?))?"
    r"\s*(?:(?:[°º]\s*|deg(?:rees)?\s*)?(?P<word>Celsius|Fahrenheit)"
    r"|[°º]\s*(?P<symbol>[CF])"
    r"|deg(?:rees)?\s*(?P<abbrev>[CF])\b)",
    re.IGNORECASE,
)
_CELSIUS_PATTERN = re.compile(r"°\s*C|º\s*C|\bdeg(?:rees)?\s*C\b|Celsius", re.IGNORECASE)
_FAHRENHEIT_PATTERN = re.compile(r"°\s*F|º\s*F|\bdeg(?:rees)?\s*F\b|Fahrenheit", re.IGNORECASE)


def celsius_to_fahrenheit(f):
    """Convert a temperature from Celsius to Fahrenheit, rounded to 0.1.

    Args:
        f: Temperature in degrees Celsius (the parameter name is kept for
            backward compatibility with existing callers).

    Returns:
        The temperature in degrees Fahrenheit, rounded to one decimal place.
    """
    return round((f * 9.0 / 5.0) + 32, 1)


# Backward-compatible alias for the original (misspelled) function name.
celsiusToFrahance = celsius_to_fahrenheit


def detect_unit(text):
    """Return the first temperature unit mentioned anywhere in *text*.

    Celsius is checked before Fahrenheit, so a text that mentions both yields
    'C'.

    Returns:
        'C', 'F', or 'noUnit' when no unit marker is found.
    """
    if _CELSIUS_PATTERN.search(text):
        return 'C'
    if _FAHRENHEIT_PATTERN.search(text):
        return 'F'
    return 'noUnit'


def extract_numeric_mp(text):
    """Extract a numeric melting point in degrees Fahrenheit from free text.

    Ranges are reduced to their midpoint and Celsius values are converted to
    Fahrenheit; Fahrenheit values are returned as parsed.
    NOTE(review): the output unit is Fahrenheit — confirm it matches the unit
    of the target column this data is merged with.

    Args:
        text: Raw melting point entry, e.g. "114 °C (237 °F)" or
            "100-105 °C". Missing values (None/NaN) are accepted.

    Returns:
        The melting point as a float, or None when the entry is missing,
        carries no recognisable unit, or contains no number.
    """
    if pd.isna(text):
        return None

    # A Unicode minus sign (U+2212) would otherwise be ignored and flip the
    # sign of sub-zero values.
    text = str(text).strip().replace("\u2212", "-")

    # Preferred: the first value that is directly followed by its own unit,
    # so "237 °F (114 °C)" is read as 237 °F rather than 237 °C.
    tagged = VALUE_WITH_UNIT_PATTERN.search(text)
    if tagged:
        low = float(tagged.group('low'))
        high = tagged.group('high')
        value = low if high is None else (low + float(high)) / 2.0
        unit_text = tagged.group('word') or tagged.group('symbol') or tagged.group('abbrev')
        # 'Celsius'/'C' -> 'C', 'Fahrenheit'/'F' -> 'F'
        unit = unit_text[0].upper()
        return celsius_to_fahrenheit(value) if unit == 'C' else value

    # Fallback (legacy heuristic): the unit appears somewhere in the text but
    # not right after a number, e.g. "Melting point (°C): 114".
    unit = detect_unit(text)
    if unit == 'noUnit':
        return None

    # Check for range (e.g., "100-105" or "100 - 105")
    range_match = RANGE_PATTERN.search(text)
    if range_match:
        a, b = float(range_match.group(1)), float(range_match.group(2))
        mid = (a + b) / 2.0
        return celsius_to_fahrenheit(mid) if unit == 'C' else mid

    # Extract first number found
    num_match = NUMBER_PATTERN.search(text)
    if num_match:
        value = float(num_match.group(0))
        return celsius_to_fahrenheit(value) if unit == 'C' else value

    return None

# Parse every raw melting point entry into a numeric 'Tm' column
pubchem_data['Tm'] = pubchem_data[mp_col].apply(extract_numeric_mp)

# Report how many rows could be parsed, then preview raw vs. parsed values
parsed_count = pubchem_data['Tm'].notna().sum()
total_rows = len(pubchem_data)
print(f"Extracted {parsed_count} numeric values from {total_rows} rows")
pubchem_data[[mp_col, 'Tm']].head(10)

Melting point column: Melting_Point
Extracted 10563 numeric values from 11170 rows


Unnamed: 0,Melting_Point,Tm
0,34.5 °F,34.5
1,-54 °C,-65.2
2,-31.5 °F,-31.5
3,63 °F,63.0
4,204 °C,399.2
5,3 °F,3.0
6,-89.5 °F,-89.5
7,158 °C,316.4
8,31.5 °C,88.7
9,114 °C (237 °F),237.2


In [15]:
# Keep only the rows for which a melting point could be parsed
pubchem_data = pubchem_data.dropna(subset=['Tm']).reset_index(drop=True)


# Persist the cleaned table next to the other generated data
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
cleaned_path = OUTPUT_DIR / 'melting_point_pubchem.csv'
pubchem_data.to_csv(cleaned_path, index=False)
