# Preprocessing: SMILES → Descriptor Features for Melting Point


This notebook computes descriptor features from SMILES for modeling melting point (`Tm`).


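Descriptors include: α (hydrogen-bond donor ability, computed here as the H-bond donor count), π (hydrophobic constant, computed here as the Crippen logP), MR (molar refractivity), B2 (Sterimol-style width, approximated from a 3D conformer when no Sterimol calculator is available), and I3 (meta-substitution indicator for six-membered aromatic rings).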

In [3]:
import json
import subprocess
import sys

def ensure_package(import_name: str, install_name: str | None = None) -> None:
    """Install a pip distribution unless ``pip list`` already reports it (best-effort).

    Args:
        import_name: Name the package is referred to by; it is also compared
            against the installed distribution names.
        install_name: Distribution name passed to ``pip install``. Defaults
            to ``import_name``.

    Raises:
        subprocess.CalledProcessError: If ``pip list`` or ``pip install``
            exits with a non-zero status.
    """
    import re  # local import: only needed for the name normalization below

    def _canonical(name: str) -> str:
        # PEP 503 normalization: runs of "-", "_" and "." are equivalent and
        # names are case-insensitive, so "scikit_learn" matches "scikit-learn".
        return re.sub(r"[-_.]+", "-", name).lower()

    install_name = install_name or import_name
    result = subprocess.run(
        [sys.executable, '-m', 'pip', 'list', '--format=json'],
        check=True,
        capture_output=True,
        text=True,
    )
    installed = {_canonical(pkg['name']) for pkg in json.loads(result.stdout)}
    # Import and distribution names can differ (e.g., rdkit-pypi -> rdkit),
    # so a match on either name counts as "already installed".
    if _canonical(import_name) in installed or _canonical(install_name) in installed:
        print(f'{import_name} already installed.')
        return
    subprocess.check_call([sys.executable, '-m', 'pip', 'install', install_name])
    print(f'Installed {install_name} (import as {import_name}).')

# Best-effort bootstrap of the baseline modeling dependencies. Every entry is
# an (import name, pip distribution name) pair; here both names coincide. A
# failed check is reported and the remaining packages are still processed.
for pkg in [
    (name, name)
    for name in ('scikit-learn', 'lightgbm', 'numpy', 'pandas', 'rdkit')
]:
    try:
        ensure_package(*pkg)
    except Exception as exc:
        print(f"Package install check failed for {pkg}: {exc}")


scikit-learn already installed.
lightgbm already installed.
numpy already installed.
pandas already installed.
rdkit already installed.


In [4]:
# Descriptor computation helpers


from typing import Dict, Optional


import numpy as np

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Crippen
from rdkit.Chem import rdMolDescriptors

def _embed_3d(mol: Chem.Mol, seed: int = 13) -> Chem.Mol:
    """Add hydrogens to ``mol`` and try to give it one 3D conformer.

    The molecule returned by ``Chem.AddHs`` is embedded with ETKDGv3 using a
    fixed random seed, then relaxed with up to 200 UFF iterations.

    Args:
        mol: Input molecule.
        seed: Random seed assigned to the ETKDGv3 embedding parameters.

    Returns:
        The hydrogen-added molecule. It carries no conformer when embedding
        failed, so callers must check ``GetNumConformers()`` (or be prepared
        for ``GetConformer()`` to raise).
    """
    mol = Chem.AddHs(mol)

    params = AllChem.ETKDGv3()
    params.randomSeed = seed

    # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
    conf_id = AllChem.EmbedMolecule(mol, params)
    if conf_id < 0:
        # No coordinates exist, so there is nothing to optimize. Returning
        # here avoids relying on the optimizer raising for a missing conformer.
        return mol

    try:
        AllChem.UFFOptimizeMolecule(mol, maxIters=200)
    except Exception:
        # Deliberate best-effort: if the force field cannot be applied, keep
        # the unoptimized embedded geometry rather than failing.
        pass

    return mol




def compute_b2(mol: Chem.Mol) -> Optional[float]:
    """Return a B2-like width for ``mol``, or ``None`` when none can be computed.

    Two strategies are tried in order:

    1. A ``Sterimol.B2`` descriptor looked up through the names
       ``descriptors`` and ``Calculator``. NOTE(review): neither name is
       defined in the visible part of this file (presumably they come from
       mordred -- confirm). While they are missing, the resulting error is
       swallowed and strategy 2 is used.
    2. An approximation: the extent (max - min) of the centered atom
       coordinates along the second principal axis of a 3D conformer. A
       conformer is embedded with ``_embed_3d`` when ``mol`` has none.
    """
    # Strategy 1: descriptor-calculator lookup; any failure falls through.
    try:
        b2_descriptor = descriptors.Sterimol.B2
        value = Calculator(b2_descriptor())(mol)[0]
        if value is not None:
            return float(value)
    except Exception:
        pass

    # Strategy 2: width along the second principal axis of the atom cloud.
    try:
        work = Chem.Mol(mol)
        if not work.GetNumConformers():
            work = _embed_3d(Chem.Mol(mol))
        conformer = work.GetConformer()

        points = [conformer.GetAtomPosition(atom.GetIdx()) for atom in work.GetAtoms()]
        centered = np.array([[p.x, p.y, p.z] for p in points], dtype=float)
        centered = centered - centered.mean(axis=0, keepdims=True)

        # Rows of `axes` are the principal axes (right singular vectors),
        # ordered by decreasing singular value.
        _, _, axes = np.linalg.svd(centered, full_matrices=False)
        along_second_axis = (centered @ axes.T)[:, 1]
        return float(along_second_axis.max() - along_second_axis.min())
    except Exception:
        # Covers embedding failures (no conformer) and molecules with too few
        # atoms to define a second axis.
        return None




def compute_I3_meta_indicator(mol: Chem.Mol) -> int:
    """Return 1 if any six-membered aromatic ring has a 1,3 (meta) substituent pair, else 0.

    A ring qualifies when all six of its atoms are aromatic; element types are
    not checked. A ring atom counts as substituted when it has at least one
    neighbor outside that ring, which includes atoms shared with a fused ring.
    Two substituted atoms are meta to each other when they sit two bonds apart
    along the ring.
    """
    for ring_atoms in mol.GetRingInfo().AtomRings():
        ring_size = len(ring_atoms)
        if ring_size != 6:
            continue
        if any(not mol.GetAtomWithIdx(idx).GetIsAromatic() for idx in ring_atoms):
            continue

        members = set(ring_atoms)
        # Ring positions (0..5, in the ring order RDKit reports) of atoms that
        # carry at least one neighbor outside this ring.
        positions = [
            pos
            for pos, idx in enumerate(ring_atoms)
            if any(
                nbr.GetIdx() not in members
                for nbr in mol.GetAtomWithIdx(idx).GetNeighbors()
            )
        ]

        # `positions` is ascending, so `gap` is positive; going the other way
        # around the ring gives `ring_size - gap`.
        for k, first in enumerate(positions):
            for second in positions[k + 1:]:
                gap = second - first
                if gap == 2 or ring_size - gap == 2:
                    return 1

    return 0



def compute_descriptors(smiles: str) -> Dict[str, Optional[float]]:
    """Compute the descriptor set for one SMILES string.

    Descriptors:
        alpha: H-bond donor count (``rdMolDescriptors.CalcNumHBD``).
        pi: Crippen logP (``Crippen.MolLogP``).
        MR: Crippen molar refractivity (``Crippen.MolMR``).
        B2: width returned by ``compute_b2`` on the hydrogen-added molecule.
        I3: meta indicator from ``compute_I3_meta_indicator`` (an ``int``).

    Args:
        smiles: SMILES string. ``None``, non-string and blank values are
            treated as missing.

    Returns:
        A dict with keys ``alpha``, ``pi``, ``MR``, ``B2`` and ``I3``. All
        values are ``None`` when the input is missing, cannot be parsed, or
        parses to a molecule with no atoms. An individual value is ``None``
        when its computation fails.
    """
    missing = {"alpha": None, "pi": None, "MR": None, "B2": None, "I3": None}

    # Short CSV rows surface as None and blank cells as "". Neither is a
    # molecule, so report missing values instead of raising or emitting zeros.
    if not isinstance(smiles, str) or not smiles.strip():
        return missing

    mol = Chem.MolFromSmiles(smiles)
    if mol is None or mol.GetNumAtoms() == 0:
        return missing

    # 3D-dependent features work on a hydrogen-added copy when possible.
    try:
        m3d = Chem.AddHs(mol)
    except Exception:
        m3d = mol

    def _or_none(compute):
        # Each descriptor is independent: one failure must not drop the others.
        try:
            return compute()
        except Exception:
            return None

    alpha = _or_none(lambda: float(rdMolDescriptors.CalcNumHBD(mol)))
    pi = _or_none(lambda: float(Crippen.MolLogP(mol)))
    MR = _or_none(lambda: float(Crippen.MolMR(mol)))

    # compute_b2 handles its own failures and returns None itself.
    B2 = compute_b2(m3d)
    # The meta indicator uses the molecule without explicit hydrogens, so
    # hydrogens are not counted as ring substituents.
    I3 = _or_none(lambda: int(compute_I3_meta_indicator(mol)))

    return {"alpha": alpha, "pi": pi, "MR": MR, "B2": B2, "I3": I3}


In [6]:
# CSV processing pipeline (csv module for file I/O; pandas only for the returned DataFrame)
import csv
from pathlib import Path
from typing import Optional, List, Dict
import pandas as pd

def reduce_memory_usage(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast columns of ``df`` in place to smaller dtypes and return ``df``.

    * Signed integer columns (int16/int32/int64) are cast to the narrowest of
      int8, int16 or int32 that holds their min and max; otherwise unchanged.
    * Float columns (float16/float32/float64) are cast to float32 when their
      min and max fit the float32 range. This also widens float16 columns and
      reduces the precision of float64 columns.
    * Object columns become ``category`` when fewer than half of the rows are
      distinct values (NaN counted as a value).
    * Numeric columns whose min or max is NaN, and all other dtypes, are left
      untouched.
    """
    downcastable = {'int16', 'int32', 'int64', 'float16', 'float32', 'float64'}
    int_ladder = (np.int8, np.int16, np.int32)
    row_count = max(len(df), 1)  # avoid dividing by zero on an empty frame

    for col in df.columns:
        dtype = df[col].dtype
        dtype_name = str(dtype)

        if dtype_name in downcastable:
            lowest = df[col].min()
            highest = df[col].max()
            if pd.isna(lowest) or pd.isna(highest):
                continue

            if dtype_name.startswith('int'):
                # First (narrowest) integer type whose range covers the data.
                for candidate in int_ladder:
                    limits = np.iinfo(candidate)
                    if lowest >= limits.min and highest <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                limits = np.finfo(np.float32)
                if lowest >= limits.min and highest <= limits.max:
                    df[col] = df[col].astype(np.float32)
        elif dtype == object:
            distinct = df[col].nunique(dropna=False)
            if distinct / row_count < 0.5:
                df[col] = df[col].astype('category')

    return df

def _guess_column(headers: List[str], candidates: List[str]) -> Optional[str]:
    lc_map = {h.lower(): h for h in headers}
    for cand in candidates:
        for name_lc, orig in lc_map.items():
            if cand in name_lc:
                return orig
    return None


def process_csv(
    input_path: str,
    smiles_col: Optional[str] = None,
    tm_col: Optional[str] = None,
    output_path: Optional[str] = None,
) -> pd.DataFrame:
    """Compute descriptors for every row of a CSV, write them out, and return them.

    Args:
        input_path: CSV file with a header row.
        smiles_col: Name of the SMILES column. Guessed from the headers when
            omitted, falling back to ``"smiles"``.
        tm_col: Name of the melting-point column. Guessed from the headers
            when omitted, falling back to ``"tm"``.
        output_path: Destination CSV. When omitted, the file is written to a
            fixed default directory (see the note in the body).

    Returns:
        A DataFrame with columns ``smiles``, ``tm``, ``alpha``, ``pi``,
        ``MR``, ``B2``, ``I3`` after ``reduce_memory_usage``. ``tm`` holds the
        raw strings read from the CSV; it is not converted to a number here.

    Raises:
        FileNotFoundError: If ``input_path`` does not exist.
        ValueError: If the SMILES or Tm column is not among the headers.
    """
    p = Path(input_path)
    if not p.exists():
        raise FileNotFoundError(f"Input file not found: {input_path}")

    # "utf-8-sig" also reads plain UTF-8 but strips a leading byte-order mark,
    # which would otherwise become part of the first header name.
    with p.open("r", newline="", encoding="utf-8-sig") as f:
        reader = csv.DictReader(f)
        headers = list(reader.fieldnames or [])
        if smiles_col is None:
            smiles_col = _guess_column(headers, ["smiles", "smile", "smile_string", "canonical_smiles"]) or "smiles"
        if tm_col is None:
            tm_col = _guess_column(headers, ["tm", "melt", "melting", "melting_point"]) or "tm"
        if smiles_col not in headers:
            raise ValueError(f"Could not find SMILES column. Got columns: {headers}")
        if tm_col not in headers:
            raise ValueError(f"Could not find Tm column. Got columns: {headers}")

        results: List[Dict[str, object]] = []
        for row in reader:
            # DictReader fills fields missing from short rows with None;
            # normalize that to "" so the descriptor step sees a string.
            smi = row.get(smiles_col) or ""
            tm_val = row.get(tm_col)
            results.append({"smiles": smi, "tm": tm_val, **compute_descriptors(smi)})

    if output_path is None:
        # NOTE(review): machine-specific default kept for backward
        # compatibility; pass output_path explicitly on other machines.
        out_dir = Path("/home/tp_ubuntu/project/newversion/pubchem-0/result/data")
        out_dir.mkdir(parents=True, exist_ok=True)
        output_path = out_dir / "processed_descriptors.csv"
    else:
        output_path = Path(output_path)
        output_path.parent.mkdir(parents=True, exist_ok=True)

    fieldnames = ["smiles", "tm", "alpha", "pi", "MR", "B2", "I3"]
    with output_path.open("w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        # None values are written as empty cells by the csv module.
        writer.writerows({k: rec.get(k) for k in fieldnames} for rec in results)

    print(f"Wrote {len(results)} rows to: {output_path}")
    return reduce_memory_usage(pd.DataFrame(results))

In [None]:
# Example run on main-data/melting_point_pubchem.csv


# Demo on the project's PubChem melting-point export. Any failure (missing
# file, missing columns, processing or display error) is reported as a message
# instead of raising, so the notebook keeps running.
default_input = "/home/tp_ubuntu/project/main-data/melting_point_pubchem.csv"
try:
    processed = process_csv(default_input)
    display(processed.head())
except Exception as e:
    print(f"Processing skipped or failed: {e}")

[16:02:57] UFFTYPER: Unrecognized charge state for atom: 1
[16:02:57] UFFTYPER: Unrecognized charge state for atom: 1
[16:02:57] UFFTYPER: Unrecognized atom type: Ca+2 (0)
[16:02:57] UFFTYPER: Unrecognized atom type: Ca+2 (0)
[16:02:57] UFFTYPER: Unrecognized charge state for atom: 1
[16:02:57] UFFTYPER: Unrecognized charge state for atom: 1
[16:03:00] UFFTYPER: Unrecognized charge state for atom: 0
[16:03:00] UFFTYPER: Unrecognized charge state for atom: 0
[16:03:00] UFFTYPER: Unrecognized atom type: Ni3+2 (0)
[16:03:00] UFFTYPER: Unrecognized atom type: Ni3+2 (0)
[16:03:00] UFFTYPER: Unrecognized atom type: Ni5+2 (0)
[16:03:00] UFFTYPER: Unrecognized atom type: Ni5+2 (0)
[16:03:00] UFFTYPER: Unrecognized charge state for atom: 1
[16:03:00] UFFTYPER: Unrecognized charge state for atom: 1
[16:03:00] UFFTYPER: Unrecognized charge state for atom: 10
[16:03:00] UFFTYPER: Unrecognized charge state for atom: 10
[16:03:01] UFFTYPER: Unrecognized charge state for atom: 1
[16:03:01] UFFTYPER: 