In [None]:
import pandas as pd
from pathlib import Path

# Input / output locations for the mixture-level embedding statistics.
root = Path("./data/")
in_file = root / "mixture_stats_chemprop_embeddings.csv"
out_file = root / "belfaction_mixture_2686.csv"

# The CSV carries two header rows, so pandas builds two-level column labels;
# the first CSV column becomes the row index.
multi_cols = pd.read_csv(in_file, header=[0, 1], index_col=0)

# Collapse every (level-0, level-1) column label into one "<level0>_<level1>" string.
flat_names = [f"{stat}_{feat}" for stat, feat in multi_cols.columns]

mix = multi_cols.rename_axis("mixture_id")
mix.columns = flat_names

# Persist the flattened table and report what was written.
mix.to_csv(out_file)
print("saved:", out_file, "|  shape:", mix.shape)

saved: data/belfaction_mixture_2686.csv |  shape: (795, 2686)


In [None]:
import numpy as np, pandas as pd
from pathlib import Path
from scipy import stats
from tqdm import tqdm

root = Path('./data/')

# Training pairs; rows lacking either mixture identifier are discarded up front.
pairs_path = root / 'TrainingData_mixturedist.csv'
pairs = pd.read_csv(pairs_path)
pairs = pairs.dropna(subset=['Mixture 1', 'Mixture 2'])

# Flattened mixture feature table; the first CSV column is the row index,
# which is (re)named 'mixture_id'.
mix2686_path = root / 'belfaction_mixture_2686.csv'
mix2686 = pd.read_csv(mix2686_path, index_col=0)
mix2686 = mix2686.rename_axis('mixture_id')

def key(dataset, mixture, index=None):
    """Resolve a (dataset, mixture number) pair to an existing row label.

    Candidate labels have the form ``"<dataset>/<number>"``. The number is
    tried zero-padded to three digits, then to two digits, then unpadded;
    the first candidate present in ``index`` is returned.

    Parameters
    ----------
    dataset : object
        Dataset name. Converted with ``str`` and stripped of surrounding
        whitespace before the label is built.
    mixture : int, float or numeric str
        Mixture number. Converted with ``int(float(mixture))``, so ``3``,
        ``3.0`` and ``"3.0"`` all resolve to the same candidates.
    index : container of str, optional
        Labels to search (anything supporting ``in``). When omitted, the
        index of the module-level ``mix2686`` table is used, which is the
        original behaviour.

    Returns
    -------
    str
        The first candidate label found in ``index``.

    Raises
    ------
    KeyError
        If none of the candidate labels is present.
    ValueError
        If ``mixture`` cannot be parsed as a number (propagated from the
        ``float``/``int`` conversion).
    """
    if index is None:
        # Resolved at call time so the function follows any reload of mix2686.
        index = mix2686.index

    ds = str(dataset).strip()
    mnum = int(float(mixture))

    # Most-padded form first: the order decides which label wins when
    # several spellings of the same number exist.
    for fmt in (f'{mnum:03d}', f'{mnum:02d}', str(mnum)):
        candidate = f'{ds}/{fmt}'
        if candidate in index:
            return candidate

    raise KeyError(f"Mixture {ds}/{mnum} not found in mix2686")

# Sub-table used for the per-column Mann-Whitney U features: every column
# whose label contains 'mean_fp_', followed by every column whose label
# contains 'mean_pc' (DataFrame.filter(like=...) is a substring match).
# NOTE(review): the name says 158 dims, but the recorded run reports
# 14221 pair features = 5 * 2686 + 790 + 1, which implies 790 columns are
# selected here — confirm the substring match is not picking up more
# statistics than intended.
fp_part = mix2686.filter(like='mean_fp_')
pc_part = mix2686.filter(like='mean_pc')
mix158 = fp_part.join(pc_part)

# Build one feature vector per training pair:
#   [5 algebraic contrasts of the two mixture rows | per-column U statistics | Bushdid flag]
records, targets = [], []
skipped = []  # (dataset, mixture 1, mixture 2) of pairs whose mixtures could not be resolved

for _, r in tqdm(pairs.iterrows(), total=len(pairs)):
    # Defensive re-check: pairs is normally already filtered on these columns.
    if pd.isna(r['Mixture 1']) or pd.isna(r['Mixture 2']):
        continue
    ds, m1, m2, target = r['Dataset'], r['Mixture 1'], r['Mixture 2'], r['Experimental Values']

    # Resolve both row labels once per pair; they do not change across the
    # feature columns processed below.
    try:
        k1, k2 = key(ds, m1), key(ds, m2)
    except KeyError:
        # Remember the pair instead of dropping it without a trace.
        skipped.append((ds, m1, m2))
        continue

    v1 = mix2686.loc[k1].values
    v2 = mix2686.loc[k2].values

    # algebraic contrasts, concatenated block after block
    f_pair = np.stack([
        (v1 + v2) / 2,        # mean
        v1 * v2,              # product
        np.abs(v1 - v2),      # absolute diff
        np.minimum(v1, v2),   # min
        np.maximum(v1, v2)    # max
    ]).ravel()

    # Mann-Whitney U statistic for each column of mix158.
    # The rows are fetched once as 2-D arrays (rows matching the label x columns),
    # so column i of each array is exactly the sample the per-column lookup
    # `mix158.iloc[:, i].loc[label]` would give.
    # NOTE(review): with unique mixture ids each sample holds a single value,
    # so every test compares one value against one value — confirm that this
    # is the intended feature.
    s1 = mix158.loc[[k1]].to_numpy()
    s2 = mix158.loc[[k2]].to_numpy()
    n_u = mix158.shape[1]
    u_stats = np.fromiter(
        (stats.mannwhitneyu(s1[:, i], s2[:, i],
                            alternative='two-sided').statistic
         for i in range(n_u)),
        dtype=float,
        count=n_u
    )

    # Bushdid flag, using the same str/strip normalisation that key() applies
    # to the dataset name.
    bushdid = int(str(ds).strip().lower().startswith('bushdid'))

    records.append(np.concatenate([f_pair, u_stats, [bushdid]]))
    targets.append(target)

if skipped:
    print(f'skipped {len(skipped)} pair(s) with no matching mixture row, e.g. {skipped[:5]}')

if not records:
    # np.vstack would fail with a far less helpful message on an empty list.
    raise ValueError('no pair features were built: no training pair matched the mixture table')

X = np.vstack(records)
y = np.array(targets)

print('pair-feature matrix:', X.shape)

# Persist the design matrix and targets for the modelling step.
out  = Path("./output"); out.mkdir(exist_ok=True)
np.save(out / "belfaction_pair_train_X.npy", X)
np.save(out / "belfaction_pair_train_y.npy", y)

100%|██████████| 500/500 [01:25<00:00,  5.85it/s]


pair-feature matrix: (500, 14221)


In [None]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold
from scipy.stats import pearsonr
import numpy as np

# Model: fill missing feature values with the column mean, then fit an
# extra-trees ensemble on the imputed matrix.
imputer = SimpleImputer(strategy='mean')
forest = ExtraTreesRegressor(n_estimators=1000, random_state=42, n_jobs=-1)
pipe = make_pipeline(imputer, forest)

# 10-fold shuffled cross-validation; one Pearson r and one RMSE per fold.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
pear, rmse = [], []

for train_idx, val_idx in kf.split(X):
    pipe.fit(X[train_idx], y[train_idx])
    y_val = y[val_idx]
    y_hat = pipe.predict(X[val_idx])

    pear.append(pearsonr(y_val, y_hat)[0])
    rmse.append(np.sqrt(np.mean((y_val - y_hat) ** 2)))

print(f'CV Pearson {np.mean(pear):.3f} ± {np.std(pear):.3f}  |  RMSE {np.mean(rmse):.3f}')

# Refit on every available pair, then persist the fitted pipeline.
pipe.fit(X, y)
import joblib
joblib.dump(pipe, 'belfaction_extratrees.joblib')

CV Pearson 0.667 ± 0.104  |  RMSE 0.117


['belfaction_extratrees.joblib']