In [None]:
import pandas as pd
import numpy as np
import ast

# Config
TAU = 30  # divisor applied to intensities before exponentiation in smooth_weights
EMBEDDING_FILE = "./data/ensemble_embeddings_161.csv"
MIXTURE_DEF_FILE = "./data/Cleaned_Mixure_Definitions_Training_Set.csv"
TRAIN_DIST_FILE = "./data/TrainingData_mixturedist.csv"
OUTPUT_FILE = "./output/umich_train_features_clean.csv"

# Load molecule-level embeddings.
# The "prediction" column holds the text of a Python literal (presumably a
# list of floats -- confirm against the CSV), so each cell is parsed back into
# an object with ast.literal_eval.
df_embed = pd.read_csv(EMBEDDING_FILE)
df_embed["prediction"] = df_embed["prediction"].apply(ast.literal_eval)

# Two lookups keyed by CID: CID -> parsed prediction, and CID -> INTENSITY.
embedding_dict, intensity_dict = (
    df_embed.set_index("CID")[column].to_dict()
    for column in ("prediction", "INTENSITY")
)

# Load mixture definitions
df_mix = pd.read_csv(MIXTURE_DEF_FILE)
mixture_dict = {}  # (Dataset, Mixture Label) -> list of integer CIDs

for _, row in df_mix.iterrows():
    dataset = row["Dataset"]
    mixture = row["Mixture Label"]

    # Every column from the third onward is treated as a component slot
    # (presumably the first two are "Dataset" and "Mixture Label" -- confirm
    # against the CSV). .iloc makes the positional slice explicit; empty
    # slots are dropped.
    raw_cids = row.iloc[2:].dropna()
    cids = []
    for val in raw_cids:
        # Remove surrounding brackets/spaces and keep only the text before
        # the first remaining "]".
        clean_val = str(val).strip("[] ").split("]")[0]
        try:
            # Convert via float so values written like "123.0" still parse.
            cid = int(float(clean_val))
        except (ValueError, OverflowError):
            # Non-numeric text or NaN (ValueError) and infinity
            # (OverflowError) are not usable CIDs; skip that cell only.
            continue
        cids.append(cid)
    mixture_dict[(dataset, mixture)] = cids

# Weighting function
def smooth_weights(intensities, tau=TAU):
    """Return softmax weights of ``intensities`` scaled by ``1 / tau``.

    Each weight is ``exp(x_i / tau) / sum_j exp(x_j / tau)``, so the weights
    are positive and sum to 1. Larger ``tau`` flattens the weights; smaller
    ``tau`` concentrates them on the largest intensities.

    Args:
        intensities: Sequence (or array) of numeric intensity values.
        tau: Divisor applied to the intensities before exponentiation.

    Returns:
        A NumPy array of weights with the same shape as ``intensities``.
        An empty input yields an empty array.
    """
    scaled = np.asarray(intensities) / tau
    if scaled.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return scaled
    # Subtracting the maximum leaves the ratio unchanged but keeps np.exp
    # from overflowing to inf (which would turn the weights into NaN).
    exp_vals = np.exp(scaled - np.max(scaled))
    return exp_vals / np.sum(exp_vals)

def get_mixture_embedding(dataset, mixture):
    """Return the intensity-weighted average embedding of one mixture.

    The mixture's CIDs are looked up in ``mixture_dict`` under the key
    ``(dataset, mixture)``. CIDs that have no entry in ``embedding_dict`` are
    ignored. The remaining embeddings are averaged along axis 0 using the
    weights produced by ``smooth_weights`` from their intensities.

    Returns:
        The averaged embedding as computed by ``np.average``, or ``None``
        when the mixture is unknown or none of its CIDs has an embedding.
    """
    known_cids = [
        cid
        for cid in mixture_dict.get((dataset, mixture), [])
        if cid in embedding_dict
    ]
    if not known_cids:
        return None

    vectors = [embedding_dict[cid] for cid in known_cids]
    strengths = [intensity_dict[cid] for cid in known_cids]
    return np.average(vectors, axis=0, weights=smooth_weights(strengths))

# Load training data (mixture pairs)
df_train = pd.read_csv(TRAIN_DIST_FILE)
rows = []  # one entry per usable pair: [dataset, m1, m2, y, *feature values]

for _, pair in df_train.iterrows():
    dataset = pair["Dataset"]
    label_1, label_2 = pair["Mixture 1"], pair["Mixture 2"]
    # Pairs with a missing mixture label cannot be resolved; skip them.
    if pd.isna(label_1) or pd.isna(label_2):
        continue
    m1, m2 = int(label_1), int(label_2)
    y = pair["Experimental Values"]

    emb1 = get_mixture_embedding(dataset, m1)
    emb2 = get_mixture_embedding(dataset, m2)
    # Skip the pair unless both mixtures produced an embedding.
    if emb1 is None or emb2 is None:
        continue

    # Feature vector: element-wise absolute difference of the two embeddings.
    diff = np.abs(np.array(emb1) - np.array(emb2))
    rows.append([dataset, m1, m2, y, *diff.tolist()])

# Save to CSV
meta_cols = ["Dataset", "Mixture 1", "Mixture 2", "Experimental Values"]
# Each entry of `rows` is the four metadata values followed by the feature
# values, so the feature count comes from the first row. Reading it from the
# loop variable `diff` instead would raise NameError when no pair produced a
# row; in that case only the header is written.
n_features = len(rows[0]) - len(meta_cols) if rows else 0
cols = meta_cols + [f"f{i}" for i in range(n_features)]
df_out = pd.DataFrame(rows, columns=cols)
df_out.to_csv(OUTPUT_FILE, index=False)
print(f"Saved: {OUTPUT_FILE}")


Saved: ./output/umich_train_features_clean.csv
