### Evaluating reference data and model predictions


## Loading packages


In [None]:
import numpy as np
import os.path as osp
from gnnepcsaft.data.graphdataset import ThermoMLDataset, Ramirez, Esper
from gnnepcsaft.train.utils import rho_single, vp_single
from gnnepcsaft.epcsaft.utils import parameters_gc_pcsaft
from gnnepcsaft.demo.utils import _get_model_params, binary_test, es_para
from gnnepcsaft.data.rdkit_util import assoc_number, smilestoinchi
import polars as pl
import torch
from gnnepcsaft.train.models import GNNePCSAFTL, HabitchNNL
import joblib
import xgboost as xgb

In [None]:
# Evaluation data: ThermoMLDataset built from the repository data folder.
path = osp.join("gnnepcsaft/data", "thermoml")
test_loader = ThermoMLDataset(path)


def _load_eval(model_cls, ckpt_path):
    """Restore ``model_cls`` from ``ckpt_path`` and call ``.eval()`` on it.

    ``"cpu"`` is forwarded as the second positional argument of
    ``load_from_checkpoint``, exactly as in the individual calls it replaces.
    """
    return model_cls.load_from_checkpoint(ckpt_path, "cpu").eval()


# Checkpointed neural models, all put in eval mode right after loading.
pna_msigmae = _load_eval(
    GNNePCSAFTL, "./gnnepcsaft/train/checkpoints/model-hlrn7lqv.ckpt"
)
pna_assoc = _load_eval(
    GNNePCSAFTL, "./gnnepcsaft/train/checkpoints/assoc_model-j7isfrga.ckpt"
)
habitch_msigmae = _load_eval(
    HabitchNNL, "./gnnepcsaft/train/checkpoints/model-u62sbl40.ckpt"
)

# Serialized non-neural models: a joblib dump and an XGBoost booster.
rf_msigmae = joblib.load("./gnnepcsaft/train/checkpoints/rf_model.joblib")
xgb_msigmae = xgb.Booster()
xgb_msigmae.load_model("./gnnepcsaft/train/checkpoints/xgb_model.json")

In [None]:
def test(para_data):
    """Score per-molecule parameter sets against the module-level ``test_loader``.

    Every graph in ``test_loader`` whose ``InChI`` is a key of ``para_data`` is
    evaluated with ``rho_single`` and ``vp_single``; graphs without an entry
    are skipped. The last column of ``gh.rho`` / ``gh.vp`` is used as the
    reference value the predictions are compared with.

    Args:
        para_data: Mapping from InChI to the parameter set handed to
            ``rho_single`` and ``vp_single``.

    Returns:
        A dict of three equally long lists:
        ``"inchis"`` - InChI of each evaluated graph;
        ``"mden"`` - mean absolute relative deviation of the ``rho_single``
        prediction from the reference, or NaN when ``gh.rho`` has no rows;
        ``"mvp"`` - mean absolute difference between ``log10`` of the
        ``vp_single`` prediction (offset by 1e-6) and ``log10`` of the
        reference, or NaN when ``gh.vp`` has no rows.
    """
    inchis, den_errors, vp_errors = [], [], []
    for gh in test_loader:
        key = gh.InChI
        if key in para_data:
            params = para_data[key]
            ref_rho = gh.rho[:, -1]
            ref_vp = gh.vp[:, -1]

            # NaN marks "no reference rows available for this graph".
            den_error = np.nan
            if ref_rho.shape[0] > 0:
                rel_dev = (rho_single((params, gh.rho)) - ref_rho) / ref_rho
                den_error = np.mean(np.abs(rel_dev))

            vp_error = np.nan
            if ref_vp.shape[0] > 0:
                # The 1e-6 offset keeps log10 finite for a zero prediction.
                log_pred = np.log10(vp_single((params, gh.vp)) + 1e-6)
                vp_error = np.mean(np.abs(log_pred - np.log10(ref_vp)))

            inchis.append(key)
            den_errors.append(den_error)
            vp_errors.append(vp_error)
    return {"inchis": inchis, "mden": den_errors, "mvp": vp_errors}

## Evaluating references


In [None]:
# Reference parameters taken from the Esper dataset, keyed by InChI.
train_loader = Esper("gnnepcsaft/data/esper2023")
# Per-column sign applied to `graph.assoc` before raising 10 to that power.
assoc_sign = torch.tensor([-1.0, 1.0])
para_data = {}
for graph in train_loader:
    assoc = 10 ** (graph.assoc * assoc_sign)
    stacked = torch.hstack([graph.para, assoc, graph.munanb])
    para_data[graph.InChI] = stacked.squeeze().tolist()

# Score the reference parameters and store the per-molecule errors.
test_data = pl.DataFrame(test(para_data))
test_data.write_csv("../refesper.csv")

In [None]:
# Reference parameters read from a CSV file (first line skipped).
winter = pl.read_csv("../d4dd00077c1.csv", skip_rows=1)
# CSV columns copied into each parameter list, in this order.
winter_columns = ("m", "sigma", "epsilon_k", "kappa_ab", "epsilon_k_ab", "mu")
para_data = {}
for row in winter.iter_rows(named=True):
    inchi = smilestoinchi(row["SMILES0"])
    nanb = assoc_number(inchi)
    # The values returned by `assoc_number` are appended after the CSV columns.
    para_data[inchi] = [*(row[name] for name in winter_columns), *nanb]

test_data = pl.DataFrame(test(para_data))
test_data.write_csv("../ref_winter.csv")

In [None]:
# Reference parameters from `parameters_gc_pcsaft`, computed per test molecule.
# Molecules for which the call fails are skipped (best effort), but the
# failures are recorded instead of being silently discarded.
para_data = {}
gc_failures = []
for graph in test_loader:
    try:
        para_data[graph.InChI] = list(parameters_gc_pcsaft(graph.smiles))
    except (KeyboardInterrupt, SystemExit):
        # Never swallow a user interrupt or an interpreter shutdown request;
        # the original `except BaseException: pass` made this loop
        # impossible to stop from the keyboard.
        raise
    except BaseException as err:  # pylint: disable=broad-exception-caught
        # NOTE(review): the catch is kept as broad as the original on purpose;
        # presumably the backend can raise errors that do not derive from
        # `Exception` - confirm before narrowing it to `except Exception`.
        gc_failures.append((getattr(graph, "smiles", None), repr(err)))
if gc_failures:
    print(
        f"parameters_gc_pcsaft failed for {len(gc_failures)} molecule(s); "
        "they are skipped (details in `gc_failures`)."
    )

test_data = test(para_data)
test_data = pl.DataFrame(test_data)
test_data.write_csv("../refgc_pcsaft.csv")

## Evaluating models


In [None]:
# Parameters from `_get_model_params` using the two PNA checkpoints.
assoc_model = pna_assoc.model
msigmae_model = pna_msigmae.model
para_data = {}
for graph in test_loader:
    params = _get_model_params(assoc_model, msigmae_model, graph).tolist()  # type: ignore
    para_data[graph.InChI] = params

# Score the predicted parameters and store the per-molecule errors.
test_data = pl.DataFrame(test(para_data))
test_data.write_csv("../model_pna.csv")

In [None]:
# Parameters from `_get_model_params` using the PNA association model together
# with the Habitch checkpoint.
assoc_model = pna_assoc.model
msigmae_model = habitch_msigmae.model
para_data = {}
for graph in test_loader:
    params = _get_model_params(assoc_model, msigmae_model, graph).tolist()  # type: ignore
    para_data[graph.InChI] = params

# Score the predicted parameters and store the per-molecule errors.
test_data = pl.DataFrame(test(para_data))
test_data.write_csv("../model_habitch.csv")

In [None]:
# Parameters from `_get_model_params` using the PNA association model together
# with the joblib-loaded `rf_msigmae` object (passed as is, not via `.model`).
assoc_model = pna_assoc.model
para_data = {}
for graph in test_loader:
    params = _get_model_params(assoc_model, rf_msigmae, graph).tolist()  # type: ignore
    para_data[graph.InChI] = params

# Score the predicted parameters and store the per-molecule errors.
test_data = pl.DataFrame(test(para_data))
test_data.write_csv("../model_rf.csv")

In [None]:
def _association_flags(nanb1, nanb2):
    """Return the (cross, self, none) association flags of a component pair.

    ``nanb1`` and ``nanb2`` are the ``assoc_number`` results of the first and
    second component; only their elements 0 and 1 are read and compared
    against zero.
    """
    a1, b1 = nanb1[0], nanb1[1]
    a2, b2 = nanb2[0], nanb2[1]

    # NOTE(review): this predicate depends on the component order. With
    # (a1, b1) = (0, 1) and (a2, b2) = (1, 1) it is True, but with the two
    # components swapped it is False. Confirm that this is intended.
    cross = (a1 == 0 and b1 > 0 and a2 > 0) or (a1 > 0 and b1 == 0 and b2 > 0)
    # At least one component has non-zero values at both positions.
    self_assoc = (a1 > 0 and b1 > 0) or (a2 > 0 and b2 > 0)
    # Both components have zero at both positions.
    no_assoc = a1 == 0 and b1 == 0 and a2 == 0 and b2 == 0
    return cross, self_assoc, no_assoc


# Each item of `binary_data` is unpacked below as (pair of InChIs, rows of values).
binary_data = binary_test(pna_assoc.model, pna_msigmae.model)

data = {
    "inchis1": [],
    "inchis2": [],
    "mden": [],
    "inchi1_in_train": [],
    "inchi2_in_train": [],
    "(non-self) association": [],
    "self-association": [],
    "no-association": [],
}

for inchis, rho_data in binary_data:
    first, second = inchis[0], inchis[1]

    # Column 0 is treated as the prediction and column 1 as the reference.
    rho = np.asarray(rho_data)
    pred_rho, ref_rho = rho[:, 0], rho[:, 1]
    mape_den = np.mean(np.abs(pred_rho - ref_rho) / ref_rho)

    cross, self_assoc, no_assoc = _association_flags(
        assoc_number(first), assoc_number(second)
    )

    data["inchis1"].append(first)
    data["inchis2"].append(second)
    data["mden"].append(mape_den)
    # Membership in `es_para` decides the "in_train" flags.
    data["inchi1_in_train"].append(first in es_para)
    data["inchi2_in_train"].append(second in es_para)
    data["(non-self) association"].append(cross)
    data["self-association"].append(self_assoc)
    data["no-association"].append(no_assoc)

data = pl.DataFrame(data)
data.write_csv("../binary_test.csv")