# Notebook for assessing the performance of GAME-Net on the C3 dataset

In [2]:
import warnings
warnings.filterwarnings('ignore')
import torch
import numpy as np
import os
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
# from gnn_eads.models import MODELSPATH
from gnn_eads.data import DATAPATH
from gnn_eads.core.process_ase_db_to_PyG_dataset import load_FG_dataset, families
from gnn_eads.core.functions_data_extraction import get_fragment_energy
from gnn_eads.core.nets import PreTrainedModel
from ase.db import connect
from ase.atoms import Atoms
from ase.visualize import view
from torch_geometric.loader import DataLoader
import plotly.offline
from sklearn.decomposition import PCA
plotly.offline.init_notebook_mode()

ModuleNotFoundError: No module named 'gnn_eads.data'

# Load Model

In [None]:
# Load the pre-trained GAME-Net model from the package's models directory.
# MODELSPATH is imported here because its import in the imports cell is
# commented out, which leaves this cell failing with a NameError on a
# fresh kernel.
from gnn_eads.models import MODELSPATH

model_dir = os.path.join(MODELSPATH, "GAME_NET/")
model = PreTrainedModel(model_dir)

# Load C3 test set

In [None]:
# Build the graph dataset(s) for the propylene / Cu(111) C3 test set.
# NOTE: `families` rebinds the name imported from
# gnn_eads.core.process_ase_db_to_PyG_dataset in the imports cell.
families = ["propylene_111"]
path = os.path.join(DATAPATH, "propylene_test")

# Graph-construction switches forwarded unchanged to load_FG_dataset.
graph_options = dict(
    second_order=False,
    scale_factor=1.5,
    tolerance=.50,
    edge_features=False,
    ring_features=False,
    aromatic_features=False,
    radical_features=False,
    relax=False,
    num_el=False,
)

c3_loader = load_FG_dataset(
    root=path,
    database="propylene_c3_cu111_database.db",
    family=families,
    **graph_options,
)

fam:  propylene_111


[FGGraphDataset_Feat(357)]

## Create dataloader

In [None]:
# Wrap the loaded dataset collection in a PyG DataLoader (fixed order, batches of 16).
c3_dataloader = DataLoader(
    c3_loader,
    batch_size=16,
    shuffle=False,
)

# Number of entries held by the loader's underlying dataset object.
n_entries = len(c3_dataloader.dataset)
print(n_entries)

1


# Evaluate C3 test set

In [None]:
# Collect metadata, DFT reference energies and GAME-Net predictions for every
# graph of the C3 test set, keeping all five lists aligned index by index.
formula, data_id, category, true, pred = [], [], [], [], []

# Inference only: a single no_grad context covers the whole evaluation.
with torch.no_grad():
    for dataset in c3_dataloader.dataset:
        # One pass per dataset: metadata and prediction are read from the
        # same graph object, so the lists cannot drift out of order.
        for graph in dataset:
            formula.append(graph.formula)
            data_id.append(graph.id.item())
            category.append(graph.family)
            true.append(graph.ener.item())

            # Undo the target standardisation applied to the model output.
            y = model.model(graph) * model.std + model.mean
            pred.extend(value.item() for value in y)

<class 'float'>


## Create dataframe with predictions

In [None]:
# Assemble the per-graph results into one table; the column names are the
# keys that later cells rely on.
prediction_columns = {
    'formula': formula,
    'data_id': data_id,
    'family': category,
    'true': true,
    'pred': pred,
}
df_c3_dataset = pd.DataFrame(prediction_columns)

# Signed error is (true - pred); abs_error is its magnitude.
signed_error = df_c3_dataset["true"] - df_c3_dataset["pred"]
df_c3_dataset["error"] = signed_error
df_c3_dataset["abs_error"] = np.abs(df_c3_dataset["error"])
df_c3_dataset

Unnamed: 0,formula,data_id,family,true,pred,error,abs_error
0,Cu5C3H3O3,1,propylene_111,-57.414040,-57.297058,-0.116982,0.116982
1,Cu5C3H3O2,2,propylene_111,-50.023262,-50.026581,0.003319,0.003319
2,Cu4C3H2O,3,propylene_111,-39.781322,-40.038677,0.257355,0.257355
3,Cu4C3H2,4,propylene_111,-32.264820,-33.349289,1.084469,1.084469
4,Cu6C3H4,5,propylene_111,-39.215176,-40.287033,1.071857,1.071857
...,...,...,...,...,...,...,...
352,Cu4C3H5O2,353,propylene_111,-59.469173,-58.421738,-1.047436,1.047436
353,Cu6C3H4O2,354,propylene_111,-53.106983,-54.286736,1.179752,1.179752
354,Cu4C3H6O3,355,propylene_111,-69.362976,-69.255493,-0.107483,0.107483
355,Cu5C3H3O2,356,propylene_111,-50.902348,-50.120155,-0.782192,0.782192


## Get Information From Database

In [None]:
# Enrich every prediction row with metadata stored in the ASE database:
# the metal, per-element atom counts, and the structure as a plain dict.
ids = df_c3_dataset["data_id"]
db_path = os.path.join(DATAPATH, "propylene_test/raw", "propylene_c3_cu111_database_15_50.db")
with connect(db_path) as conn:
    # `row_id` instead of `id`, so the builtin is not shadowed after the loop.
    for row_id in ids:
        row = conn.get(row_id)
        atoms = row.toatoms().todict()
        metal = row.metal
        n_O = row.o_atoms
        n_C = row.c_atoms
        n_H = row.h_atoms
        # N and S counts are optional keys: default each one to 0
        # independently, so a missing `s_atoms` no longer discards an
        # existing `n_atoms` value (and unrelated errors are not swallowed).
        n_N = row.get("n_atoms", 0)
        n_S = row.get("s_atoms", 0)

        # Select the dataframe row(s) belonging to this database entry once.
        is_current = df_c3_dataset["data_id"] == row_id
        df_c3_dataset.loc[is_current, ["metal", "n_O", "n_C", "n_H", "n_N", "n_S",]] = [metal, n_O, n_C, n_H, n_N, n_S,]
        df_c3_dataset.loc[is_current, "atoms_ob"] = [atoms]


## Get $E_{ads}$ by subtracting the gas-phase energy of the adsorbate

In [None]:
# Derive adsorption energies: add the per-metal MAE, then subtract the
# fragment (gas-phase adsorbate) energy from both the DFT and predicted values.
df_game_net = df_c3_dataset.assign(mae_metal = df_c3_dataset.groupby(['metal'])['abs_error'].transform('mean'))

# Fragment energy is computed from the atom counts in the order C, H, O, N, S.
# The lambda is applied to df_game_net itself; the previous `df_gnn_eads` is
# not defined anywhere in this notebook and raised a NameError on a fresh kernel.
df_game_net['e_frag'] = df_game_net.apply(lambda x: get_fragment_energy([x['n_C'], x['n_H'], x['n_O'], x['n_N'], x['n_S']]), axis=1)
df_game_net['true_scaled'] = df_game_net['true'] - df_game_net['e_frag']
df_game_net['pred_scaled'] = df_game_net['pred'] - df_game_net['e_frag']

Unnamed: 0,formula,data_id,family,true,pred,error,abs_error,metal,n_O,n_C,n_H,n_N,n_S,atoms_ob,mae_metal,e_frag,true_scaled,pred_scaled
337,Cu7C3H6O2,338,propylene_111,-62.196178,-62.221443,0.025265,0.025265,Cu,2.0,3.0,6.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-59.376141,-2.820038,-2.845303
338,Cu6C3H6O3,339,propylene_111,-69.2211,-69.235733,0.014633,0.014633,Cu,3.0,3.0,6.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-66.828518,-2.392581,-2.407215
339,Cu5C3H6O2,340,propylene_111,-61.593639,-61.603863,0.010223,0.010223,Cu,2.0,3.0,6.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-59.376141,-2.217499,-2.227722
340,Cu7C3H3O2,341,propylene_111,-50.113289,-50.537949,0.42466,0.42466,Cu,2.0,3.0,3.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-49.226548,-0.886741,-1.3114
341,Cu6C3H3O,342,propylene_111,-43.058178,-42.764397,-0.293781,0.293781,Cu,1.0,3.0,3.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-41.77417,-1.284008,-0.990226
342,Cu9C3H3O2,343,propylene_111,-51.324703,-49.505154,-1.81955,1.81955,Cu,2.0,3.0,3.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-49.226548,-2.098155,-0.278605
343,Cu6C3H3O,344,propylene_111,-43.607922,-43.203491,-0.40443,0.40443,Cu,1.0,3.0,3.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-41.77417,-1.833751,-1.429321
344,Cu6C3H3O2,345,propylene_111,-50.355511,-49.874825,-0.480686,0.480686,Cu,2.0,3.0,3.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-49.226548,-1.128962,-0.648276
345,Cu5C3H3O,346,propylene_111,-43.733509,-43.740005,0.006496,0.006496,Cu,1.0,3.0,3.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-41.77417,-1.959339,-1.965835
346,Cu6C3H3O2,347,propylene_111,-49.959259,-49.423416,-0.535843,0.535843,Cu,2.0,3.0,3.0,0.0,0.0,"{'numbers': [29, 29, 29, 29, 29, 29, 29, 29, 2...",0.551729,-49.226548,-0.732711,-0.196868


## Get prediction metrics

In [None]:
# Summary metrics of the prediction error on the C3 test set.
abs_err = df_c3_dataset["abs_error"]
MAE = abs_err.mean()
RMSE = np.sqrt((df_c3_dataset["error"]**2).mean())

# R2 = 1 - SS_res / SS_tot, with the total sum of squares taken on the
# adsorption-energy scale (true_scaled).
ss_res = (df_game_net["error"]**2).sum()
ss_tot = ((df_game_net["true_scaled"] - df_game_net["true_scaled"].mean())**2).sum()
R2 = 1 - ss_res / ss_tot
MDAE = abs_err.median()

# Print the statistics, one per line.
for template, value in (
    ("MAE: {:.2f} eV", MAE),
    ("RMSE: {:.2f} eV", RMSE),
    ("R2: {:.2f}", R2),
    ("MDAE: {:.2f} eV", MDAE),
):
    print(template.format(value))

MAE: 0.55 eV
RMSE: 0.73 eV
R2: 0.67
MDAE: 0.42 eV


# Create interactive scatter plot

In [None]:
from gnn_eads.core.constants import rgb_colors
from plotly.express import colors
import plotly.figure_factory as ff

# Rebind `colors` from the plotly colour module to the Light24_r qualitative
# palette (a list of colour strings).
colors = colors.qualitative.Light24_r

# Give each metal found in the results its own palette colour; zip stops at
# the shorter of the two sequences.
metals = df_game_net["metal"].unique()
colour_dict = {metal_name: palette_colour for metal_name, palette_colour in zip(metals, colors)}

In [None]:
# Interactive parity plot: DFT vs. GAME-Net energies (fragment energy removed).

# End points of the parity line: observed DFT range padded by 0.25 eV,
# computed once and reused for both axes.
parity_lo = df_game_net["true_scaled"].min() - 0.25
parity_hi = df_game_net["true_scaled"].max() + 0.25

fig = go.Figure()
fig.add_trace(go.Scatter(x=[parity_lo, parity_hi], y=[parity_lo, parity_hi], mode='lines', name='Parity line'))

# One marker per sample; hovering shows both energies and the formula.
fig.add_trace(go.Scatter(x=df_game_net["true_scaled"], y=df_game_net["pred_scaled"], customdata=df_game_net["formula"],
                             hovertemplate='DFT: %{x:.2f} eV<br>Prediction: %{y:.2f} eV<br>' + 
                             'Formula: %{customdata}<extra></extra>', mode='markers', marker=dict(size=12, line=dict(width=2, color="black"), color=colour_dict["Cu"]), opacity=0.6))

fig.update_xaxes(showgrid=True, gridwidth=1, gridcolor='black',  zeroline=True, zerolinewidth=2, zerolinecolor='black', mirror=True)

fig.update_yaxes(showgrid=True, gridwidth=1, gridcolor='black', zeroline=True, zerolinewidth=2, zerolinecolor='black', mirror=True)

# NOTE(review): no trace in this figure sets a coloraxis, so this call
# presumably has no visible effect — confirm before relying on it.
fig.update_coloraxes(colorbar=dict(title="Metal", titleside="right", titlefont=dict(size=12), tickfont=dict(size=12), lenmode="pixels", len=200))


fig.update_layout(title='Parity plot for fragment groups', xaxis_title='DFT Energy / eV', yaxis_title=r'$E_\text{ads}^\text{GNN} / eV$',
                  autosize=True, width=1200, height=1200
                  , margin=dict(l=50, r=50, b=100, t=100, pad=4), grid=dict(rows=1, columns=1, pattern="independent"), 
                  paper_bgcolor="white", plot_bgcolor="white",
                  )

# Text box with the summary metrics. The lines are joined with <br> so no
# source indentation leaks into the label, and every value is formatted to
# two decimals with consistent unit spacing (works for numpy and plain floats).
metrics_text = "<br>".join([
    "MAE: {:.2f} eV".format(df_game_net["abs_error"].mean()),
    "RMSE: {:.2f} eV".format(RMSE),
    "R2: {:.2f}".format(R2),
    "MDAE: {:.2f} eV".format(MDAE),
])
fig.add_annotation(x=0.05, y=0.95, xref="paper", yref="paper",
                   text=metrics_text,
                   align="left", showarrow=False, font=dict(size=12, color="black", family="Arial"),
                   bordercolor="black", borderwidth=1, borderpad=2, bgcolor="white", opacity=0.8)

fig.show()

# Error distribution plot

In [None]:
# Distribution of the signed prediction error (true - pred) on the C3 test set,
# drawn as a fitted normal curve (histogram and rug are switched off).
# Exactly one group label is passed for the single data series.
fig = ff.create_distplot([df_game_net["error"]], ["Error_C3"], bin_size=0.05, curve_type="normal", colors=["#2E91E5"], show_rug=False, show_hist=False)

# Title and x-axis describe the plotted quantity: the prediction error.
fig.update_layout(title_text=r'$ \text{Distribution of prediction errors on the C3 dataset}$', xaxis_title=r"$\text{Error / eV}$", yaxis_title="Density")

fig.update_layout(width=576, height=576, margin=dict(l=50, r=50, b=50, t=50, pad=4), paper_bgcolor="white", plot_bgcolor="white", font=dict(size=12, color="black", family="Arial"))

fig.update_xaxes(showgrid=True, zeroline=False, showline=True, linewidth=2, linecolor='black', mirror=False, tickangle=0, tickfont=dict(size=12, color="black", family="Arial"), gridcolor="black", gridwidth=1,
                 title=dict(font=dict(size=16, color="black", family="Arial")), range=[-2, 2.5])

fig.update_yaxes(showgrid=True, gridcolor="black", zeroline=False, showline=True, linewidth=2, linecolor='black', mirror=True,
                 title=dict(font=dict(size=16, color="black", family="Arial", )
                            ))

fig.update_traces(line=dict(width=3))

# Vertical dashed markers: black for the mean error, red for the median error.
mean_error = df_game_net["error"].mean()
median_error = df_game_net["error"].median()
fig.add_shape(type="line", x0=mean_error, y0=-.05, x1=mean_error, y1=0.75, line=dict(color="black", width=2, dash="dash"))

fig.add_shape(type="line", x0=median_error, y0=-.05, x1=median_error, y1=0.75, line=dict(color="red", width=2, dash="dash"))

fig.show()