In [1]:
import numpy as np
import pandas as pd
import joblib
import os
import math

In [2]:
# Metadata lookup table; a later cell queries it with data_mapping.get(system_id, {})
# and merges the returned dict into each per-frame record.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code --
# only load mapping files from a trusted source.
data_mapping = joblib.load('../data/oc20_data_mapping.pkl')

In [51]:
# Build one metadata record per "system_id,frame_id" line found in the split's
# *.txt index files, enriched with whatever data_mapping holds for that system.
split_dir = '../data/s2ef/all/val_ood_cat'
# endswith() instead of find(): a name that merely contains ".txt" (for example a
# compressed "x.txt.xz" sibling) must not be opened as text. Sorted so the record
# order does not depend on the arbitrary order os.listdir returns.
file_names = sorted(name for name in os.listdir(split_dir) if name.endswith('.txt'))
new_dict = []
for file in file_names:
    with open(os.path.join(split_dir, file), 'r') as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            # Still strict about the field count: anything other than exactly two
            # comma-separated fields raises ValueError, as before.
            system_id, frame_id = (field.strip() for field in line.split(','))
            temp = {'system_id': system_id, 'frame_id': frame_id}
            # Look up with the stripped id so the key matches what is stored above.
            temp.update(data_mapping.get(system_id, {}))
            new_dict.append(temp)

In [52]:
# One row per (system_id, frame_id) record collected above; used as the right-hand
# side of the metadata merges further down.
df_summary = pd.DataFrame(new_dict)

In [53]:
# Prediction archive for this run. Later cells read the 'ids', 'forces',
# 'forces_label', 'chunk_idx', 'energy' and 'energy_label' arrays from it.
s2ef_predictions = np.load('../results/2023-07-14-09-08-16/s2ef_predictions.npz')

In [54]:
# Sanity check: how many prediction ids the archive contains.
s2ef_predictions['ids'].shape

(999809,)

In [55]:
# Per-structure absolute force error, summed over that structure's atoms.
# The flat 'forces' / 'forces_label' arrays are cut into per-structure pieces at the
# 'chunk_idx' boundaries; piece i belongs to ids[i].
# Assumes the force arrays are (n_atoms, 3) -- the code indexes three components.
force = s2ef_predictions['forces']
force_label = s2ef_predictions['forces_label']
ids = s2ef_predictions['ids']
# Reuse the array loaded above instead of asking the archive for 'forces' a second
# time (each lookup on the loaded .npz reads the member again).
chunk_size = [part.shape[0] for part in np.array_split(force, s2ef_predictions['chunk_idx'])]
if len(chunk_size) < ids.shape[0]:
    # Fail before the long loop rather than with an IndexError part-way through it.
    raise ValueError(
        'chunk_idx yields {} chunks but there are {} ids'.format(len(chunk_size), ids.shape[0])
    )
force_records = []
start = 0
for i in range(ids.shape[0]):
    tot = chunk_size[i]  # number of atoms in structure i
    stop = start + tot
    temp = np.abs(force[start:stop] - force_label[start:stop])
    xyz_temp = temp.sum(axis=0)  # absolute error summed over atoms, per component
    result = {"force{}_sum".format(j): xyz_temp[j] for j in range(3)}
    # The "_tol" spelling is kept on purpose: force_stat reads these exact column names.
    result.update({"force{}_tol".format(j): tot for j in range(3)})
    result['force_sum'] = xyz_temp.sum()
    result['force_tot'] = tot * 3
    result['ids'] = ids[i]
    force_records.append(result)
    start = stop
df_force = pd.DataFrame(force_records)
# ids look like "<system>_<frame>"; rebuild the key format used by the summary records.
df_force['system_id'] = df_force['ids'].apply(lambda x: 'random{}'.format(x.split('_')[0]))
df_force['frame_id'] = df_force['ids'].apply(lambda x: 'frame{}'.format(x.split('_')[1]))

In [56]:
# Attach the per-frame metadata to every force record; rows without a match keep
# their force columns and get NaN metadata (left join).
# NOTE(review): this rebinds df_force, so re-running the cell merges the metadata twice.
df_force = df_force.merge(df_summary, how='left', on=['system_id', 'frame_id'])

In [57]:
# Preview the merged force-error table.
df_force.head()

Unnamed: 0,force0_sum,force1_sum,force2_sum,force0_tol,force1_tol,force2_tol,force_sum,force_tot,ids,system_id,...,ads_id,bulk_mpid,bulk_symbols,ads_symbols,miller_index,shift,top,adsorption_site,class,anomaly
0,4.628906,8.976562,5.511719,30,30,30,19.125,90,1000066_11,random1000066,...,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1
1,1.313477,1.401367,1.554688,30,30,30,4.269531,90,1000066_122,random1000066,...,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1
2,1.332031,1.478516,1.277344,30,30,30,4.085938,90,1000066_133,random1000066,...,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1
3,1.498047,1.542969,1.433594,30,30,30,4.476562,90,1000066_148,random1000066,...,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1
4,3.007812,2.666016,3.162109,30,30,30,8.835938,90,1000066_167,random1000066,...,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1


In [58]:
def force_stat(df_force, batch_size=128):
    """Aggregate per-structure force error sums into mean absolute errors.

    Parameters
    ----------
    df_force : pandas.DataFrame
        Must contain the columns 'force_sum', 'force0_sum', 'force1_sum',
        'force2_sum' (summed absolute errors) and 'force_tot', 'force0_tol',
        'force1_tol', 'force2_tol' (the matching element counts).
    batch_size : int, default 128
        Number of rows summed at a time. The grand totals are accumulated batch by
        batch; the default reproduces the previously hard-coded value.

    Returns
    -------
    pandas.Series
        'force_mae', 'forcex_mae', 'forcey_mae', 'forcez_mae': each is the total of
        a sum column divided by the total of its count column. Every entry is NaN
        when the frame is empty (or a count total is zero) instead of raising
        ZeroDivisionError.

    Raises
    ------
    ValueError
        If ``batch_size`` is not a positive number.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    totals = dict.fromkeys(
        ['force_sum', 'force0_sum', 'force1_sum', 'force2_sum',
         'force_tot', 'force0_tol', 'force1_tol', 'force2_tol'],
        0,
    )
    # Same slices as ceil(n / batch_size) consecutive batches of batch_size rows.
    for start in range(0, df_force.shape[0], batch_size):
        batch = df_force.iloc[start:start + batch_size]
        for key in totals:
            totals[key] += batch[key].sum()

    def _ratio(sum_key, count_key):
        # Guard the empty / zero-count case so callers get NaN rather than a crash.
        count = totals[count_key]
        return totals[sum_key] / count if count else float('nan')

    return pd.Series(
        [_ratio('force_sum', 'force_tot'), _ratio('force0_sum', 'force0_tol'),
         _ratio('force1_sum', 'force1_tol'), _ratio('force2_sum', 'force2_tol')],
        index=['force_mae', 'forcex_mae', 'forcey_mae', 'forcez_mae'],
    )

In [59]:
# Constant key column: grouping by 'tot' puts every row into a single overall group.
df_force['tot'] = 'tot'

In [60]:
# Overall force MAE (single group, because 'tot' is constant).
df_force.groupby('tot').apply(force_stat)

Unnamed: 0_level_0,force_mae,forcex_mae,forcey_mae,forcez_mae
tot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tot,0.052571,0.046754,0.054297,0.056661


In [33]:
# NOTE(review): same expression as the previous cell, but the saved output below has
# an older execution count (33) and different numbers -- presumably left over from an
# earlier run on other predictions. Confirm, then label it in markdown or remove it.
df_force.groupby('tot').apply(force_stat)

Unnamed: 0_level_0,force_mae,forcex_mae,forcey_mae,forcez_mae
tot,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
tot,0.049389,0.044685,0.051262,0.052219


In [61]:
# Force MAE broken down by the 'class' column.
df_force.groupby(['tot', 'class']).apply(force_stat)

Unnamed: 0_level_0,Unnamed: 1_level_0,force_mae,forcex_mae,forcey_mae,forcez_mae
tot,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tot,0,0.039532,0.034097,0.04099,0.04351
tot,1,0.051458,0.046403,0.053318,0.054653
tot,2,0.070963,0.064098,0.073016,0.075775
tot,3,0.063231,0.058061,0.064528,0.067105


In [34]:
# NOTE(review): same expression as the previous cell; the saved output below has an
# older execution count (34) and different numbers -- presumably from an earlier run.
# Confirm, then label it in markdown or remove it.
df_force.groupby(['tot', 'class']).apply(force_stat)

Unnamed: 0_level_0,Unnamed: 1_level_0,force_mae,forcex_mae,forcey_mae,forcez_mae
tot,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
tot,0,0.038575,0.033689,0.040291,0.041745
tot,1,0.05074,0.046273,0.052669,0.053279
tot,2,0.060426,0.055855,0.062451,0.062973
tot,3,0.059605,0.054177,0.061496,0.063146


In [62]:
# Force MAE per (ads_symbols, class) pair.
df_force.groupby(['ads_symbols', 'class']).apply(force_stat)

Unnamed: 0_level_0,Unnamed: 1_level_0,force_mae,forcex_mae,forcey_mae,forcez_mae
ads_symbols,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
*C,0,0.058030,0.051253,0.058902,0.063935
*C,1,0.075550,0.067887,0.079610,0.079145
*C,2,0.087361,0.079949,0.088630,0.093484
*C,3,0.089591,0.088434,0.088718,0.091589
*C*C,0,0.052783,0.049318,0.054775,0.054251
...,...,...,...,...,...
*ONNH2,3,0.064809,0.058924,0.063335,0.072197
CH2*CO,0,0.038425,0.030774,0.041520,0.042980
CH2*CO,1,0.051094,0.043823,0.054655,0.054811
CH2*CO,2,0.062803,0.054990,0.066553,0.066853


In [13]:
# NOTE(review): same expression as the previous cell; the saved output below has an
# older execution count (13) and different numbers -- presumably from an earlier run.
# Confirm, then label it in markdown or remove it.
df_force.groupby(['ads_symbols', 'class']).apply(force_stat)

Unnamed: 0_level_0,Unnamed: 1_level_0,force_mae,forcex_mae,forcey_mae,forcez_mae
ads_symbols,class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
*C,0,0.051669,0.044546,0.054334,0.056129
*C,1,0.070813,0.067285,0.072991,0.072181
*C,2,0.080176,0.074626,0.083771,0.082146
*C,3,0.086012,0.090853,0.081938,0.085260
*C*C,0,0.049400,0.046418,0.051859,0.049920
...,...,...,...,...,...
*ONNH2,3,0.056726,0.046465,0.060687,0.063008
CH2*CO,0,0.037822,0.031444,0.040714,0.041303
CH2*CO,1,0.050589,0.045118,0.053703,0.052945
CH2*CO,2,0.058639,0.052020,0.061890,0.061997


In [63]:
# Export the force MAE tables: the per-class totals first (their 'tot' key column is
# renamed so it lines up with 'ads_symbols'), then the per-adsorbate breakdown.
force_by_class = (
    df_force.groupby(['tot', 'class'])
    .apply(force_stat)
    .reset_index(drop=False)
    .rename(columns={'tot': 'ads_symbols'})
)
force_by_adsorbate = (
    df_force.groupby(['ads_symbols', 'class'])
    .apply(force_stat)
    .reset_index(drop=False)
)
force_report = pd.concat([force_by_class, force_by_adsorbate], axis=0)
force_report.to_csv('../data/val_ood_cat_force.csv', index=None)

In [64]:
# Per-frame energy table: predicted energy, reference energy and the prediction id,
# plus the system/frame keys rebuilt from the "<system>_<frame>" id.
df_energy = pd.DataFrame().assign(
    energy=s2ef_predictions['energy'],
    energy_label=s2ef_predictions['energy_label'],
    ids=s2ef_predictions['ids'],
)
df_energy['system_id'] = df_energy['ids'].map(
    lambda raw_id: 'random{}'.format(raw_id.split('_')[0])
)
df_energy['frame_id'] = df_energy['ids'].map(
    lambda raw_id: 'frame{}'.format(raw_id.split('_')[1])
)

In [65]:
# Absolute and squared energy error per frame.
# Computed column-wise instead of with a Python-level apply(axis=1) over every row.
# The inputs are upcast to float64 first so that, whatever dtype the predictions were
# stored in, the difference and its square are not rounded (or overflowed) in a
# narrower float type and do not depend on NumPy's scalar promotion rules.
energy_err = df_energy['energy'].astype('float64') - df_energy['energy_label'].astype('float64')
df_energy['energy_abs_e'] = energy_err.abs()
df_energy['energy_sqr_e'] = energy_err ** 2

In [66]:
# Attach the per-frame metadata to every energy record; rows without a match keep
# their energy columns and get NaN metadata (left join).
# NOTE(review): this rebinds df_energy, so re-running the cell merges the metadata twice.
df_energy = df_energy.merge(df_summary, how='left', on=['system_id', 'frame_id'])

In [67]:
# Preview the merged energy-error table.
df_energy.head()

Unnamed: 0,energy,energy_label,ids,system_id,frame_id,energy_abs_e,energy_sqr_e,bulk_id,ads_id,bulk_mpid,bulk_symbols,ads_symbols,miller_index,shift,top,adsorption_site,class,anomaly
0,8.117188,9.515625,1000066_11,random1000066,frame11,1.398438,1.955627,1287,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1
1,1.700195,1.919922,1000066_122,random1000066,frame122,0.219727,0.04828,1287,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1
2,1.700195,1.878906,1000066_133,random1000066,frame133,0.178711,0.031938,1287,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1
3,1.694336,1.819336,1000066_148,random1000066,frame148,0.125,0.015625,1287,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1
4,1.264648,1.482422,1000066_167,random1000066,frame167,0.217773,0.047425,1287,60,mp-2330,Se6Nb4,*OHNNCH3,"(1, 1, 1)",0.096,True,"((13.81, 2.92, 30.05),)",2,1


In [68]:
def energy_stat(df_energy, batch_size=128):
    """Average the per-frame absolute and squared energy errors.

    Parameters
    ----------
    df_energy : pandas.DataFrame
        Must contain the columns 'energy_abs_e' and 'energy_sqr_e'.
    batch_size : int, default 128
        Number of rows summed at a time. The grand totals are accumulated batch by
        batch; the default reproduces the previously hard-coded value.

    Returns
    -------
    pandas.Series
        'energy_abs_e' and 'energy_sqr_e': the column totals divided by the number
        of rows. Both entries are NaN for an empty frame instead of raising
        ZeroDivisionError.

    Raises
    ------
    ValueError
        If ``batch_size`` is not a positive number.

    Notes
    -----
    The denominator is the row count, while ``Series.sum`` skips NaN by default, so
    rows with a missing error still count towards the denominator.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')

    n_rows = df_energy.shape[0]
    abs_total = 0
    sqr_total = 0
    # Same slices as ceil(n / batch_size) consecutive batches of batch_size rows.
    for start in range(0, n_rows, batch_size):
        batch = df_energy.iloc[start:start + batch_size]
        abs_total += batch['energy_abs_e'].sum()
        sqr_total += batch['energy_sqr_e'].sum()

    if n_rows == 0:
        means = [float('nan'), float('nan')]
    else:
        means = [abs_total / n_rows, sqr_total / n_rows]
    return pd.Series(means, index=['energy_abs_e', 'energy_sqr_e'])

In [69]:
# Constant key column: grouping by 'tot' puts every row into a single overall group.
df_energy['tot'] = 'tot'

In [70]:
# Overall mean absolute / mean squared energy error (single group).
df_energy.groupby('tot').apply(energy_stat)

Unnamed: 0_level_0,energy_abs_e,energy_sqr_e
tot,Unnamed: 1_level_1,Unnamed: 2_level_1
tot,0.603336,4.332628


In [26]:
# NOTE(review): same expression as the previous cell; the saved output below has an
# older execution count (26) and different numbers -- presumably from an earlier run.
# Confirm, then label it in markdown or remove it.
df_energy.groupby('tot').apply(energy_stat)

Unnamed: 0_level_0,energy_abs_e,energy_sqr_e
tot,Unnamed: 1_level_1,Unnamed: 2_level_1
tot,0.446813,0.716775


In [71]:
# Energy errors broken down by the 'class' column, with the group keys as columns.
df_energy.groupby(['tot', 'class']).apply(energy_stat).reset_index(drop=False)

Unnamed: 0,tot,class,energy_abs_e,energy_sqr_e
0,tot,0,0.251029,0.163676
1,tot,1,0.496527,0.894868
2,tot,2,1.234117,13.589407
3,tot,3,0.699252,1.205068


In [28]:
# NOTE(review): same expression as the previous cell; the saved output below has an
# older execution count (28) and different numbers -- presumably from an earlier run.
# Confirm, then label it in markdown or remove it.
df_energy.groupby(['tot', 'class']).apply(energy_stat).reset_index(drop=False)

Unnamed: 0,tot,class,energy_abs_e,energy_sqr_e
0,tot,0,0.219624,0.114614
1,tot,1,0.438379,0.534101
2,tot,2,0.724752,1.582577
3,tot,3,0.679711,1.233938


In [72]:
# Energy errors per (ads_symbols, class) pair.
df_energy.groupby(['ads_symbols', 'class']).apply(energy_stat)

Unnamed: 0_level_0,Unnamed: 1_level_0,energy_abs_e,energy_sqr_e
ads_symbols,class,Unnamed: 2_level_1,Unnamed: 3_level_1
*C,0,0.304751,0.189954
*C,1,0.633678,0.988742
*C,2,1.277577,4.386955
*C,3,1.003200,2.013195
*C*C,0,0.307063,0.211179
...,...,...,...
*ONNH2,3,0.621730,0.846417
CH2*CO,0,0.240429,0.125254
CH2*CO,1,0.486809,0.788576
CH2*CO,2,0.829453,2.367818


In [36]:
# NOTE(review): same expression as the previous cell; the saved output below has an
# older execution count (36) and different numbers -- presumably from an earlier run.
# Confirm, then label it in markdown or remove it.
df_energy.groupby(['ads_symbols', 'class']).apply(energy_stat)

Unnamed: 0_level_0,Unnamed: 1_level_0,energy_abs_e,energy_sqr_e
ads_symbols,class,Unnamed: 2_level_1,Unnamed: 3_level_1
*C,0,0.271506,0.164877
*C,1,0.611988,0.879887
*C,2,0.831009,1.441534
*C,3,1.212325,2.884799
*C*C,0,0.253115,0.136425
...,...,...,...
*ONNH2,3,0.845318,6.114027
CH2*CO,0,0.205702,0.101487
CH2*CO,1,0.438689,0.668835
CH2*CO,2,0.619055,1.135121


In [73]:
# Export the energy error tables: the per-class totals first (their 'tot' key column
# is renamed so it lines up with 'ads_symbols'), then the per-adsorbate breakdown.
energy_by_class = (
    df_energy.groupby(['tot', 'class'])
    .apply(energy_stat)
    .reset_index(drop=False)
    .rename(columns={'tot': 'ads_symbols'})
)
energy_by_adsorbate = (
    df_energy.groupby(['ads_symbols', 'class'])
    .apply(energy_stat)
    .reset_index(drop=False)
)
energy_report = pd.concat([energy_by_class, energy_by_adsorbate], axis=0)
energy_report.to_csv('../data/val_ood_cat_energy.csv', index=None)