In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from Scripts.def_colors import map_DMC, dmc_color
from Scripts.define_setup import *
from Scripts.myfit import fit_err, fun_lin, fun_quad, fun_cub, fun_quart, get_chi2_alpha_parfun
from Scripts.jup_plot import *
import re

# Set the display option for maximum rows (you can adjust this based on your needs)
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

# Check if we are in Google Colab environment
try:
    import google.colab
    IN_COLAB = True
    usetex = False
except:
    import os
    IN_COLAB = False
    if os.path.expanduser('~') == '/home/shixubenjamin':
        usetex = True
    else:
        usetex = False

if usetex:
    textrue_import()

# If in Google Colab, install the necessary data and set up the necessary environment
if IN_COLAB == True:
    !rm -rf /content/DMC-reproducibility-main /content/main.zip
    !wget https://github.com/zenandrea/DMC-reproducibility/archive/refs/heads/main.zip
    !unzip /content/main.zip
    %pwd
    %cd /content/DMC-reproducibility-main

replot_graphs = True

# Load the data
dimer_info = pd.read_csv('Data/dim_info.csv', index_col=0)
monomer_info = pd.read_csv('Data/mol_info.csv', index_col=0)

dimer_dmc_total_energy_data = pd.read_csv('Data/results_dim.csv', index_col=0)
monomer_dmc_total_energy_data = pd.read_csv('Data/results_mol.csv', index_col=0)
monomer_geometry_correction_data = pd.read_csv('Data/delta_mol_ref.csv', index_col=0)

# Define formatted names for the dimer systems
formatted_name_list = []
for system_id in range(1,67):
    dimer_name = dimer_info.loc[system_id,'name'].replace('pi', '$\pi$')
    dimer_name = dimer_name.split('_')
    part1 = dimer_name[0]

    part2 =  re.sub(r'(\d+)', r'$_\1$', dimer_name[1]).split('-')[0]
    part3 = re.sub(r'(\d+)', r'$_\1$', dimer_name[1]).split('-')[1]
    
    if len(dimer_name) > 2:
        additional_info = dimer_name[2:][0]
        formatted_name = part2 + '$\cdots$' + part3 + ' (' + additional_info + ')'
    else:
        formatted_name = part2 + '$\cdots$' + part3
    formatted_name_list.append(formatted_name)

dimer_info['formatted_name'] = formatted_name_list
dimer_info = dimer_info[['formatted_name','name','mol1','mol2','Nel','Nelv','atoms']]

### Analysis of the DMC data for the S66 dataset

In [2]:
# Compute the binding energy for the S66
# Filter dmc data to only include data with dmc_type = 'DMCdla5' and dmc_Jas = 'Jopt'
filtered_dimer_dmc_total_energy_data = dimer_dmc_total_energy_data[(dimer_dmc_total_energy_data['dmc_type'] == 'DMCdla5') & (dimer_dmc_total_energy_data['dmc_Jas'] == 'Jopt')]

dmc_energy_data = {system_id: {'total_energy_dimer': 0, 'total_energy_monomer_1':0, 'total_energy_monomer_2':0, 'binding_energy': 0} for system_id in range(1,67)}

system_name_dict = {system_id: {'Original': '', 'New': ''} for system_id in range(1,67)}

# Loop over the the dimers
for system_id, system_data in filtered_dimer_dmc_total_energy_data.groupby('ID'):
    system_data = system_data.sort_values('tau', ascending=False)
    system_data.set_index( 'tau', inplace=True )
    system_name = dimer_info.loc[system_id,'name']
    
    monomer_data = {1:0, 2:0}
    monomer_geometry_correction = {1:0, 2:0}
    # Get the monomer data
    for monomer_num in [1,2]:
        monomer_id = f'{system_id:02d}_{monomer_num}'
        monomer_name = dimer_info.loc[system_id,f'mol{monomer_num}']
        monomer_ref_id = monomer_info.loc[monomer_name, 'ref']
        monomer_ref_data = monomer_dmc_total_energy_data[(monomer_dmc_total_energy_data['mol_id'] == monomer_ref_id) & (monomer_dmc_total_energy_data['dmc_type'] == 'DMCdla5') & (monomer_dmc_total_energy_data['dmc_Jas'] == 'Jopt')].sort_values('tau', ascending=False)
        monomer_ref_data.set_index( 'tau', inplace=True )
        # Add the geometry correction
        monomer_ref_data['ene'] = monomer_ref_data['ene'] + monomer_geometry_correction_data[monomer_geometry_correction_data['mol_id'] == monomer_id]['ene-ref'].values[0]
        monomer_data[monomer_num] = monomer_ref_data
    dmc_energy_data[system_id]['total_energy_dimer'] = system_data.copy()
    dmc_energy_data[system_id]['total_energy_monomer_1'] = monomer_data[1]
    dmc_energy_data[system_id]['total_energy_monomer_2'] = monomer_data[2]
    # Compute the binding energy
    system_data['binding_energy'] = system_data['ene'] - monomer_data[1]['ene'] - monomer_data[2]['ene']
    system_data['binding_energy_err'] = (system_data['err']**2 + monomer_data[1]['err']**2 + monomer_data[2]['err']**2)**0.5
    dmc_energy_data[system_id]['binding_energy'] = system_data

### SI - Estimating CCSD(T) deformation energy

In [3]:
counter=0
final_monomer_total_energy = {}

latex_input_str = ''

for mol, monomer in monomer_info.groupby('mol'):
    monomer_dimer_index = monomer['ref'].tolist()[0].split('_')[0]
    monomer_name = re.sub(r'(\d+)', r'$_\1$', monomer.index.tolist()[0])
    monomer_data = monomer_dmc_total_energy_data[(monomer_dmc_total_energy_data['mol_id'] == monomer['ref'].values[0]) & (monomer_dmc_total_energy_data['dmc_type'] == 'DMCdla5') & (monomer_dmc_total_energy_data['dmc_Jas'] == 'Jopt')].sort_values('tau', ascending=False)
    monomer_data.set_index( 'tau', inplace=True )
    fig, ax = plt.subplots(figsize=(3.365,2), dpi=300,constrained_layout=True)

    # Fit the linear data
    fitting_data = monomer_data[ monomer_data.index <= 0.015 ]
    xdata = fitting_data.index.to_numpy()
    ydata = fitting_data['ene'].to_numpy()
    sigma = fitting_data['err'].to_numpy()
    xfit1, m1, s1 = fit_err(xdata,ydata,sigma,fitfun=fun_lin)

    # Fit the cubic data
    fitting_data = monomer_data[ monomer_data.index <= 0.11]
    xdata = fitting_data.index.to_numpy()
    ydata = fitting_data['ene'].to_numpy()
    sigma = fitting_data['err'].to_numpy()
    xfit3, m3, s3 = fit_err(xdata,ydata,sigma,fitfun=fun_cub)

    # Determine which fit is the best fit
    if abs(m1[0] - m3[0]) > 0.1:
        extrap_system_total_energy = m1[0]
        final_monomer_total_energy[monomer['ref'].tolist()[0]] = {'Monomer': monomer_name, 'Dimer Geometry': dimer_info.loc[int(monomer_dimer_index),'formatted_name'] + f" (ID {int(monomer_dimer_index)})", 'Total Energy': m1[0], 'Total Energy Error': s1[0], 'Formatted Total Energy': f'{m1[0]:.2f}$\pm${s1[0]:.2f}','Fit Type': 'linear'}
        energy_fit_type = 'linear'
    else:
        extrap_system_total_energy = m3[0]
        final_monomer_total_energy[monomer['ref'].tolist()[0]] = {'Monomer': monomer_name, 'Dimer Geometry': dimer_info.loc[int(monomer_dimer_index),'formatted_name'] + f" (ID {int(monomer_dimer_index)})", 'Total Energy': m3[0], 'Total Energy Error': s3[0], 'Formatted Total Energy': f'{m3[0]:.2f}$\pm${s3[0]:.2f}', 'Fit Type': 'cubic'}
        energy_fit_type = 'quadratic'

    # Plot the actual computed data
    ax.errorbar(monomer_data.index.tolist(), monomer_data['ene'].values - extrap_system_total_energy, yerr=monomer_data['err'].values, fmt='o', color='black',markeredgecolor='none',markersize=4, label=r'DMC//DLA')

    ax.plot(xfit1,m1 - extrap_system_total_energy,'--',color='blue', label=r'linear fit ($E^\textrm{bind}_{\tau \to 0}=$' + f'{m1[0]:.2f}' + r'${\pm}$' + f'{s1[0]:.2f})')
    ax.fill_between(xfit1,m1 - extrap_system_total_energy -1*s1,m1 - extrap_system_total_energy +1*s1,color='blue',alpha=0.2)

    ax.plot(xfit3,m3 - extrap_system_total_energy,'--',color='green', label=r'cubic fit ($E^\textrm{bind}_{\tau \to 0}=$' + f'{m3[0]:.2f}' + r'${\pm}$' + f'{s3[0]:.2f})')
    ax.fill_between(xfit3,m3 - extrap_system_total_energy -1*s3,m3 - extrap_system_total_energy +1*s3,color='green',alpha=0.2)

    ax.set_xlabel( 'DMC timestep [a.u.]' )
    ax.set_xticks( [0, 0.003, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3 ] )
    ax.set_xticklabels( [ '0', '3E-3', '0.01', '0.02', '0.03', '0.04', '0.05', '0.06', '0.1', '0.2', '0.3' ], rotation=90 )
    ax.set_xlim( [0,0.1*1.03] )
    ax.set_ylim([-5,5])
    ax.set_ylabel( 'Total Energy [kcal/mol]' )
    ax.legend(loc='lower left', fontsize=7)
    ax.set_title(f'{monomer_name}')

    counter +=1
    plt.savefig(f'Figures/Fig_SI_Monomer_{counter:02d}.png')

    latex_input_str += r"""\begin{figure}[!h]
    \includegraphics[width=3.365in]{"""+ f"Figures/Fig_SI_Monomer_{counter:02d}.png" + r"""}
    \caption{\label{fig:""" + f"monomer_{counter:02d}" + r"""} The time step dependence of the total energy of the """ + monomer_name + r""" monomer in the """ + dimer_info.loc[int(monomer_dimer_index),'formatted_name'] + f" dimer (ID {int(monomer_dimer_index)}) " + "geometry." + r"""}
\end{figure}
    
"""

In [4]:
# Convert the dictionary to a DataFrame
final_monomer_total_energy_df = pd.DataFrame(final_monomer_total_energy).T
final_monomer_total_energy_df = final_monomer_total_energy_df[['Monomer', 'Dimer Geometry', 'Formatted Total Energy', 'Fit Type']]
final_monomer_total_energy_df.columns = ['Monomer', 'Dimer Geometry', 'Total Energy [kcal/mol]', 'Fit Type']
latex_input_str = convert_df_to_latex_input(
    final_monomer_total_energy_df,
    start_input = '\\begin{table}',
    label = 'tab:monomer_tot_ene',
    caption = r'Total energy of the 14 monomers which make up the S66 dataset. These geometries are taken from specific dimer complexes within the S66 dataset that are identified in the table. The type of line used to extrapolate to the zero time step limit is also identified',
    end_input = '\\end{table}',
    replace_input = {
    },
    center = True,
    df_latex_skip = 0,
    index=False,
    output_str = True,
    column_format = 'l' + 'r'*(len(final_monomer_total_energy_df.columns)-1)
)
final_monomer_total_energy_df

Unnamed: 0,Monomer,Dimer Geometry,Total Energy [kcal/mol],Fit Type
21_1,AcNH$_2$,AcNH$_2$$\cdots$AcNH$_2$ (ID 21),-25290.29$\pm$0.03,cubic
20_1,AcOH,AcOH$\cdots$AcOH (ID 20),-28725.26$\pm$0.04,cubic
24_1,Benzene,Benzene$\cdots$Benzene ($\pi$-$\pi$) (ID 24),-23624.42$\pm$0.04,cubic
37_1,Cyclopentane,Cyclopentane$\cdots$Neopentane (ID 37),-21586.06$\pm$0.03,cubic
30_2,Ethene,Benzene$\cdots$Ethene (ID 30),-8610.42$\pm$0.02,cubic
32_2,Ethyne,Uracil$\cdots$Ethyne (ID 32),-7823.07$\pm$0.02,cubic
56_2,MeNH$_2$,Benzene$\cdots$MeNH$_2$ (NH-$\pi$) (ID 56),-11671.48$\pm$0.02,cubic
55_2,MeOH,Benzene$\cdots$MeOH (OH-$\pi$) (ID 55),-15103.82$\pm$0.02,cubic
36_1,Neopentane,Neopentane$\cdots$Neopentane (ID 36),-22350.00$\pm$0.04,cubic
34_1,Pentane,Pentane$\cdots$Pentane (ID 34),-22346.69$\pm$0.03,cubic


In [5]:
deformation_energy_data = {system_id: {'dimer name': 0,'mol1 name': 0, 'mol2 name': 0, 'mol1 deformation energy': 0, 'mol2 deformation energy': 0} for system_id in range(1,67)}

for i in range(1,67):
    formatted_name = dimer_info.loc[i,'formatted_name']
    mol_1_name = dimer_info.loc[i,'mol1']
    mol_2_name = dimer_info.loc[i,'mol2']
    mol_1_deformation_energy = monomer_geometry_correction_data.loc[monomer_geometry_correction_data['mol_id'] == f'{i:02d}_1', 'ene-ref'].values[0]
    mol_2_deformation_energy = monomer_geometry_correction_data.loc[monomer_geometry_correction_data['mol_id'] == f'{i:02d}_2','ene-ref'].values[0]
    deformation_energy_data[i] = {'dimer name': formatted_name, 'mol1 name': mol_1_name, 'mol2 name': mol_2_name, 'mol1 deformation energy': f'{mol_1_deformation_energy:.3f}', 'mol2 deformation energy': f'{mol_2_deformation_energy:.3f}'}
# Convert to Pandas Dataframe and the convert to latex string

deformation_energy_data_df = pd.DataFrame(deformation_energy_data).T
deformation_energy_data_df.columns = ['Dimer Name', 'Monomer 1', 'Monomer 2', r'$\Delta E_\textrm{mon. 1, def.}^\textrm{CCSD(T)}$ [kcal/mol]', r'$\Delta E_\textrm{mon. 2, def.}^\textrm{CCSD(T)}$ [kcal/mol]']
latex_input_str = convert_df_to_latex_input(
    deformation_energy_data_df,
    start_input = '\\begin{table}',
    label = 'tab:monomer_deformation_ene',
    caption = r'Deformation energy for the two monomers within each of the dimers of the S66 dataset. This energy is with respect to the geometry used in Table~\ref{tab:monomer_tot_ene}.',
    end_input = '\\end{table}',
    replace_input = {
    },
    center = True,
    df_latex_skip = 0,
    index=True,
    output_str = True,
    column_format = 'll' + 'r'*(len(deformation_energy_data_df.columns)-1)
)


# Write the DataFrame to a latex input
latex_input_str = '\n'.join(latex_input_str.splitlines()[7:-4]) + '\n'

with open('deformation_energy_table.tex', 'w') as f:
    f.write(r"""\LTcapwidth=\textwidth
    
\begin{longtable}{llllrr}
\caption{\label{tab:monomer_deformation_ene}Deformation energy for the two monomers within each of the dimers of the S66 dataset. This energy is with respect to the geometry used in Table~\ref{tab:monomer_tot_ene}.} \\

\toprule
ID & Dimer Name & Molecule 1 & Molecule 2 & $\Delta E_\textrm{mon. 1, def.}^\textrm{CCSD(T)}$ [kcal/mol] & $\Delta E_\textrm{mon. 2, def.}^\textrm{CCSD(T)}$ [kcal/mol] \\
\midrule
\endfirsthead



\caption[]{(continued)} \\
\endhead

\multicolumn{6}{r}{{Continued on next page}} \\
\endfoot

\bottomrule
\endlastfoot

""")
    f.write(latex_input_str)
    f.write(r"\end{longtable}")

deformation_energy_data_df

Unnamed: 0,Dimer Name,Monomer 1,Monomer 2,"$\Delta E_\textrm{mon. 1, def.}^\textrm{CCSD(T)}$ [kcal/mol]","$\Delta E_\textrm{mon. 2, def.}^\textrm{CCSD(T)}$ [kcal/mol]"
1,Water$\cdots$Water,Water,Water,0.031,0.0
2,Water$\cdots$MeOH,Water,MeOH,0.042,-0.016
3,Water$\cdots$MeNH$_2$,Water,MeNH2,0.109,-0.026
4,Water$\cdots$Peptide,Water,Peptide,0.087,0.067
5,MeOH$\cdots$MeOH,MeOH,MeOH,0.056,-0.022
6,MeOH$\cdots$MeNH$_2$,MeOH,MeNH2,0.222,-0.026
7,MeOH$\cdots$Peptide,MeOH,Peptide,0.147,-0.006
8,MeOH$\cdots$Water,MeOH,Water,0.038,-0.001
9,MeNH$_2$$\cdots$MeOH,MeNH2,MeOH,-0.003,-0.033
10,MeNH$_2$$\cdots$MeNH$_2$,MeNH2,MeNH2,0.005,-0.015


### SI - Comparing DLA and TM localization schemes for H<sub>2</sub>O and AcOH dimer

In [6]:
# Compute the binding energy with the TM

tm_filtered_dimer_dmc_total_energy_data = dimer_dmc_total_energy_data[(dimer_dmc_total_energy_data['dmc_type'] == 'DMCtm5') & (dimer_dmc_total_energy_data['dmc_Jas'] == 'JoptLA')]

tm_dmc_energy_data = {}

tm_system_name_dict = {system_id: {'Original': '', 'New': ''} for system_id in range(1,67)}

system_loc_scheme_binding_energy = {id: {'TM Binding Energy': 0, 'TM Binding Energy Error': 0, 'DLA Binding Energy': 0, 'DLA Binding Energy Error': 0} for id in [1,20]}

# Loop over the the dimers
for system_id, system_data in tm_filtered_dimer_dmc_total_energy_data.groupby('ID'):
    tm_dmc_energy_data[system_id] = {'total_energy_dimer': 0, 'total_energy_monomer_1':0, 'total_energy_monomer_2':0, 'binding_energy': 0}
    system_data = system_data.sort_values('tau', ascending=False)
    system_data.set_index( 'tau', inplace=True )
    system_name = dimer_info.loc[system_id,'name']
    
    monomer_data = {1:0, 2:0}
    monomer_geometry_correction = {1:0, 2:0}
    # Get the monomer data
    for monomer_num in [1,2]:
        monomer_id = f'{system_id:02d}_{monomer_num}'
        monomer_name = dimer_info.loc[system_id,f'mol{monomer_num}']
        monomer_ref_id = monomer_info.loc[monomer_name, 'ref']
        monomer_ref_data = monomer_dmc_total_energy_data[(monomer_dmc_total_energy_data['mol_id'] == monomer_ref_id) & (monomer_dmc_total_energy_data['dmc_type'] == 'DMCtm5') & (monomer_dmc_total_energy_data['dmc_Jas'] == 'JoptLA')].sort_values('tau', ascending=False)
        monomer_ref_data.set_index( 'tau', inplace=True )
        # Add the geometry correction
        monomer_ref_data['ene'] = monomer_ref_data['ene'] + monomer_geometry_correction_data[monomer_geometry_correction_data['mol_id'] == monomer_id]['ene-ref'].values[0]
        monomer_data[monomer_num] = monomer_ref_data
    tm_dmc_energy_data[system_id]['total_energy_dimer'] = system_data.copy()
    tm_dmc_energy_data[system_id]['total_energy_monomer_1'] = monomer_data[1]
    tm_dmc_energy_data[system_id]['total_energy_monomer_2'] = monomer_data[2]
    # Compute the binding energy
    system_data['binding_energy'] = system_data['ene'] - monomer_data[1]['ene'] - monomer_data[2]['ene']
    system_data['binding_energy_err'] = (system_data['err']**2 + monomer_data[1]['err']**2 + monomer_data[2]['err']**2)**0.5
    tm_dmc_energy_data[system_id]['binding_energy'] = system_data

    # Extrapolate the binding energy to the zero time step limit
    fitting_data = system_data[ system_data.index <= 0.015 ]
    xdata = fitting_data.index.to_numpy()
    ydata = fitting_data['binding_energy'].to_numpy()
    sigma = fitting_data['binding_energy_err'].to_numpy()
    xfit1, m1, s1 = fit_err(xdata,ydata,sigma,fitfun=fun_lin)

    fitting_data = system_data[ system_data.index <= 0.11]
    xdata = fitting_data.index.to_numpy()
    ydata = fitting_data['binding_energy'].to_numpy()
    sigma = fitting_data['binding_energy_err'].to_numpy()
    xfit3, m3, s3 = fit_err(xdata,ydata,sigma,fitfun=fun_cub)

    if abs(m1[0] - m3[0]) > 0.1:
        system_loc_scheme_binding_energy[system_id]['TM Binding Energy'] = m1[0]
        system_loc_scheme_binding_energy[system_id]['TM Binding Energy Error'] = s1[0]
    else:
        system_loc_scheme_binding_energy[system_id]['TM Binding Energy'] = m3[0]
        system_loc_scheme_binding_energy[system_id]['TM Binding Energy Error'] = s3[0]

    # Compute the binding energy with the DLA
    fitting_data = dmc_energy_data[system_id]['binding_energy'][dmc_energy_data[system_id]['binding_energy'].index <= 0.015 ]
    xdata = fitting_data.index.to_numpy()
    ydata = fitting_data['binding_energy'].to_numpy()
    sigma = fitting_data['binding_energy_err'].to_numpy()
    xfit1, m1, s1 = fit_err(xdata,ydata,sigma,fitfun=fun_lin)

    fitting_data = dmc_energy_data[system_id]['binding_energy'][dmc_energy_data[system_id]['binding_energy'].index <= 0.11]
    xdata = fitting_data.index.to_numpy()
    ydata = fitting_data['binding_energy'].to_numpy()
    sigma = fitting_data['binding_energy_err'].to_numpy()
    xfit3, m3, s3 = fit_err(xdata,ydata,sigma,fitfun=fun_cub)

    if abs(m1[0] - m3[0]) > 0.1:
        system_loc_scheme_binding_energy[system_id]['DLA Binding Energy'] = m1[0]
        system_loc_scheme_binding_energy[system_id]['DLA Binding Energy Error'] = s1[0]
    else:
        system_loc_scheme_binding_energy[system_id]['DLA Binding Energy'] = m3[0]
        system_loc_scheme_binding_energy[system_id]['DLA Binding Energy Error'] = s3[0]

# Create a dictionary with tuples as values for binding energy and error
system_loc_scheme_binding_energy_formatted = {
    (r'H2O$\cdots$H2O', 'TM'): (system_loc_scheme_binding_energy[1]['TM Binding Energy'], system_loc_scheme_binding_energy[1]['TM Binding Energy Error']),
    (r'H2O$\cdots$H2O', 'DLA'): (system_loc_scheme_binding_energy[1]['DLA Binding Energy'], system_loc_scheme_binding_energy[1]['DLA Binding Energy Error']),
    (r'AcOH$\cdots$AcOH', 'TM'): (system_loc_scheme_binding_energy[20]['TM Binding Energy'], system_loc_scheme_binding_energy[20]['TM Binding Energy Error']),
    (r'AcOH$\cdots$AcOH', 'DLA'): (system_loc_scheme_binding_energy[20]['DLA Binding Energy'], system_loc_scheme_binding_energy[20]['DLA Binding Energy Error']),
}

# Convert the dictionary to a pandas DataFrame
system_loc_scheme_binding_energy_df = pd.DataFrame(system_loc_scheme_binding_energy_formatted).T

# Rename the columns for clarity
system_loc_scheme_binding_energy_df.columns = ['Initial Binding Energy', 'Error']

# Optionally apply formatting later if you need it displayed as strings
system_loc_scheme_binding_energy_df['Eint [kcal/mol]'] = system_loc_scheme_binding_energy_df.apply(
    lambda row: f"{row['Initial Binding Energy']:.2f}$\pm${row['Error']:.2f}", axis=1
)

system_loc_scheme_binding_energy_df = system_loc_scheme_binding_energy_df[['Eint [kcal/mol]']]

# Create latex input string
latex_input_str = convert_df_to_latex_input(
    system_loc_scheme_binding_energy_df,
    start_input = '\\begin{table}',
    label = 'tab:loc_scheme_test',
    caption = r'Comparison of the extrapolated interaction energy $\Delta_\textrm{int}$ for the TM and DLA localization schemes for the H$_2$O$\cdots$H$_2$O (ID 1) and AcOH$\cdots$AcOH dimers (ID 20).',
    end_input = '\\end{table}',
    center = True,
    df_latex_skip = 0,
    index=True,
    output_str = True,
    column_format = 'll' + 'r'*(len(system_loc_scheme_binding_energy_df.columns)-1)
)

system_loc_scheme_binding_energy_df

Unnamed: 0,Unnamed: 1,Eint [kcal/mol]
H2O$\cdots$H2O,TM,-5.06$\pm$0.03
H2O$\cdots$H2O,DLA,-5.17$\pm$0.03
AcOH$\cdots$AcOH,TM,-19.99$\pm$0.06
AcOH$\cdots$AcOH,DLA,-20.28$\pm$0.10


## SI - Previous CCSD(T) literature and final CCSD(T) and CCSD(cT)-fit estimates

In [7]:
keshwarni_cc_data = pd.read_excel('Data/Kesharwani_10.1071_CH17588_SI.xlsx', sheet_name='F12c_aVTZ-F12 CCSD',usecols = 'J:L').dropna().drop([7, 8]).reset_index(drop=True)
keshwarni_cc_data.columns = ['HF', 'MP2', 'CCSD']
keshwarni_cc_data['(T)'] = pd.read_excel('Data/Kesharwani_10.1071_CH17588_SI.xlsx', sheet_name='(T) ',usecols = 'P').drop(list(range(19))).reset_index(drop=True)['Unnamed: 15']
keshwarni_cc_data['(cT)'] = keshwarni_cc_data['(T)'] /( 0.7764+0.278*(keshwarni_cc_data['MP2'] - keshwarni_cc_data['HF'])/(keshwarni_cc_data['CCSD']  - keshwarni_cc_data['HF']))
keshwarni_cc_data['(cT)-(T)'] = keshwarni_cc_data['(cT)'] - keshwarni_cc_data['(T)']

# Load CCSD(T) references
ccsdt_references = pd.read_csv('Data/Refs.csv', index_col=0)
ccsdt_references['CCSD(T) Final'] = ccsdt_references['Hobza_1']
ccsdt_references['CCSD(T) Error'] = ccsdt_references['Hobza_1']
ccsdt_references['CCSD(cT)-fit Final'] = ccsdt_references['Hobza_1']

for i in range(66):
    if np.isnan(ccsdt_references['Martin_Gold'][i+1]):
        ccsdt_references.loc[i+1,'CCSD(T) Final'] = np.average([ccsdt_references['Hobza_2'][i+1], ccsdt_references['Martin_Silver'][i+1], ccsdt_references['14k-Gold'][i+1]])
        ccsdt_references.loc[i+1,'CCSD(T) Error'] = np.std([ccsdt_references['Hobza_2'][i+1], ccsdt_references['Martin_Silver'][i+1], ccsdt_references['14k-Gold'][i+1]])*2
        ccsdt_references.loc[i+1,'CCSD(cT)-fit Final'] = ccsdt_references.loc[i+1,'Martin_Silver'] - keshwarni_cc_data['(cT)-(T)'][i]

    else:
        ccsdt_references.loc[i+1,'CCSD(T) Final'] = np.average([ccsdt_references['Hobza_2'][i+1], ccsdt_references['Martin_Gold'][i+1], ccsdt_references['14k-Gold'][i+1]])
        ccsdt_references.loc[i+1,'CCSD(T) Error'] = np.std([ccsdt_references['Hobza_2'][i+1], ccsdt_references['Martin_Gold'][i+1], ccsdt_references['14k-Gold'][i+1]])*2
        ccsdt_references.loc[i+1,'CCSD(cT)-fit Final'] = ccsdt_references.loc[i+1,'Martin_Silver'] - keshwarni_cc_data['(cT)-(T)'][i]

ccsdt_raw_references = ccsdt_references.copy()

# Round to nearest 0.01 kcal/mol
ccsdt_references = ccsdt_references.round(2)
ccsdt_references = ccsdt_references.applymap(lambda x: f"{x:.2f}" if isinstance(x, (int, float)) else x)
ccsdt_references['formatted_name'] = dimer_info['formatted_name']

# Make a list combining CCSD(T) Final $\pm$ CCSD(T) Error
ccsdt_references['CCSD(T) Final'] = ccsdt_references['CCSD(T) Final'].astype(str)
ccsdt_references['CCSD(T) Error'] = ccsdt_references['CCSD(T) Error'].astype(str)
ccsdt_references['CCSD(cT)-fit Final'] = ccsdt_references['CCSD(cT)-fit Final'].astype(str)
ccsdt_references['CCSD(T) Final'] = ccsdt_references['CCSD(T) Final'] + '$\pm$' + ccsdt_references['CCSD(T) Error']
ccsdt_references['CCSD(cT)-fit Final'] = ccsdt_references['CCSD(cT)-fit Final'] + '$\pm$' + ccsdt_references['CCSD(T) Error']

# Only include 'formatted_name', 'Hobza_2', 'Martin_Silver', '14k-Gold', 'CCSD(T) Final' and 'CCSD(cT)-fit Final' columns
ccsdt_references_table = ccsdt_references[['formatted_name', 'Hobza_2', 'Martin_Silver', '14k-Gold', 'CCSD(T) Final', 'CCSD(cT)-fit Final']]
ccsdt_references_table.columns = ['System', r'\v{R}ez\'a\v{c} \textit{et al.} (2006)', r'Kesharwani \textit{et al.} (2018)', r'Nagy \textit{et al.} (2023)', 'Final CCSD(T)', 'Final CCSD(cT)-fit']

# Write the DataFrame to a latex input
latex_input_str = '\n'.join(convert_df_to_latex_input(
    ccsdt_references_table,
    start_input = '\\begin{table}',
    label = 'tab:cc_references',
    caption = r'CCSD(T) references for the S66 dataset. The final CCSD(T) and CCSD(cT)-fit values are computed as the average of the values from the three references. The error is computed as twice the standard deviation of the values from the three references.',
    end_input = '\\end{table}',
    replace_input = {
    },
    adjustbox = 1,
    center = True,
    df_latex_skip = 0,
    rotate_column_header = True,
    output_str = True,
    column_format = 'll' + 'r'*len(ccsdt_references_table.columns)
).splitlines()[7:-4]) + '\n'

with open('ccsdt_references_table.tex', 'w') as f:
    f.write(r"""\LTcapwidth=\textwidth
    
\begin{longtable}{llrrrrrr}
\caption{\label{tab:cc_references}CCSD(T) references for the S66 dataset. The final CCSD(T) and CCSD(cT)-fit values are computed as the average of the values from the three references. The error is computed as twice the standard deviation of the values from the three references.} \\

\toprule
 & \rotatebox{90}{System} & \rotatebox{90}{\v{R}ez\'a\v{c} \textit{et al.} (2006)} & \rotatebox{90}{Kesharwani \textit{et al.} (2018)} & \rotatebox{90}{Nagy \textit{et al.} (2023)} & \rotatebox{90}{Final CCSD(T)} & \rotatebox{90}{Final CCSD(cT)-fit} \\ 
\midrule
\endfirsthead



\caption[]{(continued)} \\
\endhead

\multicolumn{8}{r}{{Continued on next page}} \\
\endfoot

\bottomrule
\endlastfoot

""")
    f.write(latex_input_str)
    f.write(r"\end{longtable}")

# ccsdt_references_table_latex = ccsdt_references_table.to_latex(index=True, escape=False, column_format='lrrrrr')
ccsdt_references_table

  ccsdt_references = ccsdt_references.applymap(lambda x: f"{x:.2f}" if isinstance(x, (int, float)) else x)


Unnamed: 0,\rotatebox{90}{System},\rotatebox{90}{\v{R}ez\'a\v{c} \textit{et al.} (2006)},\rotatebox{90}{Kesharwani \textit{et al.} (2018)},\rotatebox{90}{Nagy \textit{et al.} (2023)},\rotatebox{90}{Final CCSD(T)},\rotatebox{90}{Final CCSD(cT)-fit}
1,Water$\cdots$Water,-5.01,-4.98,-4.99,-4.99$\pm$0.03,-4.96$\pm$0.03
2,Water$\cdots$MeOH,-5.7,-5.67,-5.67,-5.68$\pm$0.03,-5.63$\pm$0.03
3,Water$\cdots$MeNH$_2$,-7.04,-6.99,-7.0,-7.01$\pm$0.05,-6.94$\pm$0.05
4,Water$\cdots$Peptide,-8.22,-8.18,-8.19,-8.20$\pm$0.03,-8.15$\pm$0.03
5,MeOH$\cdots$MeOH,-5.85,-5.82,-5.83,-5.83$\pm$0.02,-5.78$\pm$0.02
6,MeOH$\cdots$MeNH$_2$,-7.67,-7.62,-7.62,-7.64$\pm$0.04,-7.55$\pm$0.04
7,MeOH$\cdots$Peptide,-8.34,-8.31,-8.31,-8.32$\pm$0.03,-8.25$\pm$0.03
8,MeOH$\cdots$Water,-5.09,-5.06,-5.07,-5.08$\pm$0.02,-5.03$\pm$0.02
9,MeNH$_2$$\cdots$MeOH,-3.11,-3.09,-3.09,-3.10$\pm$0.02,-3.05$\pm$0.02
10,MeNH$_2$$\cdots$MeNH$_2$,-4.22,-4.18,-4.19,-4.20$\pm$0.03,-4.13$\pm$0.03


## SI - Timestep dependence for the binding energy of each S66 system

In [32]:
 # Plot Binding Energy and Total energy of the dimer

final_binding_energy =  {f'{system_id}': [0,0] for system_id in range(1,67)}
final_all_energy =  {f'{system_id}': {energy: 0 for energy in  ['Dimer', 'Dimer Error', 'Monomer 1','Monomer 1 Error', 'Monomer 2', 'Monomer 2 Error', 'Binding Energy', 'Binding Energy Error']} for system_id in range(1,67)}

latex_input_str = ''

if replot_graphs:
    for system_id in range(1,67):
        name = dimer_info.loc[system_id,'formatted_name']
        
        fig, ax = plt.subplots(1,2,figsize=(6.69,2), dpi=300,constrained_layout=True)
        ax[0].set_xlabel( 'DMC timestep [a.u.]' )
        ax[0].set_xticks( [0, 0.003, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3 ] )
        ax[0].set_xticklabels( [ '0', '3E-3', '0.01', '0.02', '0.03', '0.04', '0.05', '0.06', '0.1', '0.2', '0.3' ], rotation=90 )
        ax[0].set_xlim( [0,0.1*1.03] )
        ax[0].set_ylabel( r'$\Delta E_\textrm{int}$ [kcal/mol]' )

        # reference quantum-chemistry result
        ax[0].axhline( ccsdt_raw_references.loc[system_id,'CCSD(T) Final'], c='gray', ls='--', label='CCSD(T)')
        ccsdt_upper_lim = float(ccsdt_raw_references.loc[system_id,'CCSD(T) Final']) + float(ccsdt_raw_references.loc[system_id,'CCSD(T) Error'])
        ccsdt_lower_lim = float(ccsdt_raw_references.loc[system_id,'CCSD(T) Final']) - float(ccsdt_raw_references.loc[system_id,'CCSD(T) Error'])
        ax[0].fill_between([0,0.15],[ccsdt_lower_lim,ccsdt_lower_lim], [ccsdt_upper_lim,ccsdt_upper_lim], color='gray',alpha=0.2,edgecolor='none')

        ax[0].errorbar(dmc_energy_data[system_id]['binding_energy'].index.tolist(), dmc_energy_data[system_id]['binding_energy']['binding_energy'].values, yerr=dmc_energy_data[system_id]['binding_energy']['binding_energy_err'].values, fmt='o', color='black',markeredgecolor='none',markersize=4, label=r'DMC//DLA')

        system_binding_energy_data = dmc_energy_data[system_id]['binding_energy']

        taumaxfit = 0.11 #0.10
        fitting_data = system_binding_energy_data[ system_binding_energy_data.index <= taumaxfit ]
        xdata = fitting_data.index.to_numpy()
        ydata = fitting_data['binding_energy'].to_numpy()
        sigma = fitting_data['binding_energy_err'].to_numpy()

        xfit, m, s = fit_err(xdata,ydata,sigma,fitfun=fun_cub)
        ax[0].plot(xfit,m,'--',color='red', label=r'$\Delta E_\textrm{int}^\textrm{cubic extrap.}=$' + f'{m[0]:.2f}' + r'${\pm}$' + f'{s[0]:.2f}')
        ax[0].fill_between(xfit,m-1*s,m+1*s,color='red',alpha=0.2)

        binding_energy_data = system_binding_energy_data[ system_binding_energy_data.index <= 0.011 ]
        xdata = binding_energy_data.index.to_numpy()
        ydata = binding_energy_data['binding_energy'].to_numpy()
        sigma = binding_energy_data['binding_energy_err'].to_numpy()

        xfit1, m1, s1 = fit_err(xdata,ydata,sigma,fitfun=fun_lin)
        ax[0].plot(xfit1,m1,'--',color='blue', label=r'$\Delta E_\textrm{int}^\textrm{lin. extrap.}=$' + f'{m1[0]:.2f}' + r'${\pm}$' + f'{s1[0]:.2f}')
        ax[0].fill_between(xfit1,m1-1*s1,m1+1*s1,color='blue',alpha=0.2)
        if abs(m[0] - m1[0]) > 0.1:
            print(f'lin {name}: {m[0]:.2f}({int(round(100*s[0]))}) {m1[0]:.2f}({int(round(100*s1[0]))}) {m[0] - m1[0]:.2f}')
            final_binding_energy[f'{system_id}'] = [m1[0],s1[0]]
            final_all_energy[f'{system_id}']['Binding Energy'] = m1[0]
            final_all_energy[f'{system_id}']['Binding Energy Error'] = s1[0]
            energy_fit_type = 'linear'
        else:
            print(f'quad {name}: {m[0]:.2f}({int(round(100*s[0]))}) {m1[0]:.2f}({int(round(100*s1[0]))}) {m[0] - m1[0]:.2f}')
            final_binding_energy[f'{system_id}'] = [m[0],s[0]]
            final_all_energy[f'{system_id}']['Binding Energy'] = m[0]
            final_all_energy[f'{system_id}']['Binding Energy Error'] = s[0]
            energy_fit_type = 'cubic'

        # Assuming you have a subplot ax[0]
        handles, labels = ax[0].get_legend_handles_labels()

        # Reorder the handles and labels (example: swap the order)
        # Modify the indices to get the desired order
        order = [0,3,1,2]  # This is just an example, change the indices as needed

        # Apply the new order to the legend
        ax[0].legend([handles[i] for i in order], [labels[i] for i in order], fontsize=7, ncol=2,frameon=False)

        ax[1].set_xlabel( 'DMC timestep [a.u.]' )
        ax[1].set_xticks( [0, 0.003, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3 ] )
        ax[1].set_xticklabels( [ '0', '3E-3', '0.01', '0.02', '0.03', '0.04', '0.05', '0.06', '0.1', '0.2', '0.3' ], rotation=90 )
        ax[1].set_xlim( [0,0.1*1.03] )
        ax[1].set_ylabel( 'Total Energy [kcal/mol]' )


        system_dimer_total_energy_data = dmc_energy_data[system_id]['total_energy_dimer']

        fitting_data = system_dimer_total_energy_data[ system_dimer_total_energy_data.index <= 0.013 ]
        xdata = fitting_data.index.to_numpy()
        ydata = fitting_data['ene'].to_numpy()
        sigma = fitting_data['err'].to_numpy()

        xfit1, m1, s1 = fit_err(xdata,ydata,sigma,fitfun=fun_lin)

        extrap_system_total_energy = m1[0]

        ax[1].plot(xfit1,m1 - extrap_system_total_energy,'--',color='blue', label=r'$E^\textrm{lin. extrap.}=$' + f'{m1[0]:.2f}' + r'${\pm}$' + f'{s1[0]:.2f}')
        ax[1].fill_between(xfit1,m1 - extrap_system_total_energy -1*s1,m1 - extrap_system_total_energy +1*s1,color='blue',alpha=0.2)

        ax[1].errorbar(dmc_energy_data[system_id]['total_energy_dimer'].index.tolist(), dmc_energy_data[system_id]['total_energy_dimer']['ene'].values - extrap_system_total_energy, yerr=dmc_energy_data[system_id]['total_energy_dimer']['err'].values, fmt='o', color='black',markeredgecolor='none',markersize=4, label=r'DMC//DLA')

        taumaxfit = 0.11 #0.10
        fitting_data = system_dimer_total_energy_data[ system_dimer_total_energy_data.index <= taumaxfit ]
        xdata = fitting_data.index.to_numpy()
        ydata = fitting_data['ene'].to_numpy()
        sigma = fitting_data['err'].to_numpy()

        xfit3, m3, s3 = fit_err(xdata,ydata,sigma,fitfun=fun_cub)

        ax[1].plot(xfit3,m3 - extrap_system_total_energy,'--',color='red', label=r'$E^\textrm{cubic extrap.}=$' + f'{m3[0]:.2f}' + r'${\pm}$' + f'{s3[0]:.2f}')
        ax[1].fill_between(xfit3,m3 - extrap_system_total_energy -1*s3,m3 - extrap_system_total_energy +1*s3,color='green',alpha=0.2)

        # Assuming you have a subplot ax[0]
        handles, labels = ax[1].get_legend_handles_labels()

        # Reorder the handles and labels (example: swap the order)
        # Modify the indices to get the desired order
        order = [2,0,1]  # This is just an example, change the indices as needed

        # Apply the new order to the legend
        ax[1].legend([handles[i] for i in order], [labels[i] for i in order], fontsize=7,frameon=False)
        ax[1].set_ylim([-5,5])

        fig.suptitle(f'{name} (ID {system_id})')
        fig.savefig(f'Figures/Fig_SI_S66_{system_id:02d}.png',format='png')
        latex_input_str += r"""\begin{figure}[!h]
    \includegraphics[width=6.69in]{"""+ f"Figures/Fig_SI_S66_{system_id:02d}.png" + r"""}
    \caption{\label{fig:""" + f"dimer_{system_id:02d}" + r"""} The time step dependence of $\Delta E_\textrm{int}$ and the total energy of the dimer complex for the """ + f'{name} (ID {system_id}) dimer.' + r"""}
\end{figure}
    
"""
        # Make fits for the total energy of the monomers as well based on the energy_fit_type
        if energy_fit_type == 'linear':
            final_all_energy[f'{system_id}']['Dimer'] = m1[0]
            final_all_energy[f'{system_id}']['Dimer Error'] = s1[0]
            for monomer_num in [1,2]:
                monomer_total_energy_data = dmc_energy_data[system_id][f'total_energy_monomer_{monomer_num}']
                fitting_data = monomer_total_energy_data[ monomer_total_energy_data.index <= 0.045 ]
                xdata = fitting_data.index.to_numpy()
                ydata = fitting_data['ene'].to_numpy()
                sigma = fitting_data['err'].to_numpy()

                xfit1, m1, s1 = fit_err(xdata,ydata,sigma,fitfun=fun_lin)

                final_all_energy[f'{system_id}'][f'Monomer {monomer_num}'] = m1[0]
                final_all_energy[f'{system_id}'][f'Monomer {monomer_num} Error'] = s1[0]
        elif energy_fit_type == 'cubic':
            final_all_energy[f'{system_id}'][f'Dimer'] = m[0]
            final_all_energy[f'{system_id}'][f'Dimer Error'] = s[0]

            for monomer_num in [1,2]:
                monomer_total_energy_data = dmc_energy_data[system_id][f'total_energy_monomer_{monomer_num}']
                fitting_data = monomer_total_energy_data[ monomer_total_energy_data.index <= 0.045 ]
                xdata = fitting_data.index.to_numpy()
                ydata = fitting_data['ene'].to_numpy()
                sigma = fitting_data['err'].to_numpy()

                xfit1, m1, s1 = fit_err(xdata,ydata,sigma,fitfun=fun_quad)

                final_all_energy[f'{system_id}'][f'Monomer {monomer_num}'] = m1[0]
                final_all_energy[f'{system_id}'][f'Monomer {monomer_num} Error'] = s1[0]
    
        final_all_energy[f'{system_id}']['Fit type'] = energy_fit_type
    np.save('Data/final_binding_energy.npy', final_binding_energy)

else:
    final_binding_energy = np.load('Data/final_binding_energy.npy', allow_pickle=True).item()

# latex_input_str = ''
# for system_id in range(1,67):
#     name = dimer_info.loc[system_id,'formatted_name']
#     latex_input_str += r"""\begin{figure}[!h]
#     \includegraphics[width=6.69in]{"""+ f"Figures/Fig_SI_S66_{system_id:02d}.png" + r"""}
#     \caption{\label{fig:""" + f"dimer_{system_id:02d}" + r"""} The time step dependence of $\Delta E_\textrm{int}$ and the total energy of the dimer complex for the """ + f'{name} (ID {system_id}) dimer.' + r"""}
# \end{figure}
    
# """
    

quad Water$\cdots$Water: -5.17(3) -5.19(3) 0.02
quad Water$\cdots$MeOH: -5.82(4) -5.84(6) 0.03
quad Water$\cdots$MeNH$_2$: -7.18(4) -7.21(6) 0.03
quad Water$\cdots$Peptide: -8.59(6) -8.62(9) 0.04
quad MeOH$\cdots$MeOH: -5.93(4) -5.83(6) -0.10
quad MeOH$\cdots$MeNH$_2$: -7.83(5) -7.77(7) -0.07
quad MeOH$\cdots$Peptide: -8.58(7) -8.57(10) -0.00
quad MeOH$\cdots$Water: -5.24(5) -5.32(6) 0.08
quad MeNH$_2$$\cdots$MeOH: -3.12(5) -3.04(7) -0.08
lin MeNH$_2$$\cdots$MeNH$_2$: -4.20(5) -4.10(6) -0.10
lin MeNH$_2$$\cdots$Peptide: -5.42(7) -5.30(10) -0.12
quad MeNH$_2$$\cdots$Water: -7.53(5) -7.51(6) -0.03
quad Peptide$\cdots$MeOH: -6.32(7) -6.25(9) -0.07
lin Peptide$\cdots$MeNH$_2$: -7.50(6) -7.39(9) -0.11
quad Peptide$\cdots$Peptide: -8.88(8) -8.88(11) 0.01
quad Peptide$\cdots$Water: -5.37(6) -5.34(9) -0.03
lin Uracil$\cdots$Uracil (BP): -17.79(10) -17.53(14) -0.26
quad Water$\cdots$Pyridine: -7.30(6) -7.21(8) -0.08
quad MeOH$\cdots$Pyridine: -7.88(7) -7.83(10) -0.06
lin AcOH$\cdots$AcOH: -20.1

In [33]:
# Turn the final_all_energy dictionary into a pandas dataframe
final_binding_energy_df = pd.DataFrame(final_all_energy).T

# Set the index name based on dimer_info.loc[system_id,'name']
final_binding_energy_df['System'] = [dimer_info.loc[system_id,'formatted_name'] for system_id in range(1,67)]

# Give binding energy and error a new name
final_binding_energy_df[r'$\Delta E_\textrm{int}$ [kcal/mol]'] = [f"{final_binding_energy_df['Binding Energy'][system_id].round(2):.2f}$\pm${final_binding_energy_df['Binding Energy Error'][system_id].round(2):.2f}" for system_id in range(66)]
final_binding_energy_df = final_binding_energy_df[['System',r'$\Delta E_\textrm{int}$ [kcal/mol]','Fit type']]


# Write the DataFrame to a latex input
latex_input_str = '\n'.join(convert_df_to_latex_input(
    final_binding_energy_df,
    start_input = '\\begin{table}',
    label = 'tab:dmc-final-energies',
     caption = r'Final DMC $\Delta E_\textrm{int}$ estimates for the S66 dataset. The polynomial fit (either linear or cubic) used to extrapolate the zero time step valueis also reported.',
    end_input = '\\end{table}',
    replace_input = {
    },
    adjustbox = 1,
    center = True,
    df_latex_skip = 0,
    rotate_column_header = True,
    output_str = True,
    column_format = 'll' + 'r'*len(final_binding_energy_df.columns)
).splitlines()[7:-4]) + '\n'

with open('final_binding_energy.tex', 'w') as f:
    f.write(r"""\LTcapwidth=\textwidth
    
\begin{longtable}{llrr}
\caption{\label{tab:dmc-final-energies}Final DMC $\Delta E_\textrm{int}$ estimates for the S66 dataset. The polynomial fit (either linear or cubic) used to extrapolate the zero time step valueis also reported.} \\

\toprule
 & System & $\Delta E_\textrm{int}$ [kcal/mol] & Fit type \\
\midrule
\endfirsthead



\caption[]{(continued)} \\
\endhead

\multicolumn{4}{r}{{Continued on next page}} \\
\endfoot

\bottomrule
\endlastfoot

""")
    f.write(latex_input_str)
    f.write(r"\end{longtable}")

final_binding_energy_df

  final_binding_energy_df[r'$\Delta E_\textrm{int}$ [kcal/mol]'] = [f"{final_binding_energy_df['Binding Energy'][system_id].round(2):.2f}$\pm${final_binding_energy_df['Binding Energy Error'][system_id].round(2):.2f}" for system_id in range(66)]


Unnamed: 0,\rotatebox{90}{System},\rotatebox{90}{$\Delta E_\textrm{int}$ [kcal/mol]},\rotatebox{90}{Fit type}
1,Water$\cdots$Water,-5.17$\pm$0.03,cubic
2,Water$\cdots$MeOH,-5.82$\pm$0.04,cubic
3,Water$\cdots$MeNH$_2$,-7.18$\pm$0.04,cubic
4,Water$\cdots$Peptide,-8.59$\pm$0.06,cubic
5,MeOH$\cdots$MeOH,-5.93$\pm$0.04,cubic
6,MeOH$\cdots$MeNH$_2$,-7.83$\pm$0.05,cubic
7,MeOH$\cdots$Peptide,-8.58$\pm$0.07,cubic
8,MeOH$\cdots$Water,-5.24$\pm$0.05,cubic
9,MeNH$_2$$\cdots$MeOH,-3.12$\pm$0.05,cubic
10,MeNH$_2$$\cdots$MeNH$_2$,-4.10$\pm$0.06,linear


## SI - Acetic acid dimer validation

In [122]:
acetic_acid_data = {'AE LDA': {}, 'DLA LDA': {}, 'TM LDA': {}, 'TM PBE0': {}, 'TM PBE': {}}

acetic_acid_data['AE LDA'] =  pd.read_csv('Data/Acetic_Acid_Validation/tag_LDA_AE_a5z.csv', index_col=0, skiprows=2).iloc[:,[-3,-2]]
acetic_acid_data['TM LDA'] =  pd.read_csv('Data/Acetic_Acid_Validation/tag_LDA_ccECP_a6z_TM.csv', index_col=0, skiprows=2).iloc[:,[-3,-2]]
acetic_acid_data['TM PBE0'] =  pd.read_csv('Data/Acetic_Acid_Validation/tag_PBE0_ccECP_a6z_TM.csv', index_col=0, skiprows=2).iloc[:,[-3,-2]]
acetic_acid_data['TM PBE'] =  pd.read_csv('Data/Acetic_Acid_Validation/tag_PBE_ccECP_a6z_TM.csv', index_col=0, skiprows=2).iloc[:,[-3,-2]]
acetic_acid_data['DLA LDA'] = dmc_energy_data[20]['binding_energy'][['binding_energy', 'binding_energy_err']].sort_index(ascending=True)

for method in acetic_acid_data:
    acetic_acid_data[method].columns = ['binding_energy', 'binding_energy_err']

fig, ax = plt.subplots(figsize=(4,3), dpi=300,constrained_layout=True)
ax.set_xlabel( 'DMC timestep [a.u.]' )
ax.set_xticks( [0, 0.003, 0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.1, 0.2, 0.3 ] )
ax.set_xticklabels( [ '0', '3E-3', '0.01', '0.02', '0.03', '0.04', '0.05', '0.06', '0.1', '0.2', '0.3' ], rotation=90 )
ax.set_xlim( [0,0.1*1.03] )

# Plot the total energy 
ax.errorbar(acetic_acid_data['DLA LDA'].index.tolist(),acetic_acid_data['DLA LDA']['binding_energy'].tolist(),yerr=acetic_acid_data['DLA LDA']['binding_energy_err'].tolist(),fmt='o',color='orange',markerfacecolor='none',markersize=4,label='DLA LDA',alpha=0.45,markeredgewidth=1)
ax.errorbar(acetic_acid_data['TM PBE0'].index.tolist(),acetic_acid_data['TM PBE0']['binding_energy'].tolist(),yerr=acetic_acid_data['TM PBE0']['binding_energy_err'].tolist(),fmt='x',color='red',markersize=4,label='TM PBE0',alpha=0.45,markeredgewidth=1)
ax.errorbar(acetic_acid_data['TM PBE'].index.tolist(),acetic_acid_data['TM PBE']['binding_energy'].tolist(),yerr=acetic_acid_data['TM PBE']['binding_energy_err'].tolist(),fmt='x',color='blue',markersize=4,label='TM PBE',alpha=0.45,markeredgewidth=1)
ax.errorbar(acetic_acid_data['TM LDA'].index.tolist(),acetic_acid_data['TM LDA']['binding_energy'].tolist(),yerr=acetic_acid_data['TM LDA']['binding_energy_err'].tolist(),fmt='x',color='green',markersize=4,label='TM LDA',alpha=0.45,markeredgewidth=1)
ax.errorbar(acetic_acid_data['AE LDA'].index.tolist(),acetic_acid_data['AE LDA']['binding_energy'].tolist(),yerr=acetic_acid_data['AE LDA']['binding_energy_err'].tolist(),fmt='s',color='brown',markerfacecolor='none',markersize=4,label='AE LDA',alpha=0.45,markeredgewidth=1)

# reference quantum-chemistry result
ax.axhline( ccsdt_raw_references.loc[20,'CCSD(T) Final'], c='gray', ls='--', label='CCSD(T)')
ccsdt_upper_lim = float(ccsdt_raw_references.loc[20,'CCSD(T) Final']) + float(ccsdt_raw_references.loc[20,'CCSD(T) Error'])
ccsdt_lower_lim = float(ccsdt_raw_references.loc[20,'CCSD(T) Final']) - float(ccsdt_raw_references.loc[20,'CCSD(T) Error'])
ax.fill_between([0,0.15],[ccsdt_lower_lim,ccsdt_lower_lim], [ccsdt_upper_lim,ccsdt_upper_lim], color='gray',alpha=0.2,edgecolor='none')

ax.set_ylim([-22,-18])
ax.legend(ncol=2,fontsize=7,frameon=True)
plt.savefig('Figures/Fig_SI_Acetic_Acid_Validation.png',format='png')

In [123]:
acetic_acid_data

{'AE LDA':         binding_energy  binding_energy_err
 tau                                       
 0.0010      -19.897516            0.407492
 0.0020      -19.746888            0.389640
 0.0025      -20.276174            0.169199
 0.0030      -19.801817            0.405026
 0.0040      -19.848718            0.403659
 0.0050      -19.823955            0.129218
 0.0060      -19.003465            0.396906
 0.0070      -19.939892            0.131363
 0.0080      -20.351375            0.467942
 0.0090      -19.700100            0.119384
 0.0100      -20.339481            0.377742
 0.0120      -19.268199            0.128288
 0.0150      -19.139149            0.124559
 0.0200      -18.701567            0.117786
 0.0300      -17.898261            0.407781
 0.0400      -17.453230            0.128105
 0.0500      -16.488644            0.492310
 0.0600     -640.720662           26.026489
 0.1000      245.537349          195.259932,
 'DLA LDA':        binding_energy  binding_energy_err
 tau       

In [115]:
# Table of the value for the smallest time step for each method
acetic_acid_table = {method: {r'$\tau$': f'{acetic_acid_data[method].index.tolist()[0]:.3f}', r'$\Delta E_\textrm{int}$': f"{acetic_acid_data[method]['binding_energy'].tolist()[0]:.2f}$\pm${acetic_acid_data[method]['binding_energy_err'].tolist()[0]:.2f}" } for method in acetic_acid_data}

acetic_acid_table_df = pd.DataFrame(acetic_acid_table).T
acetic_acid_table_df

# Write the DataFrame to a latex input
latex_input_str = convert_df_to_latex_input(
    acetic_acid_table_df,
    start_input = '\\begin{table}',
    label = 'tab:acetic_acid_validation',
    caption = r'Validation of the DLA localization scheme with an LDA trial wave-function for the AcOH$\cdots$AcOH dimer (ID 20). The smallest time step $\tau$ and the corresponding interaction energy $\Delta E_\textrm{int}$ are reported using various trial wave-functions, localization schemes as well as with all-electron LDA',
    end_input = '\\end{table}',
    center = True,
    df_latex_skip = 0,
    output_str = True,
    column_format = 'll' + 'r'*(len(acetic_acid_table_df.columns)-1)
)

In [116]:
print(latex_input_str)

\begin{table}
\caption{\label{tab:acetic_acid_validation}Validation of the DLA localization scheme with an LDA trial wave-function for the AcOH$\cdots$AcOH dimer (ID 20). The smallest time step $\tau$ and the corresponding interaction energy $\Delta E_\textrm{int}$ are reported using various trial wave-functions, localization schemes as well as with all-electron LDA}
\begin{adjustbox}{center}
\begin{tabular}{llr}
\toprule
 & $\tau$ & $\Delta E_\textrm{int}$ \\ 
\midrule
AE LDA & 0.001 & -19.90$\pm$0.41 \\
TM LDA & 0.002 & -20.25$\pm$0.14 \\
DLA LDA & 0.003 & -20.26$\pm$0.06 \\
DLA PBE0 & 0.010 & -20.17$\pm$0.06 \\
DLA PBE & 0.010 & -20.10$\pm$0.06 \\
\bottomrule
\end{tabular}
\end{adjustbox}
\end{table}


In [68]:
acetic_acid_data['AE LDA'] = pd.read_csv('Data/Acetic_Acid_Validation/tag_LDA_AE_a5z.csv', 
                                         index_col=0, 
                                         skiprows=2)[['Eint','Eint.1']]

KeyError: "None of [Index(['Eint', 'Eint.1'], dtype='object')] are in the [columns]"

In [72]:
pd.read_csv('Data/Acetic_Acid_Validation/tag_LDA_AE_a5z.csv', index_col=0, skiprows=2).iloc[:,[-3,-2]]


Unnamed: 0_level_0,Unnamed: 25,Unnamed: 26
tau,Unnamed: 1_level_1,Unnamed: 2_level_1
0.001,-19.897516,0.407492
0.002,-19.746888,0.38964
0.0025,-20.276174,0.169199
0.003,-19.801817,0.405026
0.004,-19.848718,0.403659
0.005,-19.823955,0.129218
0.006,-19.003465,0.396906
0.007,-19.939892,0.131363
0.008,-20.351375,0.467942
0.009,-19.7001,0.119384


In [67]:
acetic_acid_data['DLA PBE0'].index.tolist()[2:]

[nan, 'tau', '0.01', '0.03', '0.06', '0.1']

In [63]:
acetic_acid_data['DLA PBE']

Unnamed: 0,Eint,Eint.1
,ene,err
tau,,
0.01,-20.0955511638344,0.060592629848125315
0.03,-20.214913836620024,0.057200707892956
0.06,-20.693277124568613,0.05762341407965011
0.1,-21.084125347660695,0.06251478897566376


In [65]:
dmc_energy_data[20]['binding_energy']

Unnamed: 0_level_0,mol1,mol2,dmc_type,dmc_Jas,ene,err,units,TotTimeSim[au],Pop,Pop-err,Var[au],tau_eff[au],corr-time[au],machine,N_MPI,cost,target_weight,ID,tau[au],binding_energy,binding_energy_err
tau,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.3,AcOH,AcOH,DMCdla5,Jopt,-57519.812035,0.062071,kcal/mol,1650.0,63403.690633,3.162223,1.488009,0.194067,0.597531,linuxpc-intel-slurm-parallel.peta4-cclake,224.0,0.92344,64000.0,20,0.3,-18.850375,0.087777
0.25,AcOH,AcOH,DMCdla5,Jopt,-57509.997948,0.065281,kcal/mol,1275.0,63470.790514,2.550943,1.488356,0.173859,0.561285,linuxpc-intel-slurm-parallel.peta4-cclake,224.0,0.915299,64000.0,20,0.25,-19.256745,0.089585
0.2,AcOH,AcOH,DMCdla5,Jopt,-57500.087539,0.060228,kcal/mol,1100.0,63621.092294,2.727987,1.4865,0.149777,0.475021,linuxpc-intel-slurm-parallel.peta4-cclake,224.0,0.928491,64000.0,20,0.2,-19.606654,0.084254
0.16,AcOH,AcOH,DMCdla5,Jopt,-57492.177183,0.052186,kcal/mol,1136.0,63706.778173,2.120428,1.503584,0.127282,0.372714,linuxpc-intel-slurm-parallel.peta4-cclake,224.0,0.947818,64000.0,20,0.16,-19.856915,0.079954
0.13,AcOH,AcOH,DMCdla5,Jopt,-57486.304214,0.052257,kcal/mol,1313.0,63773.811833,1.811972,1.523939,0.108265,0.383085,linuxpc-intel-slurm-parallel.peta4-cclake,224.0,0.936767,64000.0,20,0.13,-20.047808,0.07036
0.1,AcOH,AcOH,DMCdla5,Jopt,-57480.622831,0.053908,kcal/mol,1050.0,63847.457981,1.804296,1.536463,0.087204,0.355815,linuxpc-intel-slurm-parallel.peta4,64.0,3.030443,64000.0,20,0.1,-20.27142,0.079333
0.08,AcOH,AcOH,DMCdla5,Jopt,-57477.19018,0.049131,kcal/mol,1208.0,63891.433389,1.504746,1.525429,0.07193,0.302538,linuxpc-intel-slurm-parallel.peta4-cclake,224.0,0.941034,64000.0,20,0.08,-20.180076,0.07108
0.06,AcOH,AcOH,DMCdla5,Jopt,-57474.13952,0.041721,kcal/mol,1830.0,63934.43062,1.068534,1.511934,0.055601,0.295697,linuxpc-intel-slurm-parallel.peta4,160.0,1.251118,64000.0,20,0.06,-20.312392,0.065796
0.05,AcOH,AcOH,DMCdla5,Jopt,-57472.900419,0.045407,kcal/mol,1275.0,63944.537075,1.146079,1.513196,0.047025,0.280223,linuxpc-intel-slurm-parallel.peta4-cclake,224.0,0.929658,64000.0,20,0.05,-20.326628,0.067223
0.04,AcOH,AcOH,DMCdla5,Jopt,-57471.769304,0.054815,kcal/mol,1220.0,63967.903008,1.085572,1.557948,0.038169,0.308064,linuxpc-intel-slurm-parallel.peta4-icelake-zen,304.0,0.505188,64000.0,20,0.04,-20.48886,0.070655


In [61]:
acetic_acid_data['TM LDA']

Unnamed: 0,Eint,Eint.1
,ene,err
tau,,
0.002,-20.249157158590194,0.13905308862895474
0.003,-19.947756344945574,0.16819098818686676
0.005,-20.109028746743405,0.1469189612719205
0.007,-20.093354388844293,0.12892504604510083
0.01,-20.102992641055945,0.11392192030455253
0.015,-20.189274081001717,0.09500986756543489
0.02,-20.147651736326566,0.0709625410911984
0.03,-20.38186296335301,0.12067771039634907


In [55]:
acetic_acid_data['AE LDA']['Eint'][2:]

0.001     -19.897516237392956
0.002      -19.74688768785288
0.0025    -20.276174166026102
0.003     -19.801816794363685
0.004      -19.84871757467508
0.005      -19.82395530816897
0.006      -19.00346511839571
0.007     -19.939891526412172
0.008      -20.35137505425061
0.009      -19.70009957028323
0.01      -20.339481469835086
0.012     -19.268199022752626
0.015     -19.139148617712376
0.02      -18.701566920669595
0.03      -17.898260696243117
0.04      -17.453230250857544
0.05      -16.488643754078534
0.06       -640.7206622888417
0.1         245.5373486222296
Name: Eint, dtype: object

NaN                       ene
tau                       NaN
0.001     -19.897516237392956
0.002      -19.74688768785288
0.0025    -20.276174166026102
0.003     -19.801816794363685
0.004      -19.84871757467508
0.005      -19.82395530816897
0.006      -19.00346511839571
0.007     -19.939891526412172
0.008      -20.35137505425061
0.009      -19.70009957028323
0.01      -20.339481469835086
0.012     -19.268199022752626
0.015     -19.139148617712376
0.02      -18.701566920669595
0.03      -17.898260696243117
0.04      -17.453230250857544
0.05      -16.488643754078534
0.06       -640.7206622888417
0.1         245.5373486222296
Name: Eint, dtype: object

## MAIN - Comparison of DMC against CCSD(T) and CCSD(cT)

In [7]:
# Print the final data. Create a figure with three rows and plot final_binding_energy
fig, axs = plt.subplots( nrows=3, ncols=1, figsize=(6.67,7),dpi=600,constrained_layout=True)

datarange1 = list(range(1,24))
datarange2 = list(range(24,47))
datarange3 = list(range(47,67))

axs[0].axhline(0, color='k', ls='--')
axs[1].axhline(0, color='k', ls='--')
axs[2].axhline(0, color='k', ls='--')


axs[0].errorbar(datarange1,[final_binding_energy[f'{i}'][0] - final_binding_energy[f'{i}'][0] for i in datarange1], yerr = [final_binding_energy[f'{i}'][1] for i in datarange1], capsize=3, marker = 'none', ls='none', color = 'blue')
axs[1].errorbar(datarange2,[final_binding_energy[f'{i}'][0] - final_binding_energy[f'{i}'][0] for i in datarange2],yerr = [final_binding_energy[f'{i}'][1] for i in datarange2], marker = 'none',ls='none', color = 'blue', capsize=3)
axs[2].errorbar(datarange3,[final_binding_energy[f'{i}'][0] - final_binding_energy[f'{i}'][0] for i in datarange3], yerr = [final_binding_energy[f'{i}'][1] for i in datarange3], marker = 'none',ls='none', color = 'blue', capsize=3)

# Plot the Martin silver reference value
axs[0].scatter(datarange1, [ccsdt_references.loc[x,'CCSD(T) Final'] - final_binding_energy[f'{x}'][0] for x in datarange1],c='silver',marker='x', label=f'CCSD(T) [MAD: {np.mean([abs(ccsdt_references.loc[x,"CCSD(T) Final"] - final_binding_energy[f"{x}"][0]) for x in datarange1]):.2f}]')
axs[1].scatter(datarange2, [ccsdt_references.loc[x,'CCSD(T) Final'] - final_binding_energy[f'{x}'][0] for x in datarange2],c='silver',marker='x', label=f'CCSD(T) [MAD: {np.mean([abs(ccsdt_references.loc[x,"CCSD(T) Final"] - final_binding_energy[f"{x}"][0]) for x in datarange2]):.2f}]')
axs[2].scatter(datarange3, [ccsdt_references.loc[x,'CCSD(T) Final'] - final_binding_energy[f'{x}'][0] for x in datarange3],c='silver',marker='x', label=f'CCSD(T) [MAD: {np.mean([abs(ccsdt_references.loc[x,"CCSD(T) Final"] - final_binding_energy[f"{x}"][0]) for x in datarange3]):.2f}]')

# Plot the cT_data
# axs[0].scatter(datarange1, [-s66_cT_data[x-1]- final_binding_energy[f'{x}'][0] for x in datarange1],c='gold',marker='x', label=f'CCSD(cT)-fit [MAD: {np.mean([abs(-s66_cT_data[x-1] - final_binding_energy[f"{x}"][0]) for x in datarange1]):.2f}]')
axs[1].scatter(datarange2, [ccsdt_references.loc[x,'CCSD(cT)-fit Final']- final_binding_energy[f'{x}'][0] for x in datarange2],c='gold',marker='x', label=f'CCSD(cT)-fit [MAD: {np.mean([abs(ccsdt_references.loc[x,"CCSD(cT)-fit Final"] - final_binding_energy[f"{x}"][0]) for x in datarange2]):.2f}]')
# axs[2].scatter(datarange3, [-s66_cT_data[x-1]- final_binding_energy[f'{x}'][0] for x in datarange3],c='gold',marker='x', label=f'CCSD(cT)-fit [MAD: {np.mean([abs(-s66_cT_data[x-1] - final_binding_energy[f"{x}"][0]) for x in datarange3]):.2f}]')



axs[0].set_xticks(datarange1)
# Plot the names in the figure
for i in datarange1:
    axs[0].text(i,-0.9,f"{final_binding_energy[f'{i}'][0]:.2f}({int(round(100*final_binding_energy[f'{i}'][1]))})",fontsize=8,ha='center',rotation=90,  bbox=dict(facecolor='white', edgecolor='none',alpha=0.8 ))

axs[1].set_xticks(datarange2)
for i in datarange2:
    axs[1].text(i,-0.9,f"{final_binding_energy[f'{i}'][0]:.2f}({int(round(100*final_binding_energy[f'{i}'][1]))})",fontsize=8,ha='center',rotation=90,  bbox=dict(facecolor='white', edgecolor='none',alpha=0.8 ))
axs[2].set_xticks(datarange3)
for i in datarange3:
    axs[2].text(i,-0.9,f"{final_binding_energy[f'{i}'][0]:.2f}({int(round(100*final_binding_energy[f'{i}'][1]))})",fontsize=8,ha='center',rotation=90,  bbox=dict(facecolor='white', edgecolor='none',alpha=0.8 ))

axs[0].set_ylim([-1,1])
axs[1].set_ylim([-1,1])
axs[2].set_ylim([-1,1])

axs[2].set_xlabel('S66 system')

axs[0].legend(loc='upper left')
axs[1].legend(loc='upper left')
axs[2].legend(loc='upper left')

axs[0].set_title('H-bonded systems')
axs[1].set_title('Dispersion systems')
axs[2].set_title('Mixed systems')

fig.supylabel('Difference against DMC [kcal/mol]')

plt.savefig('Figures/Fig_MAIN_S66_compare.png')

In [8]:
# Plot relative differences for all systems
fig, axs = plt.subplots( nrows=3, ncols=1, figsize=(6.67,7),dpi=600,constrained_layout=True)

datarange1 = list(range(1,24))
datarange2 = list(range(24,47))
datarange3 = list(range(47,67))

axs[0].axhline(0, color='k', ls='--')
axs[1].axhline(0, color='k', ls='--')
axs[2].axhline(0, color='k', ls='--')


axs[0].errorbar(datarange1,[final_binding_energy[f'{i}'][0] - final_binding_energy[f'{i}'][0] for i in datarange1], yerr = [abs(final_binding_energy[f'{i}'][1]*100/final_binding_energy[f'{i}'][0]) for i in datarange1], capsize=3, marker = 'none', ls='none', color = 'blue')
axs[1].errorbar(datarange2,[final_binding_energy[f'{i}'][0] - final_binding_energy[f'{i}'][0] for i in datarange2],yerr = [abs(final_binding_energy[f'{i}'][1]*100/final_binding_energy[f'{i}'][0]) for i in datarange2], marker = 'none',ls='none', color = 'blue', capsize=3)
axs[2].errorbar(datarange3,[final_binding_energy[f'{i}'][0] - final_binding_energy[f'{i}'][0] for i in datarange3], yerr = [abs(final_binding_energy[f'{i}'][1]*100/final_binding_energy[f'{i}'][0]) for i in datarange3], marker = 'none',ls='none', color = 'blue', capsize=3)

# Plot the final (averaged) CCSD(T) reference value
axs[0].scatter(datarange1, [(ccsdt_references.loc[x,'CCSD(T) Final'] - final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange1],c='silver',marker='x', label=f'CCSD(T) [MRD: {np.mean([abs((ccsdt_references.loc[x,"CCSD(T) Final"] - final_binding_energy[f"{x}"][0])/final_binding_energy[f"{x}"][0])*100 for x in datarange1]):.2f}%]')
axs[1].scatter(datarange2, [(ccsdt_references.loc[x,'CCSD(T) Final'] - final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange2],c='silver',marker='x', label=f'CCSD(T) [MRD: {np.mean([abs((ccsdt_references.loc[x,"CCSD(T) Final"] - final_binding_energy[f"{x}"][0])/final_binding_energy[f"{x}"][0])*100 for x in datarange2]):.2f}%]')
axs[2].scatter(datarange3, [(ccsdt_references.loc[x,'CCSD(T) Final'] - final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange3],c='silver',marker='x', label=f'CCSD(T) [MRD: {np.mean([abs((ccsdt_references.loc[x,"CCSD(T) Final"] - final_binding_energy[f"{x}"][0])/final_binding_energy[f"{x}"][0])*100 for x in datarange3]):.2f}%]')

# Plot the cT_data
# axs[0].scatter(datarange1, [(-s66_cT_data[x-1]- final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange1],c='gold',marker='x', label=f'CCSD(cT)-fit [MAD: {np.mean([abs(-s66_cT_data[x-1] - final_binding_energy[f"{x}"][0]) for x in datarange1]):.2f}]')
axs[1].scatter(datarange2, [(ccsdt_references.loc[x,'CCSD(cT)-fit Final']- final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange2],c='gold',marker='x', label=f'CCSD(cT)-fit [MRD: {np.mean([abs((ccsdt_references.loc[x,"CCSD(cT)-fit Final"] - final_binding_energy[f"{x}"][0])/final_binding_energy[f"{x}"][0])*100 for x in datarange2]):.2f}%]')
# axs[2].scatter(datarange3, [(-s66_cT_data[x-1]- final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange3],c='gold',marker='x', label=f'CCSD(cT)-fit [MAD: {np.mean([abs(-s66_cT_data[x-1] - final_binding_energy[f"{x}"][0]) for x in datarange3]):.2f}]')

axs[0].set_xticks(datarange1)
axs[1].set_xticks(datarange2)
axs[2].set_xticks(datarange3)

axs[0].set_ylim([-10,20])
axs[1].set_ylim([-10,20])
axs[2].set_ylim([-10,20])

axs[2].set_xlabel('S66 system')
axs[0].legend(loc='upper center')
axs[1].legend(loc='upper center')
axs[2].legend(loc='upper center')

axs[0].set_title('H-bonded systems')
axs[1].set_title('Dispersion systems')
axs[2].set_title('Mixed systems')

fig.supylabel('Relative difference against DMC \%')
plt.savefig('Figures/Fig_SI_S66_compare_relative.png')

In [None]:
# # Make the data for the dictionary of the 

# with open('s66_rel_ene.txt','w') as f:
#     f.write("System DMC_Int_Ene (T)_Int_Ene (cT)_Int_Ene (T)_Diff (cT)_Diff (T)_Rel_Diff (cT)_Rel_Diff\n")
#     for i in range(1,67):
#         f.write(f'{i:02d}      {final_binding_energy[f"{i}"][0]:-7.3f}    {ccsdt_references.loc[i,"Martin_Silver"]:-7.3f}     {-s66_cT_data[i-1]:-7.3f}     {(ccsdt_references.loc[i,"Martin_Silver"] -  final_binding_energy[f"{i}"][0]):-7.3f}    {(-s66_cT_data[i-1]- final_binding_energy[f"{i}"][0]):-7.3f}     {abs((ccsdt_references.loc[i,"Martin_Silver"] - final_binding_energy[f"{i}"][0])*100/final_binding_energy[f"{i}"][0]):7.1f}         {abs((-s66_cT_data[i-1]- final_binding_energy[f"{i}"][0])*100/final_binding_energy[f"{i}"][0]):4.1f}\n')


## MAIN - Analysis of differences based on SAPT

In [11]:
# Plot the error between DMC and CCSD(T) against the dispersion/electrostatic ratio

import Data.Sherrill_Biofragment_SAPT_S66 as sapt_s66

binding_energy_decomposition = pd.DataFrame( sapt_s66.DATA )

binding_energy_decomposition['ELST DISP+ELST RATIO'] = binding_energy_decomposition['SAPT ELST ENERGY'] /(binding_energy_decomposition['SAPT DISP ENERGY'] + binding_energy_decomposition['SAPT ELST ENERGY'])

binding_energy_decomposition['LOG(ELST DISP RATIO)'] = np.log(binding_energy_decomposition['SAPT ELST ENERGY'] /(binding_energy_decomposition['SAPT DISP ENERGY']))


fig, axs = plt.subplots(figsize=(3.36,3.5),dpi=600,constrained_layout=True)

quantity_to_look_at = 'LOG(ELST DISP RATIO)'

axs.scatter(np.array(binding_energy_decomposition[quantity_to_look_at].tolist())[[x-1 for x in datarange1]], [(ccsdt_references.loc[x,'Martin_Silver'] - final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange1],c='red',marker='x', label='Electrostatic')
axs.scatter(np.array(binding_energy_decomposition[quantity_to_look_at].tolist())[[x-1 for x in datarange2]], [(ccsdt_references.loc[x,'Martin_Silver'] - final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange2],c='blue',marker='x',label='Dispersion')
axs.scatter(np.array(binding_energy_decomposition[quantity_to_look_at].tolist())[[x-1 for x in datarange3]], [(ccsdt_references.loc[x,'Martin_Silver'] - final_binding_energy[f'{x}'][0])*100/final_binding_energy[f'{x}'][0] for x in datarange3],c='green',marker='x', label='Mixed')

axs.set_xlabel(r'LOG(ELST/DISP) ratio from SAPT')
axs.set_ylabel(r'|DMC-CCSD(T)|/|DMC| [%]')
axs.legend()

plt.savefig('Figures/Fig_MAIN_Error_decomposition.png')
