In [1]:
import dataloader as dl
import numpy as np
import pandas as pd

# Location of the ani1x-release.h5 data file; passed to dl.iter_data_buckets in the cells below.
path = '/Users/dominicwelti/Documents/Master_Thesis_Data_Set/ani1x-release.h5'

In [2]:
import ase.io as io

class Xyz:
    '''Class for handling extended-xyz data format utilized by MACE.

    Parameters
    ----------
    folder : str
        Directory in which the output file is created.
    name : str
        File name without extension; ``write`` appends ``.xyz``.
    assignment : dict, optional
        Maps atom type numbers to atomic symbols, e.g. ``{1: 'H', 8: 'O'}``.
        When omitted, an empty mapping is used.
    '''
    def __init__(self, folder:str, name:str, assignment:dict=None):
        self.name = name
        # Create the fallback dict per instance: a `{}` default argument is a
        # single object shared by every Xyz built without an assignment.
        self.assign = {} if assignment is None else assignment # dictionary with number to atom name assignment
        self.path = f'{folder}/{name}'

    def unpack(self, lst):
        '''Unpack list (2 levels) and turn into string.

        Every inner sequence is joined with single spaces and the resulting
        pieces are joined with single spaces again. An empty list yields ''.
        '''
        return ' '.join(' '.join(entry) for entry in lst)

    def type_to_symbol(self, no: int):
        '''Method for turning atom type numbers to atomic symbols.

        Raises
        ------
        KeyError
            If `no` has no entry in the assignment dictionary.
        '''
        try:
            return self.assign[no]
        except KeyError:
            raise KeyError(f'Type {no} not contained within self.assign.') from None

    def write(self, data:pd.DataFrame):
        '''Write configuration data to extended XYZ file for use in MACE.

        Every row of `data` becomes one frame in ``<self.path>.xyz``: a line
        with the atom count, a comment line carrying the energy, and one line
        per atom with its symbol and three coordinates.

        Parameters
        ----------
        data : pd.DataFrame
            Needs the columns 'Config. size', 'Energy', 'Atom' (sequence of
            atom type numbers) and 'Position' (sequence of xyz triples), one
            row per frame. The index labels are irrelevant.

        Notes
        -----
        Atoms whose type is missing from the assignment are reported on
        stdout and left out, so the written atom count can exceed the number
        of atom lines in that frame.
        '''
        # Walk the rows directly instead of re-querying the frame for every
        # row; this is linear in the number of rows and does not depend on
        # how the frame is indexed.
        rows = zip(data['Config. size'], data['Energy'], data['Atom'], data['Position'])
        with open(f'{self.path}.xyz', 'w') as f:
            for size, energy, atoms, positions in rows:
                f.write(f'{size}\n')
                f.write(f'Energy={float(energy)} Properties=species:S:1:pos:R:3\n')
                for species, position in zip(atoms, positions):
                    if species in self.assign:
                        f.write(f'{self.type_to_symbol(species)}{position[0]:>24}{position[1]:>24}{position[2]:>24}\n')
                    else:
                        print(f'Type {species} not contained within self.assign.')

In [3]:
data_keys = ['ccsd(t)_cbs.energy'] # The coupled cluster ANI-1ccx data set (https://doi.org/10.1038/s41467-019-10827-4)

# One plain dict per conformation. The DataFrame is built once at the end,
# which avoids constructing a separate single-row DataFrame for every entry.
records=[]
config=1 # running number of the bucket yielded by the loader, starting at 1

for data in dl.iter_data_buckets(path,keys=data_keys):
    # These two values are identical for every conformation in the bucket.
    atomic_numbers = data['atomic_numbers']
    config_size = len(atomic_numbers)
    conform=1 # running number of the geometry inside the current bucket
    for energy, position in zip(data['ccsd(t)_cbs.energy'],data['coordinates']):
        records.append({
                        'Configuration': config,
                        'Conformation': conform,
                        'Config. size': config_size,
                        'Energy': energy,
                        'Atom': atomic_numbers,
                        'Position': position
                        })
        conform+=1
    config+=1

# Every row is labelled 0, which is what concatenating one-row frames
# produced, so code that selects rows with the label 0 keeps working.
df_comp=pd.DataFrame(
    records,
    columns=['Configuration', 'Conformation', 'Config. size', 'Energy', 'Atom', 'Position'],
    index=[0]*len(records),
)

In [4]:
# Same property selection as in the previous cell.
data_keys = ['ccsd(t)_cbs.energy'] # The coupled cluster ANI-1ccx data set (https://doi.org/10.1038/s41467-019-10827-4)

# Keep every bucket the loader yields so they can be inspected afterwards.
dfs = []
for data in dl.iter_data_buckets(path, keys=data_keys):
    dfs += [data]

# `data` still refers to the last bucket of the loop; show it as the cell output.
data

{'ccsd(t)_cbs.energy': array([-225.24572037, -225.2484541 , -225.2504756 , -225.24943494,
        -225.24264658, -225.25353792, -225.25369254, -225.24979692,
        -225.24570025, -225.24399321, -225.25497357, -225.25089869,
        -225.25132849, -225.25458548, -225.25155565, -225.25388082,
        -225.24562211, -225.25074729, -225.24357892, -225.25167897,
        -225.25013918, -225.25519899, -225.25452526, -225.25544954,
        -225.25111702, -225.25476744, -225.25350263, -225.25497492,
        -225.25410652, -225.2482528 , -225.25507996, -225.25386345,
        -225.25600133, -225.24738212, -225.25036709, -225.24597883,
        -225.25525652, -225.25557926, -225.25508023, -225.24646719,
        -225.25396793, -225.25431599, -225.24490939, -225.25239582,
        -225.24439881, -225.24686772, -225.24832876, -225.24815523,
        -225.25210006, -225.25064596, -225.25321892, -225.24862765,
        -225.25350195, -225.24246368, -225.2441465 , -225.25472841,
        -225.25325124, -22

In [13]:
# Names of all entries stored in the bucket currently bound to `data`.
test = [*data.keys()]

In [15]:
# First key whose name contains '.energy'; raises StopIteration when there is none.
next(filter(lambda key: '.energy' in key, test))

'ccsd(t)_cbs.energy'

In [5]:
# Atomic number -> element symbol lookup handed to Xyz. Without it the writer
# has an empty assignment, rejects every atom and produces frames that
# contain only the header lines.
atom_assign = {
    1: 'H',
    2: 'He',
    3: 'Li',
    4: 'Be',
    5: 'B',
    6: 'C',
    7: 'N',
    8: 'O'
}

xyz=Xyz('/Users/dominicwelti/Documents/Master_Thesis_Data_Set', 'ANI-1ccx', atom_assign)

In [6]:
# Convert Ha to eV. NOTE: this rescales the column in place, so executing the
# cell a second time applies the factor twice.
df_comp['Energy'] = df_comp['Energy'].mul(27.211386245988)

In [7]:
# Export all rows of df_comp as extended-XYZ frames to '<xyz.path>.xyz'.
xyz.write(df_comp)