In [2]:
import numpy as np
import pandas as pd

class Cfg:
    '''Reader and writer for CFG configuration files (for use with MLIP).

    Typical use is ``read()`` followed by ``parse()``. Afterwards ``self.data``
    holds one DataFrame row per ``BEGIN_CFG`` ... ``END_CFG`` block.
    '''

    # Columns of the DataFrame built by parse(), in output order.
    _COLUMNS = ('Name', 'Lattice', 'Configuration', 'Config. size',
                'Energy', 'Atom', 'Position', 'Force')

    def __init__(self, file_name:str, path:str):
        '''Store the label (``file_name``) and the location (``path``) of the file.'''
        self.name = file_name
        self.path = path

    @staticmethod
    def _fields(values, width=10):
        '''Right-align every value to ``width`` and join the fields with one space.

        The explicit separator keeps neighbouring columns apart even when a
        value is as wide as (or wider than) ``width``.
        '''
        return ' '.join(f'{value:>{width}}' for value in values)

    def read(self):
        '''Read file and save lines in list self.lines.

        Trailing whitespace is stripped from every line. ``self.no_configs`` is
        the number of lines containing ``BEGIN_CFG`` (the same test parse() uses).
        '''
        with open(self.path, 'r') as file:
            self.lines = [line.rstrip() for line in file]
        self.no_configs = sum('BEGIN_CFG' in line for line in self.lines)

    def parse(self):
        '''Parse previously read self.lines to DataFrame self.data.

        Each configuration becomes one row (index 0) with the columns listed in
        ``_COLUMNS``. Lattice, position and force values are kept as strings,
        atom types are converted to int and the energy to float.
        A file without any complete configuration yields an empty DataFrame.

        Raises:
            ValueError: if an ``AtomData`` section appears before the ``Size``
                section of the same configuration.
        '''
        frames = []
        record = None  # configuration being assembled; None outside BEGIN_CFG/END_CFG
        size = None    # atom count of the current configuration
        config = 0

        for i, line in enumerate(self.lines):
            if 'BEGIN_CFG' in line:
                config += 1
                size = None  # never reuse the atom count of the previous configuration
                record = {
                    'Name': self.name,
                    'Lattice': [],
                    'Configuration': config,
                    'Config. size': 0,
                    'Energy': 0,
                    'Atom': [],
                    'Position': [],
                    'Force': [],
                }
            elif record is None:
                # Blank separators or stray text outside a configuration.
                continue
            elif 'Size' in line:
                size = int(self.lines[i+1]) # number of atoms in config
                record['Config. size'] = size
            elif ('SuperCell' in line) or ('Supercell' in line):
                # str.split() without argument splits on any whitespace run,
                # so tab-indented rows do not leave tab tokens behind.
                record['Lattice'] = [self.lines[i+k].split() for k in (1, 2, 3)]
            elif 'AtomData' in line:
                if size is None:
                    raise ValueError(
                        f'AtomData section in configuration {config} of {self.path} '
                        'appears before its Size section.')
                atoms, pos, force = [], [], []
                for entry in range(1, size+1):
                    values = self.lines[i+entry].split()
                    atoms.append(int(values[1]))   # column 1: atom type
                    pos.append(values[2:5])        # columns 2-4: position
                    force.append(values[5:8])      # columns 5-7: force (empty if absent)
                record['Atom'] = atoms
                record['Position'] = pos
                record['Force'] = force
            elif 'Energy' in line:
                record['Energy'] = float(self.lines[i+1].strip())
            elif 'END_CFG' in line:
                # List-valued fields are wrapped so each becomes a single cell.
                frames.append(pd.DataFrame({
                    key: [value] if isinstance(value, list) else value
                    for key, value in record.items()
                }))
                record = None

        if frames:
            self.data = pd.concat(frames)
        else:
            self.data = pd.DataFrame(columns=list(self._COLUMNS))

    def write(self, df:pd.DataFrame):
        '''Write data to .cfg file for use with MLIP.

        One ``BEGIN_CFG`` ... ``END_CFG`` block is written per row of ``df``.
        ``Lattice``, ``Force`` and ``Stress`` are optional columns;
        ``Config. size``, ``Atom``, ``Position`` and ``Energy`` are required.

        NOTE(review): the output file is ``<self.path>.xyz`` although the
        content is CFG-formatted; the name is kept unchanged so existing output
        locations do not move.
        '''
        cols = df.columns
        has_force = 'Force' in cols
        with open(f'{self.path}.xyz', 'w') as f:
            for _, row in df.iterrows():
                f.write('BEGIN_CFG\n')
                f.write(f'Size\n\t {row["Config. size"]}\n')
                if 'Lattice' in cols:
                    lat = row['Lattice']
                    # The keyword needs its own line: parse() reads the three
                    # lattice vectors from the three lines that follow it.
                    f.write('SuperCell\n')
                    for k in range(3):
                        f.write(f'\t \t {self._fields(lat[k][:3])}\n')
                f.write(f'AtomData: {"id":>10}{"type":>10}{"cartes_x":>10}{"cartes_y":>10}{"cartes_z":>10}')
                if has_force:
                    f.write(f'{"fx":>10}{"fy":>10}{"fz":>10}\n')
                    rows = zip(row['Atom'], row['Position'], row['Force'])
                    for number, (atom, pos, forces) in enumerate(rows, start=1):
                        values = list(pos[:3]) + list(forces[:3])
                        f.write(f'{number:>5} {atom:>5} {self._fields(values)}\n')
                else:
                    f.write('\n')
                    rows = zip(row['Atom'], row['Position'])
                    for number, (atom, pos) in enumerate(rows, start=1):
                        f.write(f'{number:>5} {atom:>5} {self._fields(pos[:3])}\n')
                f.write(f'Energy\n \t \t \t {row["Energy"]}\n')
                if 'Stress' in cols:
                    f.write(f'PlusStress {"xx":>10}{"yy":>10}{"zz":>10}{"yz":>10}{"xz":>10}{"xy":>10}\n')
                    # Separate the six components and end the line so that
                    # END_CFG starts on a line of its own.
                    f.write(f'{"":>10}{self._fields(row["Stress"][:6])}\n')
                f.write('END_CFG\n \n')


In [None]:
import dataloader as dl #requires dataloader.py obtained from https://github.com/aiqm/ANI1x_datasets/blob/master/dataloader.py

class Hdf5:
    '''Class for handling HDF-5 data format provided by ANI-1. Only energy and position implemented yet (coupled cluster data set).

    ``parse()`` relies on the module-level name ``dl`` (the ANI-1x
    ``dataloader`` module) being importable in the notebook.
    '''

    # Conversion factor applied by convert_ha_ev(): 1 Hartree expressed in eV.
    HARTREE_TO_EV = 27.211386245988

    # Columns of the DataFrame built by parse(), in output order.
    _COLUMNS = ('Configuration', 'Conformation', 'Config. size',
                'Energy', 'Atom', 'Position')

    def __init__(self, path:str, name:str, data_keys:list):
        '''Store the file location, a label and the data keys to extract.'''
        self.name = name
        self.path = path
        self.keys = data_keys # keys indicate which type of data set to extract, e.g. 'ccsd(t)_cbs.energy' for coupled cluster data

    def parse(self):
        '''Parse H5 file into self.data.

        Every bucket yielded by ``dl.iter_data_buckets`` is numbered as one
        ``Configuration`` (starting at 1); every (energy, coordinates) pair in
        it becomes one row, numbered by ``Conformation`` (starting at 1 per
        bucket). ``self.properties`` holds the keys of the last bucket seen.
        If no bucket is yielded, ``self.data`` is an empty DataFrame.

        Raises:
            ValueError: if a bucket has no key containing ``'.energy'``.
        '''
        frames = []

        buckets = dl.iter_data_buckets(self.path, keys=self.keys)
        for config, data in enumerate(buckets, start=1):
            self.properties = list(data.keys()) # contained properties e.g. energy, position, forces
            e_desc = next((key for key in self.properties if '.energy' in key), None)
            if e_desc is None:
                # A bare StopIteration here would be confusing for the caller.
                raise ValueError(
                    f"No '.energy' property in bucket {config} of {self.path}; "
                    f'available properties: {self.properties}')

            atoms = data['atomic_numbers']
            n_atoms = len(atoms)  # identical for all conformations of the bucket

            pairs = zip(data[e_desc], data['coordinates'])
            for conform, (energy, position) in enumerate(pairs, start=1):
                frames.append(pd.DataFrame({
                    'Configuration': config,
                    'Conformation': conform,
                    'Config. size': n_atoms,
                    'Energy': energy,
                    'Atom': [atoms],
                    'Position': [position]
                    }))

        if frames:
            self.data = pd.concat(frames)
        else:
            self.data = pd.DataFrame(columns=list(self._COLUMNS))

    def convert_ha_ev(self):
        '''Convert energy values from Hartree to eV.

        Rescales ``self.data['Energy']`` in place; calling it again rescales
        the already converted values a second time.
        '''
        self.data['Energy'] = self.data['Energy'] * self.HARTREE_TO_EV

In [3]:
import ase.io as io

class Xyz:
    '''Class for handling extended-xyz data format utilized by MACE.'''
    def __init__(self, folder:str, name:str, assignment:dict=None):
        '''Store the output location and the atom-type assignment.

        Args:
            folder: directory the file is written to.
            name: file name without the ``.xyz`` suffix.
            assignment: dictionary with number to atom species assignment.
                Defaults to a fresh empty dict per instance.
        '''
        self.name = name
        # A literal {} default would be one dict shared by all instances.
        self.assign = {} if assignment is None else assignment
        self.path = f'{folder}/{name}'

    def unpack(self, lst):
        '''Unpack list (2 levels) and turn into string'''
        return ' '.join(str(value) for entry in lst for value in entry)

    def type_to_symbol(self, no: int):
        '''Method for turning atom type numbers to atomic symbols.

        Raises:
            KeyError: if ``no`` is not a key of ``self.assign``.
        '''
        try:
            return self.assign[no]
        except KeyError:
            raise KeyError(f'Type {no} not contained within self.assign.') from None

    def write(self, data:pd.DataFrame):
        '''Write configuration data to extended XYZ file for use in MACE.

        One frame is written per row of ``data``. Required columns are
        ``Config. size``, ``Energy``, ``Lattice``, ``Atom`` and ``Position``;
        ``Force`` is optional and adds three force columns per atom.

        If any atom type is missing from ``self.assign`` a message is printed
        and nothing is written (the output file is not created or truncated).
        '''
        has_force = 'Force' in data.columns

        # Validate before opening the file so a failure cannot leave a
        # half-written output behind.
        for atoms in data['Atom']:
            for species in atoms:
                if species not in self.assign:
                    print(f'Type {species} not contained within self.assign.')
                    return

        if has_force:
            properties = 'species:S:1:pos:R:3:forces:R:3'
            width = 12
        else:
            properties = 'species:S:1:pos:R:3'
            width = 24

        with open(f'{self.path}.xyz', 'w') as f:
            # iterrows() yields each row as a Series whose cells are the plain
            # scalars / lists stored in the frame.
            for _, row in data.iterrows():
                f.write(f"{int(row['Config. size'])}\n")
                f.write(f'Energy={float(row["Energy"])} Lattice=\"{self.unpack(row["Lattice"])}\" Properties={properties}\n')
                if has_force:
                    per_atom = zip(row['Atom'], row['Position'], row['Force'])
                    atom_rows = ((species, list(position[:3]) + list(force[:3]))
                                 for species, position, force in per_atom)
                else:
                    per_atom = zip(row['Atom'], row['Position'])
                    atom_rows = ((species, list(position[:3]))
                                 for species, position in per_atom)
                for species, values in atom_rows:
                    # One space between fields keeps columns apart even when a
                    # value is wider than its field.
                    fields = ' '.join(f'{value:>{width}}' for value in values)
                    f.write(f'{self.type_to_symbol(species)} {fields}\n')

In [4]:
# Atom type number -> chemical symbol; handed to Xyz as its assignment.
atom_types = dict(enumerate(['Ta', 'V', 'Cr', 'W']))

In [20]:
# Smoke test: read and parse one individual CFG file.
cfg = Cfg(
    file_name='test',
    path='/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/Dataset/cfg/in_distribution_splits/4comp.cfg_train_0',
)
cfg.read()
cfg.parse()

In [5]:
# Collect the paths of all CFG files below the project's data folder.
import os
folder_path='/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/Dataset/cfg'

# Walk the tree once and keep every path that contains '.cfg'.
files = [
    candidate
    for dirpath, _dirnames, filenames in os.walk(folder_path)
    for candidate in (f'{dirpath}/{filename}' for filename in filenames)
    if '.cfg' in candidate
]

In [6]:
# Only keep CFG files whose path contains one of the given substrings.
subset=['deformed_']

# Collect the matches into one flat list before building the array. An array
# built from one list per pattern is ragged as soon as two patterns match a
# different number of files, and then cannot be flattened into a 1-D array of
# paths. A file matching several patterns is listed once per pattern.
files_train = np.array([path for pattern in subset for path in files if pattern in path])

In [7]:
# Parse every selected CFG file and stack the per-file tables into one DataFrame.
data_lst = []
for file in files_train:
    # The file name (last path component) doubles as the label of the data set.
    cfg = Cfg(file_name=file.rsplit('/', 1)[-1], path=file)
    cfg.read()
    cfg.parse()
    data_lst += [cfg.data]

data = pd.concat(data_lst)

In [8]:
# Display the combined table built from all parsed CFG files.
data

Unnamed: 0,Name,Lattice,Configuration,Config. size,Energy,Atom,Position,Force
0,deformed_TaW.cfg,"[[18.654200, 0.000000, 0.000000], [0.000000, 1...",1,432,-5358.871179,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[7.772610, 17.099700, 18.146600], [10.881600,...","[[-0.000957, -0.002846, 0.000488], [0.000502, ..."
0,deformed_TaCr.cfg,"[[18.507600, 0.000000, 0.000000], [0.000000, 1...",1,432,-4618.564247,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[16.965300, 16.965300, 13.971400], [13.880700...","[[0.001056, 0.000972, 0.000515], [-0.000195, -..."
0,deformed_TaV.cfg,"[[18.437400, 0.000000, 0.000000], [0.000000, 1...",1,432,-4465.183401,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[16.900900, 16.900900, 14.718900], [13.828000...","[[0.001118, 0.000897, 0.000271], [-0.000488, 0..."
0,deformed_VW.cfg,"[[18.507500, 0.000000, 0.000000], [0.000000, 1...",1,432,-4764.441315,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.000000, 0.000000, 18.450700], [12.338300, ...","[[-0.000086, -0.000445, -0.000754], [0.000175,..."
0,deformed_CrW.cfg,"[[18.437400, 0.000000, 0.000000], [0.000000, 1...",1,432,-4836.233374,"[2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[[0.000000, 0.000000, 17.865100], [12.291600, ...","[[0.000181, 0.000489, -0.000833], [0.000617, 0..."
0,deformed_VCr.cfg,"[[18.660200, 0.000000, 0.000000], [0.000000, 1...",1,432,-3971.51625,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.000000, 0.000000, 0.400200], [12.440200, 1...","[[-0.001625, 0.000955, 0.000323], [-0.000115, ..."


In [31]:
# Total number of configurations: the highest configuration counter of every
# file, summed over all files. The aggregation is restricted to numeric columns
# explicitly; a column label passed positionally to max() is taken as the
# `numeric_only` flag, not as a column selection.
data.groupby(by='Name').max(numeric_only=True).sum()['Configuration']

6.0

In [32]:
# Write all configurations into one combined xyz file.
xyz = Xyz(
    folder='/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/',
    name='deformed',
    assignment=atom_types,
)
xyz.write(data=data)

In [18]:
# Produce a separate xyz file for every source file name in the table.
for name in data['Name'].unique():
    subset = data[data['Name'] == name]  # rows that came from this file
    xyz = Xyz(
        folder='/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/mace',
        name=name,
        assignment=atom_types,
    )
    xyz.write(data=subset)

In [36]:
# Load the validation split; its 432-atom configurations are used for MD computation cost testing.
path='/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/Dataset/cfg/in_distribution_splits/4comp.cfg_valid_0'
cfg = Cfg(file_name='4comp_0k', path=path)
cfg.read()
cfg.parse()

In [46]:
# Writer for the configurations loaded above; the output file is named after cfg.name.
xyz = Xyz(
    folder='/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj',
    name=cfg.name,
    assignment=atom_types,
)

In [47]:
# Display the table parsed from the CFG file loaded into `cfg`.
cfg.data

Unnamed: 0,Name,Lattice,Configuration,Config. size,Energy,Atom,Position,Force
0,4comp_0k,"[[12.417093, -0.000002, 0.000002], [-0.000002,...",1,128,-1383.026392,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.000020, 12.417090, 0.193190], [3.104330, 0...","[[0.003742, -0.001385, -0.000639], [-0.003647,..."
0,4comp_0k,"[[12.372950, -0.091678, 0.000003], [-0.091678,...",2,128,-1377.955951,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.000090, 0.000000, 0.004660], [6.117710, 9....","[[-0.001207, 0.001275, 0.001526], [0.000403, -..."
0,4comp_0k,"[[12.385524, 0.023617, -0.005930], [0.023442, ...",3,128,-1380.740406,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[10.826210, 7.769670, 1.577610], [3.073840, 3...","[[0.002529, -0.000909, -0.004608], [-0.008655,..."
0,4comp_0k,"[[12.347574, -0.017404, 0.018104], [-0.017464,...",4,128,-1377.910324,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[0.053940, 0.034310, 12.388140], [6.176370, 9...","[[0.000534, 0.001173, 0.002910], [0.001066, 0...."
0,4comp_0k,"[[12.415658, 0.000000, 0.000002], [0.000000, 1...",5,128,-1383.025328,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[3.103790, 6.207770, 6.366180], [6.207870, 3....","[[-0.001545, 0.002482, -0.001277], [0.000657, ..."
...,...,...,...,...,...,...,...,...
0,4comp_0k,"[[19.000019, 0.000000, 0.000000], [0.000000, 1...",114,432,-4605.767404,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[[9.500010, 9.500010, 6.333340], [6.333340, 9....","[[-0.510694, 0.122917, 0.072949], [0.003821, 0..."
0,4comp_0k,"[[18.578200, 0.007106, -0.044461], [0.007112, ...",115,432,-4656.265863,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[18.529100, 18.567900, 18.384600], [7.794420,...","[[-0.006446, 0.054703, 0.172828], [-0.029112, ..."
0,4comp_0k,"[[18.583700, 0.000034, -0.005524], [0.000031, ...",116,432,-4657.052548,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[13.967600, 10.803300, 1.495280], [7.721730, ...","[[-0.046911, 0.231170, 0.199007], [-0.052442, ..."
0,4comp_0k,"[[18.509500, -0.035271, 0.001863], [-0.035275,...",117,432,-4641.428585,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[[10.770700, 7.703330, 7.987260], [4.639520, 7...","[[0.111748, -0.064748, -0.008068], [0.034885, ..."


In [48]:
# Keep only the configurations that contain exactly 432 atoms.
is_432_atoms = cfg.data['Config. size'] == 432
configs_432 = cfg.data.loc[is_432_atoms]

In [49]:
# Write the selected 432-atom configurations to the xyz file of the `xyz` writer.
xyz.write(configs_432)