In [82]:
import numpy as np
import pandas as pd

class Cfg:
    '''Reader for a CFG text file made of blocks between BEGIN_CFG and END_CFG lines.

    Usage: create the object, call read() to load the raw lines, then parse()
    to build self.data (one DataFrame row per configuration block).
    '''

    # Column order of self.data; also used for the empty frame that parse()
    # produces when the file contains no complete configuration.
    COLUMNS = ['Name', 'Lattice', 'Configuration', 'Config. size',
               'Energy', 'Atom', 'Position', 'Force']

    def __init__(self, file_name:str, path:str, atom_types:dict=None):
        '''Store the label, the file location and an optional type mapping.

        file_name:  label written to the 'Name' column of every parsed row
        path:       location of the file opened by read()
        atom_types: optional dictionary with number to atom species assignment,
                    kept on self.atom_types (empty dict when omitted)
        '''
        self.name = file_name
        self.path = path
        self.atom_types = {} if atom_types is None else atom_types

    def read(self):
        '''Read file and save lines in list self.lines.

        Trailing whitespace (including the newline) is removed from every line.
        The number of lines containing BEGIN_CFG is stored in self.no_configs.
        '''
        with open(self.path, 'r') as file:
            self.lines = [line.rstrip() for line in file]
        # Same substring test as parse(), so both agree on what starts a block.
        self.no_configs = sum('BEGIN_CFG' in line for line in self.lines)

    def parse(self):
        '''Parse previously read self.lines to DataFrame self.data.

        Each BEGIN_CFG ... END_CFG block becomes one row with the columns listed
        in COLUMNS. Lattice, position and force entries are kept as the string
        tokens found in the file; atom types are converted to int and the energy
        to float. If no complete block is found, self.data is an empty frame
        with the same columns.

        Raises ValueError if an 'AtomData' entry appears before the 'Size' entry
        of the same block, because the number of atom lines is then unknown.
        '''
        records = []
        record = None  # row being filled; None while outside a BEGIN_CFG/END_CFG block
        size = None    # atom count announced by the 'Size' entry of the current block
        config = 0

        for i, line in enumerate(self.lines):
            if 'BEGIN_CFG' in line:
                config += 1
                size = None  # never reuse the atom count of the previous block
                record = {
                    'Name': self.name,
                    'Lattice': [],
                    'Configuration': config,
                    'Config. size': 0,
                    'Energy': 0,
                    'Atom': [],
                    'Position': [],
                    'Force': []
                }
            elif record is None:
                # Keyword lines outside of a block have no row to belong to.
                continue
            elif 'Size' in line:
                size = int(self.lines[i+1])  # number of atoms in config
                record['Config. size'] = size
            elif 'supercell' in line.lower():
                # Case-insensitive so that 'SuperCell' and 'Supercell' both match.
                # The three following lines hold one lattice vector each.
                record['Lattice'] = [self.lines[i+offset].split() for offset in (1, 2, 3)]
            elif 'AtomData' in line:
                if size is None:
                    raise ValueError(
                        f"{self.name}: 'AtomData' found before 'Size' in configuration {config}"
                    )
                atoms, pos, force = [], [], []
                for entry in range(1, size+1):
                    # split() without argument splits on any whitespace (spaces and tabs)
                    values = self.lines[i+entry].split()
                    atoms.append(int(values[1]))
                    pos.append(values[2:5])
                    force.append(values[5:8])
                record['Atom'] = atoms
                record['Position'] = pos
                record['Force'] = force
            elif 'Energy' in line:
                record['Energy'] = float(self.lines[i+1])
            elif 'END_CFG' in line:
                records.append(record)
                record = None

        if not records:
            self.data = pd.DataFrame(columns=self.COLUMNS)
            return

        # Build the frame once from all rows. Every row keeps index label 0, which
        # is what concatenating one-row frames produced before; code that looks up
        # a single selected row by label 0 keeps working.
        self.data = pd.DataFrame(records, columns=self.COLUMNS, index=[0] * len(records))

In [145]:
class Xyz:
    '''Writer that dumps configuration rows to an extended XYZ file at <folder>/<name>.xyz.'''

    def __init__(self, folder:str, name:str, assignment:dict=None):
        '''Remember the output location and the type-to-symbol mapping.

        folder:     directory part of the output path
        name:       file name without the '.xyz' suffix
        assignment: dictionary with number to atom species assignment
        '''
        self.name = name
        # None instead of a literal {} default: a mutable default would be
        # shared between all instances created without an assignment.
        self.assign = {} if assignment is None else assignment
        self.path = f'{folder}/{name}'

    @staticmethod
    def unpack(lst):
        '''Unpack list (2 levels) and turn into string

        All inner entries are joined by single spaces, e.g.
        [['1', '2'], ['3']] -> '1 2 3'. Entries are passed through str() first.
        '''
        return ' '.join(' '.join(map(str, entry)) for entry in lst)

    def type_to_symbol(self, no: int):
        '''Method for turning atom type numbers to atomic symbols.

        Raises KeyError if no symbol is assigned to the given type number.
        '''
        try:
            return self.assign[no]
        except KeyError:
            raise KeyError(f'no atomic symbol assigned to atom type {no!r}') from None

    def write(self, data:pd.DataFrame):
        '''Write configuration data to extended XYZ file for use in MACE

        data needs the columns 'Config. size', 'Energy', 'Lattice', 'Atom',
        'Position' and 'Force'. One XYZ frame is written per row: the atom
        count, a comment line with energy, lattice and the property layout,
        then one line per atom with symbol, position and force.
        '''
        with open(f'{self.path}.xyz', 'w') as f:
            # Iterate the rows directly: no per-row query and no dependence on
            # the index labels of data.
            for _, row in data.iterrows():
                f.write(f"{int(row['Config. size'])}\n")
                f.write(f'Energy={float(row["Energy"])} Lattice="{self.unpack(row["Lattice"])}" Properties=species:S:1:pos:R:3:forces:R:3\n')
                for species, position, force in zip(row['Atom'], row['Position'], row['Force']):
                    values = (position[0], position[1], position[2], force[0], force[1], force[2])
                    # Explicit separating spaces keep the columns apart even when
                    # a value is 12 or more characters wide.
                    fields = ' '.join(f'{value:>12}' for value in values)
                    f.write(f'{self.type_to_symbol(species)} {fields}\n')
        

In [140]:
# Atom type number (as used in the CFG files) -> atomic symbol
atom_types = {
    0: 'Ta',
    1: 'V',
    2: 'Cr',
    3: 'W',
}

In [155]:
# Smoke test on a single CFG file: read it, parse it and preview the result.
# Cfg takes the label and the path only; the type-to-symbol conversion is a
# method of Xyz (and needs a type number), so it is not called on cfg here.
cfg = Cfg('test', '/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/Dataset/cfg/in_distribution_splits/4comp.cfg_train_0')
cfg.read()
cfg.parse()
cfg.data.head()

In [3]:
#retrieve paths to CFG files in project
import os
folder_path='/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/Dataset/cfg'

def find_cfg_files(root, marker='.cfg'):
    '''Return the sorted paths of all files below root whose file name contains marker.

    root:   directory that is walked recursively
    marker: substring a file name must contain to be selected

    The marker is tested against the file name only, so a directory whose name
    happens to contain it does not pull in unrelated files. Paths are built with
    '/' as separator. The result is sorted because os.walk gives no guarantee on
    ordering, and a stable order keeps re-runs reproducible.
    '''
    found = []
    for dirpath, _dirnames, filenames in os.walk(root):
        found.extend(f'{dirpath}/{filename}' for filename in filenames if marker in filename)
    return sorted(found)

files = find_cfg_files(folder_path)

In [4]:
# keep only the CFG paths that contain the substring 'train_0'
files_train = [path for path in files if 'train_0' in path]

In [83]:
# retrieve data contained in relevant CFG files
data_lst = []
for file in files_train:
    # Cfg is built from a label (the bare file name) and the full path.
    cfg = Cfg(file.split('/')[-1], file)
    cfg.read()
    cfg.parse()
    data_lst.append(cfg.data)

# one combined frame with the rows of all parsed files
data = pd.concat(data_lst)

In [9]:
# retrieve number of configurations contained:
# highest configuration number per file name, summed over all file names.
# The column is selected explicitly; max() takes no column name as argument.
data.groupby('Name')['Configuration'].max().sum()

5373.0

In [None]:
# Scratch cell: wraps the 'Atom' column of cfg.data in a one-element list.
test = [cfg.data['Atom']]

In [None]:
# Translate the per-configuration lists of atom type numbers into atomic symbols.
# Each 'Atom' cell holds a list of type numbers, so the lists are mapped element
# by element. The result is a new Series; cfg.data itself is left untouched so
# that the numeric types remain available and the cell can be re-run safely.
atom_symbols = cfg.data['Atom'].map(
    lambda types: [atom_types[number] for number in types]
)
atom_symbols.head()

In [146]:
# writer for the extended XYZ output, using the type-to-symbol mapping atom_types
xyz = Xyz(
    folder='/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/',
    name='test',
    assignment=atom_types,
)

In [147]:
# write the configurations held in data to the file f'{xyz.path}.xyz'
xyz.write(data)