In [11]:
import numpy as np
import pandas as pd

class Cfg:
    '''Reader for a text file holding one or more BEGIN_CFG configurations.

    Typical use is ``read()`` -> ``parse()`` -> ``type_to_symbol()``.

    Parameters
    ----------
    file_name : str
        Label written to the 'Name' column of every parsed row.
    path : str
        Location of the file on disk.
    atom_types : dict
        Mapping from numeric atom type to the value that type_to_symbol()
        writes into the 'Atom' column.

    Attributes
    ----------
    lines : list of str or None
        Right-stripped lines of the file; None until read() has run.
    no_configs : int
        Number of lines containing 'BEGIN_CFG'; 0 until read() has run.
    data : pandas.DataFrame or None
        One row per atom; None until parse() has run.
    '''

    def __init__(self, file_name:str, path:str, atom_types: dict):
        self.name = file_name
        self.path = path
        self.atom_types = atom_types
        # Filled in by read() / parse(); defined here so that calling the
        # methods out of order fails with a clear message.
        self.lines = None
        self.no_configs = 0
        self.data = None

    def read(self):
        '''Read file and save lines in list self.lines.

        Trailing whitespace is removed from every line. self.no_configs is
        set to the number of lines that contain 'BEGIN_CFG', the same test
        parse() uses to detect the start of a configuration.
        '''
        with open(self.path, 'r') as file:
            self.lines = [line.rstrip() for line in file]
        self.no_configs = sum('BEGIN_CFG' in line for line in self.lines)

    def type_to_symbol(self):
        '''Method for turning atom type numbers in .CFG file to atomic symbols.

        Every value of the 'Atom' column that is a key of self.atom_types is
        replaced by the mapped value; other values are left untouched, so
        calling the method twice is harmless.

        Raises
        ------
        RuntimeError
            If parse() has not been called yet.
        '''
        data = getattr(self, 'data', None)
        if data is None:
            raise RuntimeError('No parsed data: call parse() before type_to_symbol().')
        lookup = self.atom_types
        # Replace the whole column in one step instead of writing strings into
        # an integer column through .loc (a deprecated silent upcast).
        data['Atom'] = data['Atom'].map(lambda atom: lookup.get(atom, atom))

    def parse(self):
        '''Parse previously read lines to DataFrame self.data.

        The resulting frame has one row per atom with the columns

        * 'Name'          -- self.name
        * 'Configuration' -- 1-based index of the enclosing BEGIN_CFG block (int)
        * 'Energy'        -- value from the line after the 'Energy' keyword of
                             that configuration, NaN if the block is missing
        * 'Atom'          -- second column of the atom row (int)
        * 'Position'      -- columns 3-5 of the atom row as a list of floats
        * 'Force'         -- columns 6-8 of the atom row as a list of floats

        Raises
        ------
        RuntimeError
            If read() has not been called yet.
        ValueError
            If an 'AtomData' block has no preceding 'Size' block in the same
            configuration, or the file ends before all atom rows were read.
        '''
        lines = getattr(self, 'lines', None)
        if lines is None:
            raise RuntimeError('No lines loaded: call read() before parse().')

        config = 0
        size = None  # number of atoms in the current configuration
        configs, atom_types, positions, forces = [], [], [], []
        energy_by_config = {}

        for i, line in enumerate(lines):
            if 'BEGIN_CFG' in line:
                config += 1
                size = None  # never reuse the size of the previous configuration
            elif 'Size' in line:
                size = int(lines[i + 1])
            elif 'AtomData' in line:
                if size is None:
                    raise ValueError(
                        f"'AtomData' on line {i + 1} has no preceding 'Size' block."
                    )
                rows = lines[i + 1:i + 1 + size]
                if len(rows) < size:
                    raise ValueError(
                        f"Expected {size} atom rows after line {i + 1}, found {len(rows)}."
                    )
                for row in rows:
                    # Split on any run of whitespace so single spaces and tabs
                    # between columns are handled as well.
                    values = row.split()
                    atom_types.append(int(values[1]))
                    positions.append([float(v) for v in values[2:5]])
                    forces.append([float(v) for v in values[5:8]])
                    configs.append(config)
            elif 'Energy' in line:
                energy_by_config[config] = float(lines[i + 1])

        # Broadcast each configuration's energy to its atoms.
        energies = [energy_by_config.get(c, np.nan) for c in configs]

        self.data = pd.DataFrame({
            'Name': self.name,
            'Configuration': np.asarray(configs, dtype=np.int64),
            'Energy': np.asarray(energies, dtype=np.float64),
            'Atom': np.asarray(atom_types, dtype=np.int64),
            'Position': positions,
            'Force': forces
            })

In [12]:
# Numeric atom type -> symbol, consumed by Cfg.type_to_symbol().
atom_types = {
    0: 'Ta',
    1: 'V',
    2: 'Cr',
    3: 'W',
}

In [13]:
# Run the read -> parse -> symbol-conversion pipeline on a single file.
sample_path = '/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/Dataset/cfg/in_distribution_splits/4comp.cfg_train_0'
cfg = Cfg(file_name='test', path=sample_path, atom_types=atom_types)
cfg.read()            # load the raw lines into cfg.lines
cfg.parse()           # build the per-atom frame cfg.data
cfg.type_to_symbol()  # replace numeric atom types using atom_types

In [14]:
import os

folder_path = '/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/data_npj/Dataset/cfg'

# Collect every file below folder_path (recursively) as '<dirpath>/<filename>'.
candidates = []
for root, _subdirs, names in os.walk(folder_path):
    candidates.extend(f'{root}/{name}' for name in names)

# Keep only the paths that contain '.cfg' somewhere.
files = [path for path in candidates if '.cfg' in path]

In [15]:
# Subset of the discovered paths that contain 'train'.
files_train = [path for path in files if 'train' in path]

In [16]:
# Parse every training file and stack the per-atom frames into one DataFrame.
# The frames are collected first and concatenated once: calling pd.concat
# inside the loop re-copies all previously accumulated rows on every iteration.
frames = []
for file in files_train:
    cfg = Cfg(file.split('/')[-1], file, atom_types)  # 'Name' = last path component
    cfg.read()
    cfg.parse()
    cfg.type_to_symbol()
    frames.append(cfg.data)

# Most recently parsed file first and per-file indices kept, matching the row
# order and index produced by repeatedly prepending to a running frame.
# pd.concat raises on an empty list, hence the explicit empty-frame fallback.
data = pd.concat(frames[::-1]) if frames else pd.DataFrame()

In [18]:
# Number of distinct file names present in the combined frame.
data['Name'].nunique(dropna=False)

120