In [1]:
from pymatgen.io import vasp

class Job:
    '''Results of a single VASP ab-initio MD run.

    On construction the files vasprun.xml and POSCAR are read from the job folder
    and kept on the instance as `run` and `poscar`.
    '''
    def __init__(self, name: str, path_job: str):
        # identifier of the job and the folder holding POSCAR & vasprun.xml
        self.name, self.path = name, path_job

        vasprun_file = f'{self.path}/vasprun.xml'
        poscar_file = f'{self.path}/POSCAR'

        # vasprun.xml is parsed first, then POSCAR
        self.run = vasp.Vasprun(vasprun_file)
        self.poscar = vasp.Poscar.from_file(poscar_file)

        print(f'{self.name} job instance started')

In [2]:
import pandas as pd
import numpy as np

class ML_AB:
    '''Collects the results of one or more Job instances and writes them to `ML_AB.txt`.

    Typical use: call `get_header` once with the first job, `update_header` for every
    further job, `get_step` + `save_steps` for every MD step, and finally
    `write_header` followed by `write_step` for each saved step.

    Attributes set by the methods:
        path:   folder in which `ML_AB.txt` is written.
        jobs:   names of all jobs registered so far.
        steps:  list of one-row data frames, one per saved MD step.
        header: one-row data frame with the header entries (set by `get_header`).
        step:   data frame of the most recently built MD step (set by `get_step`).
    '''

    # The Job annotations are quoted so that defining this class does not require
    # the Job class to be in scope already.
    def __init__(self, job: 'Job', save_path: str):
        '''Register the first job by name and remember the output folder.'''
        self.path = save_path
        self.jobs = [job.name]
        self.steps = []

    def get_header(self, job: 'Job'):
        '''Build the one-row header data frame from the POSCAR and vasprun data of `job`.'''
        site_symbols = job.poscar.site_symbols
        natoms = job.poscar.natoms
        self.header = pd.DataFrame({
            'The number of configurations': len(job.run.structures),
            'The maximum number of atom type': len(site_symbols),
            'The atom types in the data file': [site_symbols],
            'The maximum number of atoms per system': sum(natoms),
            'The maximum number of atoms per atom type': max(natoms),
            'Reference atomic energy (eV)': [len(site_symbols) * ['0.000000000000000E+000']],
            'Atomic mass': [job.run.parameters['POMASS']],
            'The numbers of basis sets per atom type': [len(site_symbols) * ['1']],
        })

        # add placeholder for basis sets per atom
        for atom_type in self.header['The atom types in the data file'].to_list()[0]:
            self.header[f'Basis set for {atom_type}'] = [['1', '1']]

    def update_header(self, job: 'Job'):
        '''Update header data with new job.

        Adds the number of structures of `job` to the configuration count and records
        the job name. A mismatch in atom types is only reported, not handled.
        '''
        if self.header['The atom types in the data file'].to_list() == [job.poscar.site_symbols]:
            print('Same atoms types')
        else:
            print('Different atom types: needs adjustment to code!')

        self.header['The number of configurations'] += len(job.run.structures)
        self.jobs.append(job.name)

    def get_atoms_type_no(self, job: 'Job'):
        '''Return `[[element, number_of_atoms], ...]` in POSCAR order.'''
        return [[element, count]
                for element, count in zip(job.poscar.site_symbols, job.poscar.natoms)]

    def get_step(self, job: 'Job', config: int, step: int):
        '''Produces data frame with entries for an individual MD step.

        Args:
            job:    job the step is taken from.
            config: configuration number stored for this step (written as config + 1).
            step:   index of the ionic step / structure within `job.run`.

        The result is stored in `self.step`; call `save_steps` to keep it.
        '''
        structure = job.run.structures[step]
        ionic_step = job.run.ionic_steps[step]
        stress = ionic_step['stress']
        self.step = pd.DataFrame({
            'Configuration num.': config,
            'System name': job.poscar.comment,
            'The number of atom types': self.header['The maximum number of atom type'],
            'The number of atoms': self.header['The maximum number of atoms per system'],
            'Atom types and atom numbers': [self.get_atoms_type_no(job)],
            # public accessor instead of the private `_matrix` attribute
            'Primitive lattice vectors (ang.)': [structure.lattice.matrix],
            'Atomic positions (ang.)': [structure.cart_coords],
            'Total energy (eV)': ionic_step['e_0_energy'],
            'Forces (eV ang.^-1)': [ionic_step['forces']],
            'Stress (kbar)': '',
            'XX YY ZZ': [[stress[0][0], stress[1][1], stress[2][2]]],
            'XY YZ ZX': [[stress[1][0], stress[1][2], stress[0][2]]],
        })

    def save_steps(self):
        '''Append the most recently built step (`self.step`) to `self.steps`.'''
        self.steps.append(self.step)

    @staticmethod
    def _as_entries(value):
        '''Return `value` as a list of entries, or None if it is a single value.'''
        if isinstance(value, str):
            # a string is one value, not a sequence of characters
            return None
        try:
            return list(value)
        except TypeError:
            # plain numbers are not iterable
            return None

    def write_header(self, entries_per_line: int = 3):
        '''Create file and write initial headers.

        Every header column becomes one section. List-valued entries are written
        `entries_per_line` values per line; single values are written on one line.
        An existing `ML_AB.txt` in `self.path` is overwritten.
        '''
        with open(f'{self.path}/ML_AB.txt', 'w') as f:
            f.write(' 1.0 Version\n')
            for parameter in self.header.columns:
                f.write('**************************************************\n')
                f.write(f'     {parameter}\n')
                f.write('--------------------------------------------------\n')

                entries = self._as_entries(self.header[parameter].iloc[0])
                if entries is None:
                    f.write(f'     {self.header[parameter].to_string(index=False)}\n')
                    continue

                # fixed-size chunks so that every line holds the same number of entries;
                # an empty list still produces one (empty) line
                rows = [entries[k:k + entries_per_line]
                        for k in range(0, len(entries), entries_per_line)] or [[]]
                for row in rows:
                    f.write('     ' + ''.join(f'{entry} ' for entry in row) + '\n')

    def write_step(self, data: pd.DataFrame, ctifor: str = '10.000000000000000E-002'):
        '''Append step entries to file.

        Args:
            data:   one-row data frame as produced by `get_step`.
            ctifor: text written verbatim into the CTIFOR section.

        The configuration number is written as the stored number plus one.
        '''
        # first (and only) row of each column, selected by position
        def first(column):
            return data[column].iloc[0]

        with open(f'{self.path}/ML_AB.txt', 'a') as f:
            f.write('**************************************************\n')
            f.write(f'     Configuration num.\t{int(first("Configuration num.")) + 1}\n')

            f.write('==================================================\n')
            f.write('     System name\n')
            f.write('--------------------------------------------------\n')
            f.write(f'     {data["System name"].to_string(index=False)}\n')

            f.write('==================================================\n')
            f.write('     The number of atom types\n')
            f.write('--------------------------------------------------\n')
            f.write(f'       {int(first("The number of atom types"))}\n')

            f.write('==================================================\n')
            f.write('     The number of atoms\n')
            f.write('--------------------------------------------------\n')
            f.write(f'         {int(first("The number of atoms"))}\n')

            f.write('**************************************************\n')
            f.write('     Atom types and atom numbers\n')
            f.write('--------------------------------------------------\n')
            for atom, no in first('Atom types and atom numbers'):
                f.write(f'     {atom}     {no}\n')

            f.write('==================================================\n')
            f.write('     CTIFOR\n')
            f.write('--------------------------------------------------\n')
            f.write(f'         {ctifor}\n')

            f.write('==================================================\n')
            f.write('     Primitive lattice vectors (ang.)\n')
            f.write('--------------------------------------------------\n')
            for line in first('Primitive lattice vectors (ang.)'):
                f.write(f'   {line[0]}      {line[1]}      {line[2]}\n')

            f.write('==================================================\n')
            f.write('     Atomic positions (ang.)\n')
            f.write('--------------------------------------------------\n')
            for line in first('Atomic positions (ang.)'):
                f.write(f'   {line[0]}      {line[1]}      {line[2]}\n')

            f.write('==================================================\n')
            f.write('     Total energy (eV)\n')
            f.write('--------------------------------------------------\n')
            f.write(f'   {first("Total energy (eV)")}\n')

            f.write('==================================================\n')
            f.write('     Forces (eV ang.^-1)\n')
            f.write('--------------------------------------------------\n')
            for line in first('Forces (eV ang.^-1)'):
                f.write(f'   {line[0]}      {line[1]}      {line[2]}\n')

            f.write('==================================================\n')
            f.write('     Stress (kbar)\n')
            f.write('--------------------------------------------------\n')
            f.write('     XX YY ZZ\n')
            f.write('--------------------------------------------------\n')
            for line in data['XX YY ZZ']:
                f.write(f'   {line[0]}\t{line[1]}\t{line[2]}\n')
            f.write('--------------------------------------------------\n')
            f.write('     XY YZ ZX\n')
            f.write('--------------------------------------------------\n')
            for line in data['XY YZ ZX']:
                f.write(f'   {line[0]}      {line[1]}      {line[2]}\n')

In [3]:
import os

# Folder that contains one sub-folder per VASP MD job.
# Set the environment variable ML_AB_MASTER_DIR to run the notebook on another machine;
# the default below is the original location.
master_dir = os.environ.get(
    'ML_AB_MASTER_DIR',
    '/Users/dominicwelti/Library/CloudStorage/Dropbox/Master_Thesis/Compare_ML/Ref_VASP_MD/',
)

In [4]:
# collect the names of all entries in master_dir that start with 'v', in sorted order
import os
jobs = sorted(entry for entry in os.listdir(master_dir) if entry.startswith('v'))

In [5]:
# load and save data
import os

# The first job provides the header; its parsed data is reused in the loop below
# so that its vasprun.xml is not parsed a second time.
job_init = Job(jobs[0], os.path.join(master_dir, jobs[0]))
ml_ab = ML_AB(job_init, master_dir)
ml_ab.get_header(job_init)
no_config_prev = 0

for i, job_name in enumerate(jobs):
    job = job_init if i == 0 else Job(job_name, os.path.join(master_dir, job_name))

    #update no. of configurations
    if i > 0:
        # .iloc[0] extracts the scalar; int() on a whole Series is deprecated in pandas
        no_config_prev = int(ml_ab.header['The number of configurations'].iloc[0])
        ml_ab.update_header(job)

    no_config_total = int(ml_ab.header['The number of configurations'].iloc[0])

    # iterate over the configurations added by this MD run;
    # md_step is the step index within the individual run
    for md_step, config in enumerate(range(no_config_prev, no_config_total)):
        ml_ab.get_step(job, config, md_step)
        ml_ab.save_steps()



v085 job instance started




v085 job instance started




v090 job instance started
Same atoms types




v095 job instance started
Same atoms types




v100 job instance started
Same atoms types




v105 job instance started
Same atoms types




v110 job instance started
Same atoms types




v115 job instance started
Same atoms types




v120 job instance started
Same atoms types




v125 job instance started
Same atoms types


In [7]:
def decimate(ml_ab: 'ML_AB'):
    '''Remove 9/10 of all configurations to reduce computational demand during retraining.

    A step is kept when its stored 'Configuration num.' leaves remainder 1 when divided
    by 10 (1, 11, 21, ...). Kept steps are renumbered consecutively starting at 0 and the
    header entry 'The number of configurations' is set to the number of kept steps.

    The object is modified in place and also returned. Pass a copy if the full data set
    is still needed afterwards.
    '''
    kept = []
    # Build a new list instead of popping while iterating: removing elements from the
    # list that is being looped over skips entries.
    for frame in ml_ab.steps:
        if int(frame['Configuration num.'].iloc[0]) % 10 == 1:
            frame['Configuration num.'] = len(kept)
            kept.append(frame)

    # slice assignment keeps the identity of the steps list
    ml_ab.steps[:] = kept

    # update total number of configurations
    ml_ab.header['The number of configurations'] = len(kept)

    return ml_ab


In [1]:
import copy

# Keep the full data set under ml_ab_og and decimate a deep copy of it.
# decimate() modifies the object it receives, so passing ml_ab_og directly would
# leave both names pointing at the same, already decimated object.
ml_ab_og = ml_ab
ml_ab = decimate(copy.deepcopy(ml_ab_og))

NameError: name 'ml_ab' is not defined

In [17]:
#write header
# Creates (or overwrites) ML_AB.txt in ml_ab.path and writes the header sections.
# Run this before writing steps: write_step opens the same file in append mode.
ml_ab.write_header()

In [18]:
# append every stored step (configuration) to the file, one write_step call per step
ctifor_text = '8.000000000000000E-002'
for step in ml_ab.steps:
    ml_ab.write_step(step, ctifor_text)

In [22]:
# Inspect the lattice vectors stored for a single saved step (list index 4001).
# NOTE(review): the index is hard-coded and requires ml_ab.steps to hold more than 4001 entries.
ml_ab.steps[4001]['Primitive lattice vectors (ang.)'].to_list()

[array([[9.22120276, 0.        , 0.        ],
        [0.        , 9.22120276, 0.        ],
        [0.        , 0.        , 9.22120276]])]