# Create DeepMD model

In [1]:
import dpdata
import numpy as np
from ase.io import read,write
from ase.atoms import Atoms
from ase.calculators.singlepoint import SinglePointCalculator
from deepmd.calculator import DP
import ase.io
from tqdm import tqdm
from os import makedirs

2022-10-20 10:18:04.304682: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0


Instructions for updating:
non-resource variables are not supported in the long term


  _bootstrap._exec(spec, module)


In [2]:
train_set = np.load('raw_data/CSD-10k_train_set.npy', allow_pickle=True)

In [107]:
val_set = np.load('raw_data/CSD-10k_val_set.npy', allow_pickle=True)

In [108]:
sett = "val_set" #match name of set above (train_set or val_set)
if sett == "train_set":
    path_to_sys = 'deepMD_data/train_systems/'
elif sett == "val_set":
    path_to_sys = 'deepMD_data/val_systems/'
else:
    print("Path not set!!")

In [109]:
crystals = []
for frm in eval(sett):
    splits = frm.info['name'].split('_')
    crystal = splits[0]
    crystals.append(crystal)
    frm.info['crystal'] = crystal
crystals = np.unique(crystals)

In [12]:
#determine number of crystals with frames < 50 atoms in them
i=0
for name in tqdm(crystals):
    bla = False
    for frm in train_set:
        if frm.info['crystal'] == name:
            if len(frm) < 50:
                bla = True
    if bla == True:
        i+=1
print(i)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2238/2238 [00:33<00:00, 67.30it/s]

414





In [17]:
dataset_deepmd = {}
for name in tqdm(crystals, leave=False):
    crystal_frames = []
    for i, d in enumerate(eval(sett)): # from davide's friend
        if d.info['crystal'] == name:
            energy = d.info['energy']
            forces = d.arrays['forces']

            atoms = Atoms(symbols=d.get_chemical_symbols(),
                          positions=d.positions,
                          masses=d.get_masses(),
                          cell=d.cell,
                          pbc=d.pbc)

            atoms.calc = SinglePointCalculator(atoms, energy = energy,
                                               forces = forces)

            crystal_frames.append(atoms)
    
    dataset_deepmd[name] = crystal_frames

                                                                                                                                                                                           

In [18]:
for name in tqdm(crystals, leave=False):
    ms = dpdata.LabeledSystem()
    for d in dataset_deepmd[name]:
        sys = dpdata.LabeledSystem(d, fmt='ase/structure')
        sys.apply_pbc()
        ms.append(sys)
    ms.apply_type_map(type_map=['H', 'C', 'N', 'O'])
    ms.to_deepmd_raw(path_to_sys+name)
    ms.to_deepmd_npy(path_to_sys+name)

                                                                                                                                                                                           

# Test model
Model was tested using the command `dp test -m potential.pb -s val_systems/ -n 1000 -d test.out`

In [104]:
#Import energies from output file
energies = np.zeros((1000,2), dtype=float)
with open('deepMD_data/test.e.out') as f:
    file = f.readlines()
j=0 #counter for number of energies imported
u=0 #counter of names imported
ilines = [] #helps determine number of predictions for each crystal
names = [] #get names, in correct order, of each crystal in predictions

for iline, line in enumerate(file):
    if line.startswith('#'): #ignore comment lines
        names.append(line.split(':')[0][14:])
        u+=1
        ilines.append(iline)
        final_nameline=iline
    else:
        energs = line[:-1].split(' ') #get energies from line
        energies[j] = [float(energs[0]), float(energs[1])] #append energies to array
        j+=1
    final_line = iline


#Determine the number of predictions for each crystal, in the correct order
structs_per_cryst=[] #determine number of structures per crystal
u=0 #counter
for ii, i in enumerate(ilines):
    try:
        structs_per_cryst.append(ilines[ii+1]-i-1)
        u+=1
    except:
        structs_per_cryst.append(final_line-final_nameline)
        u+=1
    

#Import forces from output file
forces = np.zeros((90975,6), dtype=float)

with open('deepMD_data/test.f.out') as f:
    file = f.readlines()
j=0 #counter for number of forces imported
for iline, line in enumerate(file):
    if line.startswith('#'): #ignore comment lines
        pass
    else:
        forcs = line[:-1].split(' ')
        forces[j] = [float(k) for k in forcs]
        j+=1

ABAQIG
1


In [114]:
len(names), len(structs_per_cryst)

(825, 825)

In [119]:
#Get per-atom energy errors for validation set
counter = 0
energy_err_pa = []
for iname, name in enumerate(names):
    found = False
    for frm in val_set:
        if frm.info['crystal'] == name and found==False:
            nat = len(frm)
            found=True
    n_preds = structs_per_cryst[iname]
    for pred in range(n_preds):
        energy_err_pa.append((energies[counter+pred, 0] - energies[counter+pred, 1])/nat)
    counter += n_preds

[-0.026422509182535775, 0.03292011632910885, -0.006870532335824464, 0.02635485950958759, 0.02459105901759055, 0.009350189542691411, 0.017198845283878147, 0.011543181275930478, 0.0036410295803815484, 0.016486640226149676, -0.006556702671089928, -0.004352975019961984, -0.00769626272111574, -0.007313486800583746, -0.0197986730630267, -0.0009023746142702294, 0.005847272524054436, 0.003954140510077629, -0.03397092737748822, -0.012484741454183835, 0.0017605365868533867, 0.0003453528230028706, 0.008117508721503934, 0.012220020612162711, 0.011909625399291392, 0.04042104483626948, -0.01728228728724985, -0.0012699689538067105, 0.025378208985994907, -0.022763472384643758, 0.04408434938021344, 0.04567973932249255, -0.018443395293946774, 0.03845082642376784, 0.0341258458845593, 0.007820730495422463, 0.0687600762135634, 0.021983330325785668, 0.012792721420942144, 0.011930157340839288, -0.005814020655844405, 0.013300185403068099, 0.014798332063553578, 0.002767923594896834, -0.011448019595036385, 0.00

In [128]:
rmse_forces=np.sqrt(np.mean(((forces[:, 0:3] - forces[:, 3:6])**2)))
rmse_energies=np.sqrt(np.mean([en**2 for en in energy_err_pa]))
mae_energies = np.mean([np.abs(er) for er in energy_err_pa])

In [129]:
mae_energies

0.01951631103284067

In [124]:
rmse_energies, rmse_forces

(0.027806060280943304, 0.3000284701468127)

In [75]:
for frm in val_set:
    print(frm.info['name'])

BABBUB_99
UNANED01_70
URICAC_96
WIKZUL_67
IJARAO_25
PHENAN14_35
YEKSOX_40
DUNVEN_79
LEPWEI
ICASOW_72
POVKIW_58
FEZHAV
METAMI_49
BELBUP01
NUVXIM02_47
BATWOK_11
CEKVIX_36
JOTSIW_25
WATFED_56
PUFFOL01
ROHBUL_55
QEHSUS_73
UTUVAI_100
ACRLAC04_69
BAZGOY05_96
FEFBEX01_99
PUVSOO_25
YOTYOW
KEPKIZ_27
MALNAC02_51
NUZHUL_45
TCTDTO20_25
MMXPDZ10_76
EDAJIE_34
FUTFIJ
WASJIL_69
NEXJUW_87
OKUXUP_30
VEGZOW_99
LONMAD_27
HONZIV_77
UREANT03_26
QIJWOV_53
VOFDEZ
BCBDNT
AMIHAH_49
BIJJIP_47
YIGTOY_77
BEWNAT01_94
PUVVUX_47
TESDOL01_89
SEPNUX_88
GIXDIA_96
VEZHOY_72
FEMFUY01_41
EQICEN_21
NUPBIL_12
UNUSED_14
PYRGAL03_92
FORTCH_37
CYAMPD01
ESIWUZ
HAVKOE_60
ZZZKAY02_91
FELCIK_70
KOPYIX_63
GODPUK_70
BAMXAR_56
DAZDOZ_100
RAFSID_35
DIALAC02_39
SIKKOO_24
AVEBUA_59
QUJTOE_89
FUNGUS_79
CEKVIX_69
AYAHUE_55
ECADIV_91
ZEFFEX_65
ARAGUV_68
EBUSIG_10
MNETAM01_41
EBUSIG_100
LILLEY_9
GEFBAV_83
POVKEQ_51
AHEYAO_55
BACXAE10_64
HEYQUY_73
MIXTCA_41
UJUKOZ_67
AQIKOA_78
FUTMUE
FIHHIP_76
REVMIP_45
LOLJAX
PHGLOL01_66
OXAZDO_36
JINYIP
IMI

In [73]:
rmse_forces

0.3000284701468127

# Scraps

In [74]:
ms['atom_names'], ms['atom_numbs'], ms['atom_types']

(['H', 'C', 'N', 'O'],
 [40, 24, 0, 4],
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]))

In [39]:
ms.sort_atom_names(type_map=['H', 'C', 'N', 'O'])

In [37]:
ms.add_atom_names(['N', 'O', 'C', 'H'])

In [52]:
ms.apply_type_map(type_map=['H', 'C', 'N', 'O'])

In [53]:
ms['atom_names']

['H', 'C', 'N', 'O']

In [54]:
ms['atom_names'], ms['atom_numbs'], ms['atom_types']

(['H', 'C', 'N', 'O'],
 [40, 24, 0, 4],
 array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0]))

In [18]:
ms.to_deepmd_raw('train_set-deepmd')
ms.to_deepmd_npy('train_set-deepmd')

MultiSystems (1907 systems containing 22618 frames)

In [None]:
from ase.io import read, write
from ase.atoms import Atoms
from ase.calculators.singlepoint import SinglePointCalculator
import dpdata
import numpy as np

dataset = read('gp_iter6_sparse9k.xml.xyz', index = ':')

dataset_deepmd = []
dataset_bart = []

for i, d in enumerate(dataset):
    if np.linalg.det(d.cell)>0:

        if 'dft_force' in d.arrays:
            lbl_f = 'dft_force'
        else:
            lbl_f = 'DFT_force'
        if 'dft_energy' in d.info:
            lbl_e = 'dft_energy'
        else:
            lbl_e = 'DFT_energy'
        if 'dft_virial' in d.info:
            lbl_v = 'dft_virial'
        elif 'DFT_virial' in d.info:
            lbl_v = 'DFT_virial'
        else:
            lbl_v = None

        atoms = Atoms(symbols=d.get_chemical_symbols(),
                      positions=d.positions,
                      masses=d.get_masses(),
                      cell=d.cell,
                      pbc=d.pbc)

        if lbl_v is None:
            atoms.calc = SinglePointCalculator(atoms, energy = d.info[lbl_e],
                                               forces = d.arrays[lbl_f])
        else:
            atoms.calc = SinglePointCalculator(atoms, energy = d.info[lbl_e],
                                               forces = d.arrays[lbl_f],
                                               stress = d.info[lbl_v])

        dataset_deepmd.append(atoms)

    else:
        print(i, d.get_number_of_atoms())

ms = dpdata.MultiSystems()
ms_vir = dpdata.MultiSystems()

for d in dataset_deepmd:
    sys = dpdata.LabeledSystem(d, fmt='ase/structure')
    sys.apply_pbc()
    if sys.has_virial():
        ms_vir.append(sys)
    else:
        ms.append(sys)

ms.to_deepmd_raw('dataset-deepmd-novirial')
ms.to_deepmd_npy('dataset-deepmd-novirial')
ms_vir.to_deepmd_raw('dataset-deepmd-yesvirial')
ms_vir.to_deepmd_npy('dataset-deepmd-yesvirial')

In [20]:
for frm in train_set:
    if frm.info['crystal'] == 'AMXPYZ':
        print(frm.get_chemical_formula())
    

C20H28N12O4
C20H28N12O4
C20H28N12O4
C20H28N12O4
C20H28N12O4
C20H28N12O4
C20H28N12O4
C20H28N12O4
C20H28N12O4
C20H28N12O4
C20H28N12O4


In [29]:
ase.io.extxyz.write_extxyz('deepMD_data/CSD-10k_train_set.extxyz', train_set)

In [30]:
crystals = np.unique(crystals)

In [32]:
for name in tqdm(crystals):
    crystal_frames = []
    for frm in train_set:
        if frm.info['crystal'] == name:
            crystal_frames.append(frm)

2238

In [73]:
train_set[10].arrays

{'numbers': array([8, 8, 8, 8, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 7, 7,
        7, 7, 6, 6, 6, 6, 8, 8, 8, 8, 7, 7, 7, 7, 6, 6, 6, 6, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1,
        1, 1, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 7, 7, 7, 7,
        7, 7, 7, 7, 6, 6, 6, 6, 6, 6, 6, 6, 1, 1, 1, 1, 1, 1, 1, 1]),
 'positions': array([[6.6537500e+00, 4.2678900e-02, 3.4200000e-02],
        [4.3770500e+00, 7.3837400e+00, 6.3705000e+00],
        [1.2275000e+00, 7.3407300e+00, 2.3500000e-02],
        [1.0001000e+01, 1.6541700e-02, 6.4194500e+00],
        [5.9854900e+00, 2.3628700e+00, 4.8724800e-01],
        [4.9904400e+00, 5.1223900e+00, 6.9842500e+00],
        [6.0893300e-01, 5.1045200e+00, 7.7067600e-01],
        [1.0653200e+01, 2.2979400e+00, 7.0508700e+00],
        [6.7298600e+00, 2.1682800e+00, 1.3271300e+00],
        [4.1560600e+00, 5.1383800e+00, 7.7002500e+00],
        [1.4676900e+00, 5.2331700e+00, 1.4089500e+00],
     

In [77]:
cryst = 'HAJMOU'
crystal_frames = []
for frm in train_set:
    if frm.info['crystal'] == cryst:
        crystal_frames.append(frm)
print(len(crystal_frames))
path = 'deepMD_data/all_systems/'+cryst+'/'
makedirs(path, exist_ok=True)
ase.io.extxyz.write_extxyz(path+'frames.xyz', crystal_frames)
ase.io.extxyz.write_extxyz(path+'one_frame.xyz', crystal_frames[2])

11


In [30]:
d_all = dpdata.MultiSystems.from_file('deepMD_data/CSD-10k_train_set.extxyz', fmt='ase/structure')

In [69]:
one_sys = dpdata.LabeledSystem('deepMD_data/all_systems/AMXPYZ/frames.xyz', fmt='ase/structure')

AttributeError: 'str' object has no attribute 'get_chemical_symbols'