## Importing modules and defining functions

In [45]:
#Import modules
from matplotlib import pylab as plt
from tqdm import tqdm
import itertools

import pickle
import numpy as np
from sklearn import linear_model
import skcosmo.feature_selection
from skcosmo.sample_selection import PCovCUR, FPS
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.kernel_ridge import KernelRidge

import ase
from ase.io import read, write
from ase.build import make_supercell
from ase.visualize import view
import numpy as np
# If installed -- not essential, though
try:
    from tqdm.notebook import tqdm
except ImportError:
    tqdm = (lambda i, **kwargs: i)

from time import time

from rascal.models import KRR
from rascal.utils import dump_obj, load_obj

from rascal.models import Kernel, train_gap_model, compute_KNM
from rascal.representations import SphericalInvariants
from rascal.utils import from_dict, to_dict, CURFilter, FPSFilter, dump_obj, load_obj
from rascal.utils import get_score
from rascal.utils.io import load_json, dump_json


In [3]:
#Define functions
def do_fps(x, d=0):
    """
    Perform farthest-point sampling (FPS) on the rows of a feature matrix.

    Parameters
    ----------
    x : array-like, shape (n_samples, n_features)
        Feature matrix; each row is one feature vector.
    d : int, optional
        Number of farthest-point-sampled rows to select. If d == 0 (the
        default), the entire set is FPS-sorted.

    Returns
    -------
    iy : numpy.ndarray of int, shape (d,)
        FPS-sorted row indices. The selection always starts from row 0.
    dss : list of float, length max(d - 1, 0)
        FPS distances: after each new selection, the largest squared
        Euclidean distance from any row to its closest selected row.
    """
    x = np.asarray(x)
    if d == 0:
        d = len(x)
    iy = np.zeros(d, int)
    dss = []
    if d == 0:
        # Empty input: nothing to select (avoids an IndexError on iy[0] below)
        return iy, dss
    # faster evaluation of Euclidean distance: |a-b|^2 = |a|^2 + |b|^2 - 2 a.b
    n2 = np.sum(x**2, axis=1)
    iy[0] = 0
    # dl[j] holds the squared distance from row j to its closest selected row
    dl = n2 + n2[iy[0]] - 2 * np.dot(x, x[iy[0]])
    for i in range(1, d):
        iy[i] = np.argmax(dl)
        nd = n2 + n2[iy[i]] - 2 * np.dot(x, x[iy[i]])
        dl = np.minimum(dl, nd)
        # vectorised maximum instead of the Python built-in max over the array
        dss.append(dl.max())
    return iy, dss

def get_forces(frames):
    """
    Stack the per-atom 'forces' arrays of a sequence of frames into one array.

    Parameters
    ----------
    frames : sequence
        Frames supporting len(frm) (number of atoms) and
        frm.get_array('forces').

    Returns
    -------
    numpy.ndarray, shape (total number of atoms, 3)
        Forces of all frames, concatenated in frame order. An empty input
        gives an array of shape (0, 3).
    """
    # Built-in sum keeps the atom count an int even for an empty list;
    # np.sum([]) is the float 0.0, which np.zeros rejects as a dimension.
    n_atoms = sum(len(frm) for frm in frames)
    frc = np.zeros((n_atoms, 3))
    iat = 0
    for frm in frames:
        frc[iat:iat + len(frm)] = frm.get_array('forces')
        iat += len(frm)
    return frc

def get_features(atoms_objects, batch_size=1000):
    """
    Get one averaged feature vector per structure for a list of atoms objects.

    This requires the module-level `soap` to be set, as well as `all_species`,
    which is a list with one atoms object that contains all species HCNO.
    That frame is prepended to every batch and its per-atom features are
    discarded again.

    Note that every frame in atoms_objects is wrapped into its unit cell in
    place (frm.wrap(eps=1e-13)).

    Parameters
    ----------
    atoms_objects : list
        Frames to featurise; each must support len() and wrap().
    batch_size : int, optional
        Number of frames passed to soap.transform at once (default 1000).

    Returns
    -------
    numpy.ndarray
        Array of n_structures x n_features: the mean over the per-atom
        feature vectors of each frame. Empty if atoms_objects is empty.

    Raises
    ------
    ValueError
        If batch_size is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    struct_feat = []  # one averaged feature vector per structure
    # Stepping through the start indices gives an exact ceiling division; the
    # former int(len/batch_size + 0.9999) dropped the only batch whenever
    # len/batch_size was below 1e-4.
    for start in range(0, len(atoms_objects), batch_size):
        batch = atoms_objects[start:start + batch_size]
        for frm in batch:
            frm.wrap(eps=1e-13)  # wrap atoms in unit cell
        frames = all_species.copy()  # add initial frame with all species
        frames.extend(batch)  # extend initial frame with frames of batch
        manager = soap.transform(frames)  # calculate soap features for all structures in batch
        # drop the rows belonging to the initial all-species frame
        env_feat_batch = manager.get_features(soap)[len(all_species[0]):]
        atom_counter = 0  # first row of the current frame in env_feat_batch
        for frm in batch:
            nat = len(frm)  # number of atoms in frame
            # average the per-atom vectors belonging to this frame
            struct_feat.append(np.mean(env_feat_batch[atom_counter:atom_counter + nat], axis=0))
            atom_counter += nat
    return np.asarray(struct_feat)

In [4]:
#Import initial combined dataset (geop+100MD per crystal)
#NOTE: pickle.load can execute arbitrary code -- only load database files from a trusted source
with open("raw_data/CSD-10k_combined_w_kpts.pickle", "rb") as dictionary_file: #open saved pickle database; closed automatically
    db = pickle.load(dictionary_file)
names=list(db.keys()) #crystal names are the keys of the database

# FPS-Sort Configurations

In [21]:
#Define soap hyperparameters for FPS sorting configurations
hypers = {
    "soap_type": "PowerSpectrum",
    "interaction_cutoff": 4,  #cutoff distance in angstroms
    "max_radial": 6,  #no. of radial basis functions
    "max_angular": 4,  #no. of angular basis functions
    "gaussian_sigma_constant": 0.4,  #sigma width (i.e. amount of 'smearing')
    "gaussian_sigma_type": "Constant",
    "cutoff_function_type": "RadialScaling",
    "cutoff_smooth_width": 0.5,
    "cutoff_function_parameters": {"rate": 1, "scale": 2, "exponent": 2},
    "radial_basis": "GTO",
    "normalize": True,
    "optimization": {"Spline": {"accuracy": 1.0e-05}},
    "compute_gradients": False,  #don't care about forces right now
}
#Representation calculator built from the hyperparameters above
soap = SphericalInvariants(**hypers)


In [55]:
#FPS-sort initial training set to get 11 configurations per crystal
db_FPS = {}
#initialize at 0 so that first selection is always geop
selector = FPS(n_to_select=11, initialize=0)

for name in tqdm(names):
    crystal_frames = db[name]
    #one structure-averaged soap vector per configuration of this crystal
    crystal_feats = []
    for frm in crystal_frames:
        frm.wrap(eps=1e-13) #wraps the frame in place
        atom_feats = soap.transform(frm).get_features(soap)
        crystal_feats.append(np.mean(atom_feats, axis=0))
    selector.fit(crystal_feats)
    #keep only the configurations picked by the selector, in selection order
    db_FPS[name] = [crystal_frames[idx] for idx in selector.selected_idx_]

  0%|          | 0/2238 [00:00<?, ?it/s]

In [66]:
#Save the FPS-selected dataset (db_FPS) as a pickle file
with open('delta_data/CSD-10k_combined_w_kpts_11_pc.pickle', 'wb') as pickle_out:
    pickle.dump(db_FPS, pickle_out)

# Calculate DFTB Energies and Forces

Note that these calculations were done locally on CosmoPC12, and they will not run automatically here. That is because I had to change some ASE scripts as it was not producing DFTB+ input files correctly for the parameters I had provided.

I thoroughly checked the input files it produced, and they match the input files that were used to run DFTB+ with the Delta potential of Aditi/Edgar in the past.

It is also **important to note** that the "Temperature" value passed through to the DFTB+ input file had to be converted from Kelvin to Hartree (DFTB+'s internal unit), as it was very difficult to set "Temperature [K]" instead of just "Temperature" through ASE's Python interface.

Please **skip the next few cells below**, and just reload the database that contains the DFTB+ results as well. 

In [None]:
#Load the pickled database back into db
with open('delta_data/CSD-10k_combined_w_kpts_11_pc.pickle', 'rb') as pickle_in:
    db = pickle.load(pickle_in)

In [None]:
#get list of all crystal names (the keys of the database)
names = list(db.keys())

In [None]:
#set preliminary DFTB params
#These should be the exact same as I have found in DFTB input scripts from Edgar/Aditi (I thoroughly checked)

#Dftb is not imported by the import cell at the top of the notebook, so import it here;
#otherwise this cell raises a NameError on a fresh kernel
from ase.calculators.dftb import Dftb

#Species-independent settings only; the per-species Hubbard derivatives, max angular
#momenta and the k-points are set per frame in the calculation loop
calc_preliminary = Dftb(Hamiltonian_='DFTB',
           Hamiltonian_Charge=0,
           Hamiltonian_DampXH='Yes',
           Hamiltonian_DampXHExponent = 4.0,
           Hamiltonian_Dispersion_ = 'DftD3',
           Hamiltonian_Dispersion_Damping='BeckeJohnson{}',
           Hamiltonian_Eigensolver='RelativelyRobust{}',
           Hamiltonian_HubbardDerivs_='',
           Hamiltonian_MaxAngularMomentum_='',
           Hamiltonian_SCC='Yes',
           Hamiltonian_SCCTolerance=1e-4,
           Hamiltonian_MaxSCCIterations=1000,
           Hamiltonian_SlaterKosterFiles_='Type2FileNames',
           Hamiltonian_SlaterKosterFiles_Prefix='"./dftb-param/"',
           Hamiltonian_SlaterKosterFiles_Separator='"-"',
           Hamiltonian_SlaterKosterFiles_Suffix='".skf"',
           Hamiltonian_SlaterKosterFiles_LowerCaseTypeName='No',
           Hamiltonian_ThirdOrderFull='Yes',
           Hamiltonian_Filling_='Fermi',
           Hamiltonian_Filling_Temperature=300*0.316681534524639E-05, #300 K, converted from K to Hartree!
           #Options_WriteResultsTag='No',
           #Options_WriteDetailedOut='No',
           #Options_WriteBandOut='No',
           ParserOptions_='',
           ParserOptions_ParserVersion=4,
           ParserOptions_IgnoreUnprocessedNodes = 'No',
           do_mulliken=False)

In [None]:
#Data that will need to be set in the loop, depending on which species are present

#Hamiltonian_HubbardDerivs_H=-0.1857, 
#Hamiltonian_HubbardDerivs_C=-0.1492, 
#Hamiltonian_HubbardDerivs_N=-0.1535, 
#Hamiltonian_HubbardDerivs_O=-0.1575,
#Hamiltonian_MaxAngularMomentum_H='"s"', 
#Hamiltonian_MaxAngularMomentum_C='"p"', 
#Hamiltonian_MaxAngularMomentum_N='"p"', 
#Hamiltonian_MaxAngularMomentum_O='"p"',

In [None]:
#Rename "forces" array to "PBE-D2_forces" to avoid confusion, and rename "energy" to "PBE-D2_energy"
#Each rename is guarded so that re-running this cell is a no-op instead of raising a KeyError
#on frames that were already renamed.
#NOTE(review): assumes frm.arrays and frm.info are dict-like (as for ASE Atoms) -- confirm
for name in names:
    for frm in db[name]:
        if 'forces' in frm.arrays:
            frm.arrays['PBE-D2_forces'] = frm.arrays.pop('forces') #move array to the good name, removing the old one
        if 'energy' in frm.info:
            #pop instead of .copy(): also works when the energy is a plain Python float
            frm.info['PBE-D2_energy'] = frm.info.pop('energy') #move energy to the good name, removing the old one

In [None]:
#Compute DFTB energies and forces, and add them to the frames

#Per-species settings: (chemical symbol, Hubbard derivative, max angular momentum)
species_settings = (
    ("H", -0.1857, '"s"'),
    ("C", -0.1492, '"p"'),
    ("N", -0.1535, '"p"'),
    ("O", -0.1575, '"p"'),
)

for name in tqdm(names, desc="Crystal Progress", leave=False):
    for frm in tqdm(db[name], desc="Config Progress", leave=False):

        #K-points as a space-separated string, with the surrounding brackets stripped
        kpts = np.array2string(frm.info['kpts'], separator=' ')[1:-1]
        if len(kpts) != 5:
            #Report frames whose k-point string is not the expected 5 characters long (safety check)
            print(name, frm)
        kpts += ' 1.0' #add the k points "weight" for DFTB+

        #Fresh calculator per frame, cloned from the preliminary settings
        calc = Dftb(**calc_preliminary.todict())
        calc.set(Hamiltonian_KPointsAndWeights="{"+kpts+"}") #Set k points

        #Set Hubbard derivs and max angular momenta only for the species present in this frame
        symbols_present = frm.get_chemical_symbols()
        for symbol, hubbard_deriv, max_ang_mom in species_settings:
            if symbol in symbols_present:
                calc.set(**{"Hamiltonian_HubbardDerivs_" + symbol: hubbard_deriv})
                calc.set(**{"Hamiltonian_MaxAngularMomentum_" + symbol: max_ang_mom})

        frm.set_calculator(calc) #set calculator for the frame

        frm.info['DFTB_energy'] = frm.get_potential_energy() #calculate potential energy, and add it as info
        frm.arrays['DFTB_forces'] = frm.get_forces() #calculate forces, and add them as a per-atom array

In [None]:
#Save the database (db) as a pickle file
with open('delta_data/CSD-10k_combined_w_kpts_11_pc_w_DFTB.pickle', 'wb') as pickle_out:
    pickle.dump(db, pickle_out)

## Reload database with DFTB+ results

In [None]:
#(Re-)Open the pickled dataset; note that it is bound to the name bla
with open('delta_data/CSD-10k_combined_w_kpts_11_pc_w_DFTB.pickle', 'rb') as pickle_in:
    bla = pickle.load(pickle_in)