# Record consistency checks

This notebook provides consistency checks between

- Records stored at potentials.nist.gov and in the potentials-library github repository
- Records of different styles that share content

Import potentials and load the database records

In [238]:
import uuid
import datetime
import shutil
from pathlib import Path

import potentials

import numpy as np
import pandas as pd

import iprPy

from IPython.core.display import display, HTML

In [2]:
#machine = 'desktop'
machine = 'laptop'

In [3]:
# Set passwordfile based on machine
if machine == 'laptop':
    passwordfile = Path('C:/Users/lmh1/Documents/potentials_nist_gov/password.txt')
elif machine == 'desktop':
    passwordfile = Path('E:/potentials_nist_gov/password.txt')
else:
    raise ValueError(f'passwordfile not set for machine {machine}')

# Load database username + password
with open(Path(passwordfile)) as f:
    username, password = f.read().strip().split()

# Load default database and fetch global workspace    
db = potentials.Database(username=username, password=password)
global_workspace = db.cdcs.global_workspace

## 1. Check Citations

Create databases that separately load local and remote Citations

In [153]:
db_local = potentials.Database(load='citations', username=username, password=password, verbose=True, remote=False, local=True)
db_remote = potentials.Database(load='citations', username=username, password=password, verbose=True, remote=True, local=False)

Loaded 355 local citations
Loaded 355 remote citations


### 1.1. Correct names of Citation records

Occasionally, major updates of the CDCS database require that records be backed up and copied over.  I think there's a bug in the record transfer scripts (not mine) that replace underscores with spaces.  This changes them back.

In [None]:
# Fetch raw record content using CDCS query
rawcites = db_remote.cdcs.query(template='Citation')
for i in rawcites.index:
    rawcite = rawcites.loc[i]
    title = rawcite.title
    
    # Check if title contains spaces
    if ' ' in title:
        title = title.replace(' ', '_')
        content = rawcite.xml_content
        
        db_remote.cdcs.delete_record(rawcite, verbose=True)
        db_remote.cdcs.upload_record('Citation', content=content, title=title, workspace=global_workspace, verbose=True)

In [239]:
# Fetch raw record content using CDCS query
rawcites = db_remote.cdcs.query(template='crystal_prototype')
for i in rawcites.index:
    rawcite = rawcites.loc[i]
    title = rawcite.title
    
    # Check if title contains spaces
    if ' ' in title:
        title = title.replace(' ', '_')
        content = rawcite.xml_content
        
        db_remote.cdcs.delete_record(rawcite, verbose=True)
        db_remote.cdcs.upload_record('crystal_prototype', content=content, title=title, workspace=global_workspace, verbose=True)

record L2 1--AlCu2Mn--heusler (5fb561b626ed1e0037171170) has been deleted.
record L2_1--AlCu2Mn--heusler (607db6a1e5dbce003645a181) successfully uploaded.
record 607db6a1e5dbce003645a181 assigned to workspace 5fb55e4826ed1e0015e846a9
record L1 0--AuCu (5fb561b626ed1e002a17117d) has been deleted.
record L1_0--AuCu (607db6a4e5dbce003845a17b) successfully uploaded.
record 607db6a4e5dbce003845a17b assigned to workspace 5fb55e4826ed1e0015e846a9
record L1 2--AuCu3 (5fb561b726ed1e0038171175) has been deleted.
record L1_2--AuCu3 (607db6a7e5dbce003745a17d) successfully uploaded.
record 607db6a7e5dbce003745a17d assigned to workspace 5fb55e4826ed1e0015e846a9
record D0 3--BiF3 (5fb561b826ed1e003917116f) has been deleted.
record D0_3--BiF3 (607db6aae5dbce003a45a268) successfully uploaded.
record 607db6aae5dbce003a45a268 assigned to workspace 5fb55e4826ed1e0015e846a9


### 1.2. Check for missing records

In [6]:
print('missing from local:')
db_remote.citations_df[(~db_remote.citations_df.doi.isin(db_local.citations_df.doi.tolist()))
                      |(~db_remote.citations_df.note.isin(db_local.citations_df.note.tolist()))]

missing from local:


Unnamed: 0,ENTRYTYPE,ID,author,note,title,year,abstract,address,booktitle,day,month,pages,doi,journal,number,publisher,url,volume,numpages


In [7]:
print('missing from remote:')
db_local.citations_df[(~db_local.citations_df.doi.isin(db_remote.citations_df.doi.tolist()))
                     |(~db_local.citations_df.note.isin(db_remote.citations_df.note.tolist()))]

missing from remote:


Unnamed: 0,ENTRYTYPE,ID,abstract,author,doi,journal,month,number,pages,publisher,title,url,volume,year,numpages,note,address,booktitle,day


### 1.3. Compare record contents

In [154]:
allgood = True
for local_cite in db_local.citations:
    try:
        match = db_remote.citations_df.doi == local_cite.doi
    except:
        match = db_remote.citations_df.note == local_cite.note
    if sum(match) > 1:
        print('multiple matches for', local_cite.doifname)
    elif sum(match) == 0:
        print('no matches for', local_cite.doifname)
    else:
        remote_cite = db_remote.citations[match][0]
        remote_bib = remote_cite.bibtex
        local_bib = local_cite.bibtex
        if local_bib != remote_bib:
            allgood = False
            print('different data for', local_cite.doifname)
            print()
            print('local:')
            print(local_bib)
            print('remote:')
            print(remote_bib)
            print()
            same = ''
            for i in range(len(local_bib)):
                if local_bib[i] != remote_bib[i]:
                    print(local_bib[:i+1])
                    break
            break

if allgood:
    print('All records match')

All records match


### 1.4. Update a citation

In [151]:
# Update a local citation to remote
#citation = db_local.get_citation(local_cite.doi, verbose=True)
#db_remote.upload_citation(citation, workspace=global_workspace, verbose=True)

Citation retrieved from loaded citations
record 10.1016_j.commatsci.2017.01.002 (6065f10926ed1e003628ef9a) has been updated.
record 6065f10926ed1e003628ef9a assigned to workspace 5fb55e4826ed1e0015e846a9


In [None]:
# Update a remote citation to local
#citation = db_remote.get_citation(remote_cite.doi, verbose=True)
#db_local.save_citations(citation, overwrite=True, verbose=True)

## 2. Check Potentials

In [80]:
db_local = potentials.Database(load='potentials', username=username, password=password, verbose=True, remote=False, local=True)
db_remote = potentials.Database(load='potentials', username=username, password=password, verbose=True, remote=True, local=False)

Loaded 642 local potentials
Loaded 642 remote potentials


### 2.1. Check for missing records

In [26]:
print('missing from local:')
db_remote.potentials_df[~db_remote.potentials_df.id.isin(db_local.potentials_df.id.tolist())]

missing from local:


Unnamed: 0,key,id,recorddate,notes,fictional,elements,othername,modelname,citations,implementations


In [27]:
print('missing from remote:')
db_local.potentials_df[~(db_local.potentials_df.id.isin(db_remote.potentials_df.id.tolist()))]

missing from remote:


Unnamed: 0,key,id,recorddate,notes,fictional,elements,othername,modelname,citations,implementations


### 2.2. Compare record contents

In [81]:
allgood = True
for local_pot in db_local.potentials:
    match = db_remote.potentials_df.id == local_pot.id
    if sum(match) > 1:
        print('multiple matches for', local_pot.id)
    elif sum(match) == 0:
        print('no matches for', local_pot.id)
    else:
        remote_pot = db_remote.potentials[match][0]
        remote_json = remote_pot.asmodel().json()
        local_json = local_pot.asmodel().json()
        if local_json != remote_json:
            allgood = False
            print('different data for', local_pot.id)
            print()
            print('local:')
            display(HTML(local_pot.html()))
            print('remote:')
            display(HTML(remote_pot.html()))
            print()
            same = ''
            for i in range(len(local_json)):
                if local_json[i] != remote_json[i]:
                    print(local_json[:i+1])
                    break
            break

if allgood:
    print('All records match')

All records match


### 2.3. Update records

In [37]:
# Update a local potential to remote
#db_remote.upload_potential(local_pot, workspace=global_workspace, verbose=True)
#db_remote = potentials.Database(load='potentials', username=username, password=password, verbose=True, remote=True, local=False)

record potential.2019--Mendelev-M-I-Sun-Y-Zhang-F-et-al--Cu-Zr (5fb71fd826ed1e003c17888a) has been updated.
record 5fb71fd826ed1e003c17888a assigned to workspace 5fb55e4826ed1e0015e846a9
Loaded 642 remote potentials


In [None]:
# Update a remote citation to local
#potential = db_remote.get_potential(remote_cite.doi, verbose=True)
#db_local.save_potentials(potential, overwrite=True, verbose=True)
#db_local = potentials.Database(load='potentials', username=username, password=password, verbose=True, remote=False, local=True)

## 3. Check citations in potentials 

In [182]:
# Only load local because local and remote records should match now
db_local = potentials.Database(load=['citations','potentials'], username=username, password=password,
                               verbose=True, remote=False, local=True)

Loaded 355 local citations
Loaded 642 local potentials


### 3.1. Compare Citation records to citations embedded in Potential records

In [183]:
allgood = True
for k, potential in enumerate(db_local.potentials):
    for j, pot_cite in enumerate(potential.citations):
        try:
            match = db_local.citations_df.doi == pot_cite.doi
        except:
            match = db_local.citations_df.note == pot_cite.note
        if sum(match) > 1:
            print('multiple matches for', pot_cite.doifname)
        elif sum(match) == 0:
            print('no matches for', pot_cite.doifname)
        else:
            cite = db_local.citations[match][0]
            bib = cite.bibtex
            pot_bib = pot_cite.bibtex
            if pot_bib != bib:
                allgood = False
                print(f'{k+1}/{len(db_local.potentials)}')
                print('different data for', pot_cite.doifname, potential.id)
                print()
                print('Citation:')
                print(bib)
                print('Potential:')
                print(pot_bib)
                print('Citation:')
                display(HTML(cite.html()))
                print('Potential:')
                display(HTML(pot_cite.html()))
                print()
                same = ''
                for i in range(len(pot_bib)):
                    if pot_bib[i] != bib[i]:
                        print(pot_bib[:i])
                        break
                break
    
    if not allgood:
        break

if allgood:
    print('All records match')

All records match


### 3.2. Update records

In [181]:
# Update citation embedded in the potential record to match the citation record
#potential.citations[j] = cite
#db.upload_potential(potential, verbose=True)
#db.save_potentials(potential, verbose=True, format='json', indent=4, overwrite=True)

record potential.2018--Jang-H-S-Kim-K-M-Lee-B-J--Zn-Mg (5fb71e2026ed1e003c17887e) has been updated.
1 potentials saved to localpath
 - 1 existing potentials updated


## 4. Check potential_LAMMPS records

In [202]:
db_local = potentials.Database(load='lammps_potentials', username=username, password=password,
                               verbose=True, remote=False, local=True, kim_models=[])
db_remote = potentials.Database(load='lammps_potentials', username=username, password=password,
                                verbose=True, remote=True, local=False, kim_models=[])

Loaded 401 local LAMMPS potentials
Building lammps potentials for kim models
No kim models identified
Loaded 401 remote LAMMPS potentials
Building lammps potentials for kim models
No kim models identified


### 4.1. Check for missing records

In [185]:
print('missing from local:')
db_remote.lammps_potentials_df[~db_remote.lammps_potentials_df.id.isin(db_local.lammps_potentials_df.id.tolist())]

missing from local:


Unnamed: 0,id,key,potid,potkey,units,atom_style,allsymbols,pair_style,status,symbols,elements,artifacts


In [186]:
print('missing from local:')
db_local.lammps_potentials_df[~db_local.lammps_potentials_df.id.isin(db_remote.lammps_potentials_df.id.tolist())]

missing from local:


Unnamed: 0,id,key,potid,potkey,units,atom_style,allsymbols,pair_style,status,symbols,elements,artifacts


### 4.2. Compare record contents

In [208]:
allgood = True
for k, local_pot in enumerate(db_local.lammps_potentials):
    match = db_remote.lammps_potentials_df.id == local_pot.id
    if sum(match) > 1:
        print('multiple matches for', local_pot.id)
    elif sum(match) == 0:
        print('no matches for', local_pot.id)
    else:
        remote_pot = db_remote.lammps_potentials[match][0]
        remote_json = remote_pot.asmodel().json()
        local_json = local_pot.asmodel().json()
        if local_json != remote_json:
            allgood = False
            print(f'{k+1}/{len(db_local.lammps_potentials)}')
            print('different data for', local_pot.id)
            print()
            print('local:')
            print(local_json)
            print('remote:')
            print(remote_json)
            print()
            same = ''
            for i in range(len(local_json)):
                if local_json[i] != remote_json[i]:
                    print(local_json[:i+1])
                    break
            break

if allgood:
    print('All records match')

All records match


### 4.3. Update records

In [198]:
#db_local.save_lammps_potentials(remote_pot, verbose=True, format='json', indent=4, overwrite=True)

1 LAMMPS potentials saved to localpath
 - 1 existing potentials updated


## 5. Compare Potential and potential_LAMMPS records

In [225]:
# Only load local because local and remote records should match now
db_local = potentials.Database(load=['potentials','lammps_potentials'], username=username, password=password,
                               verbose=True, remote=False, local=True, status=None, kim_models=[])

Loaded 642 local potentials
Loaded 454 local LAMMPS potentials
Building lammps potentials for kim models
No kim models identified


### 5.1. Check common fields

- Do keys and ids match for both the potential and implementation?
- Are all but known missing entries present in both records?
- Do the artifacts match?

In [235]:
bad_imps = [
    '2009--Kim-H-K--Fe-Ti-C--LAMMPS--ipr1',
    '2012--Jelinek-B--Al-Si-Mg-Cu-Fe--LAMMPS--ipr1',
    '2013--Gao-H--AgTaO3--LAMMPS--ipr1',
    '2014--Liyanage-L-S-I--Fe-C--LAMMPS--ipr1',
    '2015--Ko-W-S--Ni-Ti--LAMMPS--ipr1',
    '2015--Pascuet-M-I--Al-U--LAMMPS--ipr1'
]

In [236]:
for lammps_potential in db_local.lammps_potentials:
    
    # Find potential record corresponding to the lammps potential
    potential = db_local.potentials_df[(db_local.potentials_df.id == lammps_potential.potid)
                                      |(db_local.potentials_df.key == lammps_potential.potkey)]
    if len(potential) != 1:
        print(len(potential), 'matches found', lammps_potential.id)
    else:
        potential = potential.iloc[0]
    
    # Check that potential key and id are the same in both records
    if potential.id != lammps_potential.potid:
        print('potential id mismatch', lammps_potential.id)
        print(potential.id, lammps_potential.potid)
    if potential.key != lammps_potential.potkey:
        print('potential key mismatch', lammps_potential.id)
        print(potential.key, lammps_potential.potkey)
    
    # Find potential's implementation matching the lammps potential
    for implementation in potential.implementations:
        match = False
        if implementation.id == lammps_potential.id or implementation.key == lammps_potential.key:
            match = True
            break
    
    # Issue message if no matching implementation listing found
    if match is False:
        if lammps_potential.id not in bad_imps:
            print('No listing found for', lammps_potential.id)
        continue
    
    # Check that implementation key and id match
    if implementation.id != lammps_potential.id:
        print('implementation id mismatch')
        print(implementation.id, lammps_potential.id)
    if implementation.key != lammps_potential.key:
        print('implementation key mismatch', lammps_potential.id)
        print(implementation.key, lammps_potential.key)
    
    # Check that the artifacts match between lammps potential and implementation
    if len(implementation.artifacts) != len(lammps_potential.artifacts):
        print('different numbers of artifacts', lammps_potential.id)
    else:
        for n in range(len(implementation.artifacts)):
            if implementation.artifacts[n].asmodel().json() != lammps_potential.artifacts[n].asmodel().json():
                print('artifact mismatch', lammps_potential.id, lammps_potential.artifacts[n].filename)

In [237]:
# Reverse search for implementation listings with no lammps potential records
for potential in db_local.potentials:
    for implementation in potential.implementations:
        if 'LAMMPS' in implementation.id:
            
            lammps_potential = db_local.lammps_potentials_df[db_local.lammps_potentials_df.id == implementation.id]
            if len(lammps_potential) == 0:
                print('No lammps potential found for', implementation.id)

No match for 2016--Lee-E--Si-O--LAMMPS--ipr1
No match for 2016--Lee-E--Ti-O--LAMMPS--ipr1
No match for 2017--Beeler-B--U-Si--LAMMPS--ipr1
No match for 2017--Lee-E--Li-Mn-O--LAMMPS--ipr1
No match for 2018--Lee-E--Li-Co-O--LAMMPS--ipr1
