# Record consistency checks

This notebook provides consistency checks between

- Records stored at potentials.nist.gov and in the potentials-library github repository
- Records of different styles that share content

Import potentials and load the database records

In [1]:
import uuid
import datetime
import shutil
from pathlib import Path

import potentials

import numpy as np
import pandas as pd

import iprPy

from IPython.core.display import display, HTML

In [2]:
machine = 'desktop'
#machine = 'laptop'

In [3]:
# Load default database and fetch global workspace    
db = potentials.Database(remote_name='potentials')#username=username, password=password)
workspace = 'Global Public Workspace'

## 1. Check Citations

Create databases that separately load local and remote Citations

In [4]:
local_citations, local_citations_df = db.get_citations(local=True, remote=False, return_df=True, refresh_cache=True, verbose=True)
remote_citations, remote_citations_df = db.get_citations(local=False, remote=True, return_df=True, verbose=True)

Found 360 matching Citation records in local library
Found 360 matching Citation records in remote library


### 1.1 Content corrections

#### 1.1.1. Correct names of Citation records

Occasionally, major updates of the CDCS database require that records be backed up and copied over.  I think there's a bug in the record transfer scripts (not mine) that replace underscores with spaces.  This changes them back.

In [8]:
# Fetch raw record content using CDCS query
rawcites = db.remote_database.cdcs.query(template='Citation')
for i in rawcites.index:
    rawcite = rawcites.loc[i]
    title = rawcite.title
    
    # Check if title contains spaces
    if ' ' in title:
        print(title)
        continue
        #title = title.replace(' ', '_')
        #content = rawcite.xml_content
        
        #db_remote.cdcs.delete_record(rawcite, verbose=True)
        #db_remote.cdcs.upload_record('Citation', content=content, title=title, workspace=global_workspace, verbose=True)

In [239]:
# Fetch raw record content using CDCS query
rawcites = db_remote.cdcs.query(template='crystal_prototype')
for i in rawcites.index:
    rawcite = rawcites.loc[i]
    title = rawcite.title
    
    # Check if title contains spaces
    if ' ' in title:
        title = title.replace(' ', '_')
        content = rawcite.xml_content
        
        db_remote.cdcs.delete_record(rawcite, verbose=True)
        db_remote.cdcs.upload_record('crystal_prototype', content=content, title=title, workspace=workspace, verbose=True)

record L2 1--AlCu2Mn--heusler (5fb561b626ed1e0037171170) has been deleted.
record L2_1--AlCu2Mn--heusler (607db6a1e5dbce003645a181) successfully uploaded.
record 607db6a1e5dbce003645a181 assigned to workspace 5fb55e4826ed1e0015e846a9
record L1 0--AuCu (5fb561b626ed1e002a17117d) has been deleted.
record L1_0--AuCu (607db6a4e5dbce003845a17b) successfully uploaded.
record 607db6a4e5dbce003845a17b assigned to workspace 5fb55e4826ed1e0015e846a9
record L1 2--AuCu3 (5fb561b726ed1e0038171175) has been deleted.
record L1_2--AuCu3 (607db6a7e5dbce003745a17d) successfully uploaded.
record 607db6a7e5dbce003745a17d assigned to workspace 5fb55e4826ed1e0015e846a9
record D0 3--BiF3 (5fb561b826ed1e003917116f) has been deleted.
record D0_3--BiF3 (607db6aae5dbce003a45a268) successfully uploaded.
record 607db6aae5dbce003a45a268 assigned to workspace 5fb55e4826ed1e0015e846a9


#### 1.1.2. Add missing page (paper) numbers

In [5]:
local_citations_df[local_citations_df.pages.isna()]

Unnamed: 0,name,year_authors,year,volume,url,title,publisher,pages,number,month,...,doi,author,abstract,ENTRYTYPE,ID,numpages,note,day,booktitle,address
107,10.1017_cbo9781139003582,2009--Tadmor-E-B-Miller-R-E,2009,,https://doi.org/10.1017%2Fcbo9781139003582,Modeling Materials,Cambridge University Press,,,,...,10.1017/cbo9781139003582,Ellad B. Tadmor and Ronald E. Miller,,book,Tadmor_2009,,,,,
344,1990--ackland-g-j--pt,1990--Ackland-G-J,1990,,,unpublished,,,,,...,,G.J. Ackland,,unpublished,1990--Ackland-G-J--Pt,,1990--Ackland-G-J--Pt,,,
346,2015--elliott-r-s-akerson-a,2015--Elliott-R-S-Akerson-A,2015,,,"Efficient ""universal"" shifted Lennard-Jones mo...",,,,,...,,Ryan S. Elliott and Andrew Akerson,,unpublished,2015--Elliott-R-S-Akerson-A,,2015--Elliott-R-S-Akerson-A,,,
347,2015--mendelev-m-i--cu,2015--Mendelev-M-I,2015,,,to be published,,,,,...,,M.I. Mendelev,,unpublished,2015--Mendelev-M-I--Cu,,2015--Mendelev-M-I--Cu,,,
348,2015--mendelev-m-i--fictional-w,2015--Mendelev-M-I,2015,,,to be published,,,,,...,,M.I. Mendelev,,unpublished,2015--Mendelev-M-I--fictional-W,,2015--Mendelev-M-I--fictional-W,,,
349,2015--mendelev-m-i--mg,2015--Mendelev-M-I,2015,,,to be published,,,,,...,,M.I. Mendelev,,unpublished,2015--Mendelev-M-I--Mg,,2015--Mendelev-M-I--Mg,,,
350,2015--mendelev-m-i--w,2015--Mendelev-M-I,2015,,,to be published,,,,,...,,M.I. Mendelev,,unpublished,2015--Mendelev-M-I--W,,2015--Mendelev-M-I--W,,,
351,2016--gibson-j--ti,2016--Gibson-J,2016,,,to be published,,,,,...,,Joshua Gibson,Titanium model for multi-state modfied embedde...,unpublished,2016--Gibson-J--Ti,,2016--Gibson-J--Ti,,,
352,2017--purja-pun-g-p--au,2017--Purja-Pun-G-P,2017,,,to be published,,,,,...,,G.P. Purja Pun,,unpublished,2017--Purja-Pun-G-P--Au,,2017--Purja-Pun-G-P--Au,,,
353,2018--mendelev-m-i--ag-ni,2018--Mendelev-M-I,2018,,,to be published,,,,,...,,M.I. Mendelev,,unpublished,2018--Mendelev-M-I--Ag-Ni,,2018--Mendelev-M-I--Ag-Ni,,,


In [19]:
ii = local_citations_df[local_citations_df.pages.isna()].index

In [54]:
i = ii[]
cit = local_citations[i]
print(cit.bib)

{'year': '2018', 'title': 'to be published', 'note': '2018--Mendelev-M-I--W', 'author': 'M.I. Mendelev', 'ENTRYTYPE': 'unpublished', 'ID': '2018--Mendelev-M-I--W'}


In [43]:
cit.bib['pages'] = "97--100"
#cit.bib['pages'] = cit.bib['doi'].split('.')[-1]
db.save_citation(cit, overwrite=True, verbose=True)
db.upload_citation(cit, workspace=workspace, overwrite=True, verbose=True)

Citation record named 10.1524_zkri.2009.1085 updated in C:\Users\lmh1\Documents\library
Citation record named 10.1524_zkri.2009.1085 updated in https://potentials.nist.gov/
Citation record named 10.1524_zkri.2009.1085 assigned to workspace Global Public Workspace


### 1.2. Check for missing records

In [6]:
print('missing from local:')
remote_citations_df[(~remote_citations_df.doi.isin(local_citations_df.doi.tolist()))
                   |(~remote_citations_df.note.isin(local_citations_df.note.tolist()))]

missing from local:


Unnamed: 0,name,year_authors,year,title,note,author,ENTRYTYPE,ID,abstract,pages,...,day,booktitle,address,volume,url,publisher,number,journal,doi,numpages


In [7]:
print('missing from remote:')
local_citations_df[(~local_citations_df.doi.isin(remote_citations_df.doi.tolist()))
                  |(~local_citations_df.note.isin(remote_citations_df.note.tolist()))]

missing from remote:


Unnamed: 0,name,year_authors,year,volume,url,title,publisher,pages,number,month,...,doi,author,abstract,ENTRYTYPE,ID,numpages,note,day,booktitle,address


### 1.3. Compare record contents

In [10]:
allgood = True
for local_cite in local_citations:
    try:
        match = remote_citations_df.doi == local_cite.bib['doi']
    except:
        match = remote_citations_df.note == local_cite.bib['note']
    if sum(match) > 1:
        print('multiple matches for', local_cite.doifname)
    elif sum(match) == 0:
        print('no matches for', local_cite.doifname)
    else:
        remote_cite = remote_citations[match][0]
        remote_bib = remote_cite.build_bibtex()
        local_bib = local_cite.build_bibtex()
        if local_bib != remote_bib:
            allgood = False
            print('different data for', local_cite.doifname)
            print()
            print('local:')
            print(local_bib)
            print('remote:')
            print(remote_bib)
            print()
            same = ''
            for i in range(len(local_bib)):
                if local_bib[i] != remote_bib[i]:
                    print(local_bib[:i+1])
                    break
            break

if allgood:
    print('All records match')

All records match


### 1.4. Update a citation

In [13]:
# Update a local citation to remote
#cit = db.get_citation(name=local_cite.name, remote=False, local=True, verbose=True)
#db.upload_citation(cit, workspace=workspace, overwrite=True, verbose=True)

Matching record retrieved from local


In [14]:
# Update a remote citation to local
#cit = db.get_citation(name=remote_cite.name, remote=True, local=False, verbose=True)
#db.save_citation(cit, overwrite=True, verbose=True)

Matching record retrieved from remote


## 2. Check Potentials

In [5]:
local_potentials, local_potentials_df = db.get_potentials(local=True, remote=False, return_df=True, refresh_cache=True, verbose=True)
remote_potentials, remote_potentials_df = db.get_potentials(local=False, remote=True, return_df=True, verbose=True)

Found 647 matching Potential records in local library
Found 647 matching Potential records in remote library


### 2.1. Check for missing records

In [6]:
print('missing from local:')
remote_potentials_df[~remote_potentials_df.id.isin(local_potentials_df.id.tolist())]

missing from local:


Unnamed: 0,name,key,id,recorddate,notes,fictional,elements,othername,modelname,citations,implementations


In [7]:
print('missing from remote:')
local_potentials_df[~(local_potentials_df.id.isin(remote_potentials_df.id.tolist()))]

missing from remote:


Unnamed: 0,name,key,id,recorddate,notes,fictional,elements,othername,modelname,citations,implementations


### 2.2. Compare record contents

In [9]:
allgood = True
for local_pot in local_potentials:
    match = remote_potentials_df.id == local_pot.id
    if sum(match) > 1:
        print('multiple matches for', local_pot.id)
    elif sum(match) == 0:
        print('no matches for', local_pot.id)
    else:
        remote_pot = remote_potentials[match][0]
        remote_json = remote_pot.build_model().json()
        local_json = local_pot.build_model().json()
        if local_json != remote_json:
            allgood = False
            print('different data for', local_pot.id)
            print()
            print('local:')
            display(HTML(local_pot.html()))
            print('remote:')
            display(HTML(remote_pot.html()))
            print()
            same = ''
            for i in range(len(local_json)):
                if local_json[i] != remote_json[i]:
                    print(local_json[:i+1])
                    break
            break

if allgood:
    print('All records match')

All records match


### 2.3. Update records

In [37]:
# Update a local potential to remote
#db_remote.upload_potential(local_pot, workspace=global_workspace, verbose=True)
#db_remote = potentials.Database(load='potentials', username=username, password=password, verbose=True, remote=True, local=False)

record potential.2019--Mendelev-M-I-Sun-Y-Zhang-F-et-al--Cu-Zr (5fb71fd826ed1e003c17888a) has been updated.
record 5fb71fd826ed1e003c17888a assigned to workspace 5fb55e4826ed1e0015e846a9
Loaded 642 remote potentials


In [None]:
# Update a remote citation to local
#potential = db_remote.get_potential(remote_cite.doi, verbose=True)
#db_local.save_potentials(potential, overwrite=True, verbose=True)
#db_local = potentials.Database(load='potentials', username=username, password=password, verbose=True, remote=False, local=True)

## 3. Check citations in potentials 

In [182]:
# Only load local because local and remote records should match now
db_local = potentials.Database(load=['citations','potentials'], username=username, password=password,
                               verbose=True, remote=False, local=True)

Loaded 355 local citations
Loaded 642 local potentials


### 3.1. Compare Citation records to citations embedded in Potential records

In [15]:
allgood = True
for k, potential in enumerate(local_potentials):
    for j, pot_cite in enumerate(potential.citations):
        try:
            match = local_citations_df.doi == pot_cite.bib['doi']
        except:
            match = local_citations_df.note == pot_cite.bib['note']
        if sum(match) > 1:
            print('multiple matches for', pot_cite.doifname)
        elif sum(match) == 0:
            print('no matches for', pot_cite.doifname)
        else:
            cite = local_citations[match][0]
            bib = cite.build_bibtex()
            pot_bib = pot_cite.build_bibtex()
            if pot_bib != bib:
                #if 'pages' in cite.bib and 'pages' not in pot_cite.bib:
                #    potential.citations[j] = cite
                #    db.save_potential(potential, overwrite=True, verbose=True)
                #    db.upload_potential(potential, workspace=workspace, overwrite=True, verbose=True)
                #else:
                allgood = False
                print(f'{k+1}/{len(local_potentials)}')
                print('different data for', pot_cite.doifname, potential.id)
                print()
                print('Citation:')
                print(bib)
                print('Potential:')
                print(pot_bib)
                print('Citation:')
                display(HTML(cite.html()))
                print('Potential:')
                display(HTML(pot_cite.html()))
                print()
                same = ''
                for i in range(len(pot_bib)):
                    if pot_bib[i] != bib[i]:
                        print(pot_bib[:i])
                        break
                break
    
    if not allgood:
        break

if allgood:
    print('All records match')

All records match


### 3.2. Update records

In [11]:
# Update citation embedded in the potential record to match the citation record
potential.citations[j] = cite
db.save_potential(potential, overwrite=True, verbose=True)
db.upload_potential(potential, workspace=workspace, overwrite=True, verbose=True)

Potential record named potential.2001--Lee-B-J-Baskes-M-I-Kim-H-Cho-Y-K--Cr updated in C:\Users\lmh1\Documents\library
Potential record named potential.2001--Lee-B-J-Baskes-M-I-Kim-H-Cho-Y-K--Cr updated in https://potentials.nist.gov/
Potential record named potential.2001--Lee-B-J-Baskes-M-I-Kim-H-Cho-Y-K--Cr assigned to workspace Global Public Workspace


In [12]:
local_potentials, local_potentials_df = db.get_potentials(local=True, remote=False, return_df=True, refresh_cache=True, verbose=True)

Found 647 matching Potential records in local library


## 4. Check potential_LAMMPS records

In [35]:
local_lmppots, local_lmppots_df = db.get_lammps_potentials(local=True, remote=False, return_df=True, refresh_cache=True,
                                                           kim_models=[], verbose=True)
remote_lmppots, remote_lmppots_df = db.get_lammps_potentials(local=False, remote=True, return_df=True,
                                                             kim_models=[], verbose=True)


Found 409 matching potential_LAMMPS records in local library
No KIM potentials added: list of models is empty
Found 409 matching potential_LAMMPS records in remote library
No KIM potentials added: list of models is empty


### 4.1. Check for missing records

In [17]:
print('missing from local:')
remote_lmppots_df[~remote_lmppots_df.id.isin(local_lmppots_df.id.tolist())]

missing from local:


Unnamed: 0,name,id,key,potid,potkey,units,atom_style,allsymbols,pair_style,status,symbols,elements,artifacts,comments,dois


In [18]:
print('missing from local:')
local_lmppots_df[~local_lmppots_df.id.isin(remote_lmppots_df.id.tolist())]

missing from local:


Unnamed: 0,name,id,key,potid,potkey,units,atom_style,allsymbols,pair_style,status,symbols,elements,artifacts,comments,dois


### 4.2. Compare record contents

In [36]:
allgood = True
for k, local_pot in enumerate(local_lmppots):
    match = remote_lmppots_df.id == local_pot.id
    if sum(match) > 1:
        print('multiple matches for', local_pot.id)
    elif sum(match) == 0:
        print('no matches for', local_pot.id)
    else:
        remote_pot = remote_lmppots[match][0]
        remote_json = remote_pot.build_model().json()
        local_json = local_pot.build_model().json()
        if local_json != remote_json:
            allgood = False
            print(f'{k+1}/{len(local_lmppots)}')
            print('different data for', local_pot.id)
            print()
            print('local:')
            print(local_json)
            print('remote:')
            print(remote_json)
            print()
            same = ''
            for i in range(len(local_json)):
                if local_json[i] != remote_json[i]:
                    print(local_json[:i+1])
                    break
            break

if allgood:
    print('All records match')

All records match


In [32]:
for fname in Path('C:/Users/lmh1/Documents/library/potential_LAMMPS').glob('*.json'):
    with open(fname, encoding='UTF-8') as f:
        content = f.read()
    if '"1e-06"' in content:
        content = content.replace('"1e-06"', '1e-06')
        with open(fname, 'w', encoding='UTF-8') as f:
            f.write(content)

### 4.3. Update records

In [28]:
db.save_lammps_potential(remote_pot, verbose=True, overwrite=True)

potential_LAMMPS record named 2008--Chenoweth-K--C-H-O--LAMMPS--ipr1 updated in C:\Users\lmh1\Documents\library


## 5. Compare Potential and potential_LAMMPS records

In [225]:
# Only load local because local and remote records should match now
db_local = potentials.Database(load=['potentials','lammps_potentials'], username=username, password=password,
                               verbose=True, remote=False, local=True, status=None, kim_models=[])

Loaded 642 local potentials
Loaded 454 local LAMMPS potentials
Building lammps potentials for kim models
No kim models identified


### 5.1. Check common fields

- Do keys and ids match for both the potential and implementation?
- Are all but known missing entries present in both records?
- Do the artifacts match?

In [235]:
bad_imps = [
    '2009--Kim-H-K--Fe-Ti-C--LAMMPS--ipr1',
    '2012--Jelinek-B--Al-Si-Mg-Cu-Fe--LAMMPS--ipr1',
    '2013--Gao-H--AgTaO3--LAMMPS--ipr1',
    '2014--Liyanage-L-S-I--Fe-C--LAMMPS--ipr1',
    '2015--Ko-W-S--Ni-Ti--LAMMPS--ipr1',
    '2015--Pascuet-M-I--Al-U--LAMMPS--ipr1'
]

In [236]:
for lammps_potential in db_local.lammps_potentials:
    
    # Find potential record corresponding to the lammps potential
    potential = db_local.potentials_df[(db_local.potentials_df.id == lammps_potential.potid)
                                      |(db_local.potentials_df.key == lammps_potential.potkey)]
    if len(potential) != 1:
        print(len(potential), 'matches found', lammps_potential.id)
    else:
        potential = potential.iloc[0]
    
    # Check that potential key and id are the same in both records
    if potential.id != lammps_potential.potid:
        print('potential id mismatch', lammps_potential.id)
        print(potential.id, lammps_potential.potid)
    if potential.key != lammps_potential.potkey:
        print('potential key mismatch', lammps_potential.id)
        print(potential.key, lammps_potential.potkey)
    
    # Find potential's implementation matching the lammps potential
    for implementation in potential.implementations:
        match = False
        if implementation.id == lammps_potential.id or implementation.key == lammps_potential.key:
            match = True
            break
    
    # Issue message if no matching implementation listing found
    if match is False:
        if lammps_potential.id not in bad_imps:
            print('No listing found for', lammps_potential.id)
        continue
    
    # Check that implementation key and id match
    if implementation.id != lammps_potential.id:
        print('implementation id mismatch')
        print(implementation.id, lammps_potential.id)
    if implementation.key != lammps_potential.key:
        print('implementation key mismatch', lammps_potential.id)
        print(implementation.key, lammps_potential.key)
    
    # Check that the artifacts match between lammps potential and implementation
    if len(implementation.artifacts) != len(lammps_potential.artifacts):
        print('different numbers of artifacts', lammps_potential.id)
    else:
        for n in range(len(implementation.artifacts)):
            if implementation.artifacts[n].asmodel().json() != lammps_potential.artifacts[n].asmodel().json():
                print('artifact mismatch', lammps_potential.id, lammps_potential.artifacts[n].filename)

In [237]:
# Reverse search for implementation listings with no lammps potential records
for potential in db_local.potentials:
    for implementation in potential.implementations:
        if 'LAMMPS' in implementation.id:
            
            lammps_potential = db_local.lammps_potentials_df[db_local.lammps_potentials_df.id == implementation.id]
            if len(lammps_potential) == 0:
                print('No lammps potential found for', implementation.id)

No match for 2016--Lee-E--Si-O--LAMMPS--ipr1
No match for 2016--Lee-E--Ti-O--LAMMPS--ipr1
No match for 2017--Beeler-B--U-Si--LAMMPS--ipr1
No match for 2017--Lee-E--Li-Mn-O--LAMMPS--ipr1
No match for 2018--Lee-E--Li-Co-O--LAMMPS--ipr1


## 6. Compare LAMMPS parameter files between library and website

In [63]:
webroot = 'e:/website/IPR-website'

for lmppot in local_lmppots:
    for artifact in lmppot.artifacts:
        libpath = Path(db.local_database.host, 'potential_LAMMPS', lmppot.id, artifact.filename)
        webpath = Path(artifact.url.replace('https://www.ctcms.nist.gov', webroot))
        if not libpath.is_file():
            print(f'Library file {lmppot.id} {artifact.filename} not found')
        elif not webpath.is_file():
            print(f'Website file {lmppot.id} {artifact.filename} not found')
        else:
            if libpath.suffix in ['.pdf', '.docx', '.gz', '.zip']:
                continue
            try:
                with open(libpath) as f:
                    libcontent = f.read()
                with open(webpath) as f:
                    webcontent = f.read()
                if libcontent != webcontent:
                    print(f'Content differs for file {lmppot.id} {artifact.filename}')
            except:
                print(f'Failed to read file {lmppot.id} {artifact.filename}')

In [57]:
libpath.suffix

'.t'

In [59]:
fn = Path(db.local_database.host, 'potential_LAMMPS', '2008--Kim-Y-M--Ti-C--LAMMPS--ipr1', 'library.meam')
with open(fn) as f:
    print(f.read())

# DATE: 2015-02-12 CONTRIBUTOR: Kim, Y.-M. and B.-J. Lee, Modified embedded-atom method interatomic potentials for the Ti–C and Ti–N binary systems. Acta Materialia, 2008. 56(14): p. 3481-3489. 
# meam data from vax files fcc,bcc,dia    11/4/92
# elt        lat     z       ielement     atwt
# alpha      b0      b1      b2           b3      alat    esub    asub
# t0         t1              t2           t3              rozero  ibar


'Ti'	'hcp'	12	1	47.880 
4.7194566335 	2.70 	1.00 	3.00 	1.00 	2.9200000000 	4.87 	0.66 
1	6.80 	-2.00 	-12.00 	1.00 	3

'N'	'dim'	1	1	14.007 
5.9600000000 	2.75 	4.00 	4.00 	4.00 	1.1000000000 	4.88 	1.80 
1	0.05 	1.00 	0.00 	18.00 	3

'C'	'dia'	4	1	12.011 
4.3651999166 	4.25 	2.80 	2.00 	5.00 	3.5564776582 	7.37 	1.18 
1	3.20 	1.44 	-4.48 	6.00 	3

