## TODO
1. Ask Martin how Wyckoff positions should be represented (per site? reduced?)
2. AiiDA CIF database
    1. Convert to `pymatgen.core.Structure` objects
    2. Create a MongoDB collection for the `pymatgen.core.Structure` objects
3. Fingerprint MongoDB
    1. Make a collection for each flavor of fingerprint
    2. Add all related metadata, tolerances, etc. (spglib / STRUCTURE TIDY / pymatgen)
4. Create RESTful interface for structures, fingerprinting, etc.
    1. GET fingerprints by sending structure data (CIF, `pymatgen.core.Structure`, etc.)
    2. GET structures by query
    3. GET structure comparisons
        1. pymatgen comparison
        2. XRD subtraction using pymatgen XRD
        3. ...

In [1]:
import pymatgen
import pymatgen.symmetry.structure
import re
class StidyParser:
    '''
    Parser for the text output of STRUCTURE TIDY (the ``STIDY`` command of PLATON).

    The raw text is stored in ``output`` and every property re-parses it on
    access; nothing is cached.

    Properties that describe a single header field return one value; properties
    documented as "one entry per output structure" return a list.

    Raises:
        ValueError: from any single-valued property whose section is missing
            from the output (e.g. PLATON failed and printed something else).
    '''

    # One row of the site table:
    #   <element><index>  <wyckoff>  <x>  <y>  <z>  <species>  <number>
    # - the index is captured as a whole (``\d+``) so labels such as 'Mo12'
    #   keep all their digits;
    # - the Wyckoff field allows up to 6 characters so three-digit
    #   multiplicities such as '192(j)' are not skipped;
    # - coordinates may be decimals or fractions such as '1/3'.
    _SITE_REGEXP = re.compile(
        r'\s+([a-zA-Z]{1,2})(\d+)\s+([\w\(\)]{4,6})'
        r'\s+([\d\/\.]+)\s+([\d\/\.]+)\s+([\d\/\.]+)'
        r'\s+(\w+)\s+(\d+)'
    )

    def __init__(self, stidy_output: str) -> None:
        '''
        Args:
            stidy_output (str): Complete text printed by the STIDY run
        '''
        self.output = stidy_output

    def _search(self, pattern: str, flags: int = 0):
        '''Return the first match of ``pattern`` in the output or raise ValueError.'''
        match = re.search(pattern, self.output, flags)
        if match is None:
            raise ValueError(
                'STRUCTURE TIDY output does not contain the expected pattern %r' % pattern
            )
        return match

    @staticmethod
    def _parse_coordinate(text: str) -> float:
        '''Convert a coordinate written as a decimal ('0.25') or a fraction ('1/4').'''
        if '/' in text:
            parts = text.split('/')
            return float(parts[0]) / float(parts[1])
        return float(text)

    @property
    def formula(self) -> str:
        '''str: Reduced chemical formula'''
        return self._search(r'Structure Tidy Results for\s*(\w*)').group(1)

    @property
    def space_group(self) -> str:
        '''str: Space group descriptor e.g. "P 1"'''
        match = self._search(r'Structure Tidy Results for\s*\w*\s*(.*)')
        return str(match.group(1).strip())

    @property
    def axes_change(self) -> tuple:
        '''tuple: New axes in terms of originals e.g. ('a', 'b+c', 'b')'''
        match = self._search(r'Axes changed to : (.*)')
        return tuple(match.group(1).strip().split(','))

    @property
    def pearson(self) -> str:
        '''str: Pearson code e.g. "aP"'''
        return self._search(r'Pearson code : (\w*)').group(1)

    @property
    def cell(self) -> tuple:
        '''
        tuple: abc and angles of the standardized cell, as ``(abc, angles)``
               where ``abc`` holds the first three numbers of the "Cell :" line
               and ``angles`` holds the remaining ones
        '''
        line = self._search(r'^Cell :.*$', re.MULTILINE).group(0)
        values = [float(value) for value in line.split(':')[-1].split()]
        return (tuple(values[:3]), tuple(values[3:]))

    @property
    def itc_number(self) -> int:
        '''int: Number following "Number in IT :" (used as the space group number in the fingerprint)'''
        return int(self._search(r'\s*Number in IT :\s*(\d+)').group(1))

    @property
    def setting(self) -> list:
        '''
        list: Tuples with the three comma-separated fields that follow "Setting"
              One entry per output structure
        '''
        return re.findall(r'Setting\s*([-\w]*),([-\w]*),([-\w]*)', self.output)

    @property
    def origin(self) -> list:
        '''
        list: New origin in the old cell, as tuples of floats
        One entry per output structure
        '''
        # Capture only what is inside the parentheses, so the first coordinate
        # is kept even when it touches the bracket, e.g. "Origin (-0.5000 ...)".
        inner_texts = re.findall(r'Origin\s*\(([^)]*)\)', self.output)
        return [tuple(map(float, inner.split())) for inner in inner_texts]

    @property
    def gamma(self) -> list:
        '''
        list: Gamma values for standardization minimization
              One entry per output structure
        '''
        # The value is the last whitespace-separated token on the "Gamma =" line.
        lines = re.findall(r'Gamma\s*=\s*.*', self.output)
        return [float(line.split()[-1]) for line in lines]

    @property
    def sites(self) -> list:
        '''
        list: Site data including:
            str: numbered species e.g. 'Mo1'
            str: wyckoff site e.g. '2(d)'
            float: x
            float: y
            float: z
            str: species e.g. 'Mo'
            int: number
        One set of sites per output structure
        '''
        site_blocks = []
        # Each site table follows a header containing the word "Wyckoff", so
        # splitting on it isolates one table per chunk; chunks without any
        # site row (e.g. the file header) are ignored.
        for block in self.output.split('Wyckoff'):
            rows = [
                (
                    element + index,
                    wyckoff,
                    self._parse_coordinate(x),
                    self._parse_coordinate(y),
                    self._parse_coordinate(z),
                    species,
                    int(number),
                )
                for element, index, wyckoff, x, y, z, species, number
                in self._SITE_REGEXP.findall(block)
            ]
            if rows:
                site_blocks.append(rows)
        return site_blocks

    @property
    def wyckoff(self) -> list:
        '''
        list: Wyckoff letters of the sites, as one list of str per structure
              (the text between the parentheses of e.g. '2(d)')
        One set of sites per output structure
        '''
        return [
            [site[1].split('(')[-1].split(')')[0] for site in sites]
            for sites in self.sites
        ]

    @property
    def summary_and_remarks(self) -> str:
        '''str: Unprocessed summary and remarks cut from output file'''
        match = self._search(r'^Summary and Remarks.*-\n', re.MULTILINE | re.DOTALL)
        return match.group(0)

    @property
    def structure(self) -> 'list[pymatgen.core.Structure]':
        '''
        list: pymatgen.core.Structure objects built from the standardized cell
              and the fractional coordinates of each site table
        One entry per output structure
        '''
        abc, angles = self.cell
        structures = []
        for sites in self.sites:
            # Lattice/Structure are taken from pymatgen.core (not the package
            # root) and the lattice is built with from_parameters, which is
            # available in both old and current pymatgen releases.
            lattice = pymatgen.core.Lattice.from_parameters(*abc, *angles)
            structures.append(
                pymatgen.core.Structure(
                    lattice=lattice,
                    species=[site[-2] for site in sites],
                    coords=[tuple(site[2:5]) for site in sites],
                )
            )
        return structures

    @property
    def wyckoff_fingerprint(self) -> list:
        '''
        list: Wyckoff fingerprints (str) of the form
              [IT SPACEGROUP]_[_-SEPARATED WYCKOFF OCCUPANCIES]
        One entry per output structure
        '''
        sequences = self.wyckoff
        if not sequences:
            # No site table found: nothing to fingerprint (and no need to
            # require the "Number in IT" field).
            return []
        prefix = str(self.itc_number)
        return ['_'.join([prefix] + sequence) for sequence in sequences]

In [2]:
import pymatgen
import distutils
import pymatgen.io.cif
import tempfile
import subprocess
import os
import pathlib
def stidy(structure: pymatgen.core.Structure) -> StidyParser:
    '''
    Run STRUCTURE TIDY as implemented in the PLATON software package.
    PLATON must either be in the PATH or in ../bin.

    The structure is written to a temporary CIF file, PLATON's ADDSYM_SHX is
    run on it so that symmetry is recognized, and STIDY is then run on the
    resulting ``*_pl.spf`` file.

    References:
        A. L. Spek (2009). Acta Cryst., D65, 148-155.
        E. Parthé and L. M. Gelato (1984). Acta Cryst., A40, 169-183.
        L. M. Gelato and E. Parthé (1987). J. Appl. Cryst. 20, 139-143.
        S-Z. Hu and E. Parthé (2004). Chinese J. Struct. Chem. 23, 1150-1160.

    Args:
        structure (pymatgen.core.Structure): Pymatgen Structure object for the (probably) untidy structure
    Returns:
        StidyParser: Parser wrapping the text printed by the STIDY run; the
            standardized structure(s) are available from its ``structure`` property

    '''
    # Local import: only needed here, keeps this cell self-contained.
    import shutil

    # Prefer a PLATON found on the PATH, otherwise fall back to the bundled
    # copy. The resolved path is what actually gets executed below.
    platon = shutil.which('platon') or '../bin/platon'

    structure_cif = str(pymatgen.io.cif.CifWriter(structure))
    with tempfile.NamedTemporaryFile(suffix='.cif') as temp_file:
        # write temporary cif file
        temp_file.write(structure_cif.encode('utf-8'))
        temp_file.flush()
        temp_file_path = pathlib.Path(temp_file.name)
        # ADDSYM_SHX output file that PLATON is expected to write next to the CIF
        temp_file_spf = temp_file_path.parent / (temp_file_path.stem + '_pl.spf')
        try:
            # run ADDSYM_SHX to make PLATON recognize symmetries
            # (its console output is not needed, only the .spf file)
            subprocess.run([platon, '-o', temp_file.name],
                           input=b'ADDSYM_SHX',
                           stdout=subprocess.PIPE,
                           stderr=subprocess.STDOUT)
            # call STIDY on the ADDSYM_SHX output
            stidy_run = subprocess.run([platon, '-o', str(temp_file_spf)],
                                       input=b'STIDY',
                                       stdout=subprocess.PIPE,
                                       stderr=subprocess.STDOUT)
        finally:
            # The intermediate .spf file is not covered by the temporary-file
            # context manager, so remove it explicitly.
            # NOTE(review): PLATON may leave other by-products next to the
            # CIF; only files whose names are known here are removed.
            if temp_file_spf.exists():
                temp_file_spf.unlink()
    stidy_output = stidy_run.stdout.decode('utf-8')

    # clean up files
    if pathlib.Path('check.def').exists():
        os.remove('check.def')

    return StidyParser(stidy_output)

In [3]:
import pymatgen
import pymatgen.symmetry.analyzer
def wyckoff_fingerprint(structure: pymatgen.core.Structure,
                        symprec: float = 1e-5,
                        angle_tolerance: float = -1) -> str:
    '''
    Build a Wyckoff fingerprint of the form
    [SPACE GROUP NUMBER]_[_-SEPARATED WYCKOFF LETTERS] using pymatgen's
    SpacegroupAnalyzer.

    Args:
        structure (pymatgen.core.Structure): Structure to fingerprint
        symprec (float): Passed to SpacegroupAnalyzer as ``symprec``
        angle_tolerance (float): Passed to SpacegroupAnalyzer as ``angle_tolerance``
    Returns:
        str: Space group number followed by the entries of the symmetry
            dataset's ``wyckoffs`` field, joined with underscores
    '''
    spacegroup_analyzer = pymatgen.symmetry.analyzer.SpacegroupAnalyzer(
        structure,
        symprec=symprec,
        angle_tolerance=angle_tolerance)
    dataset = spacegroup_analyzer.get_symmetry_dataset()
    # The dataset is a dict in older spglib releases and an object with
    # attributes in newer ones; support both access styles.
    if hasattr(dataset, 'wyckoffs'):
        wyckoffs = dataset.wyckoffs
    else:
        wyckoffs = dataset['wyckoffs']
    return '_'.join([str(spacegroup_analyzer.get_space_group_number())] +
                    list(wyckoffs))

In [4]:
# Notebook-level tolerances. They are defined here but not referenced by any
# statement in this cell.
SYMPREC = 1e-3
ANGLE_TOLERANCE = 5

# Four CIF representations of the same MoS2 entry (mp-1434).
cifs = [
    '../data/MoS2_mp-1434_computed.cif',
    '../data/MoS2_mp-1434_conventional_standard.cif',
    '../data/MoS2_mp-1434_primitive.cif',
    '../data/MoS2_mp-1434_symmetrized.cif',
]

# Parse every CIF as-is (no reduction to a primitive cell) into one flat list.
structures = []
for cif in cifs:
    parser = pymatgen.io.cif.CifParser(cif)
    structures.extend(parser.get_structures(primitive=False))

# Show each input structure next to its first tidied structure and the
# matching Wyckoff fingerprint, separated by a row of 80 asterisks.
for structure in structures:
    sp = stidy(structure)  # StidyParser
    ts, wf = sp.structure[0], sp.wyckoff_fingerprint[0]  # tidy Structure, Wyckoff fingerprint
    display(structure, ts, wf)
    print('*' * 80)

Structure Summary
Lattice
    abc : 7.3409526 7.34095259 7.3409526
 angles : 25.103119069999998 25.103119139999997 25.103119100000026
 volume : 62.6493334321811
      A : 3.11438974864272 0.0 6.64756809434818
      B : 1.4800076654772434 2.7402555974884333 6.647568089097668
      C : 0.0 0.0 7.3409526
PeriodicSite: Mo (4.5934, 2.7396, 20.6315) [0.9998, 0.9998, 0.9998]
PeriodicSite: S (1.8678, 1.1140, 8.3893) [0.4065, 0.4065, 0.4065]
PeriodicSite: S (1.1931, 0.7116, 5.3589) [0.2597, 0.2597, 0.2597]

Structure Summary
Lattice
    abc : 3.1906 3.1906 21.3183
 angles : 90.0 90.0 119.99999999999999
 volume : 187.94376510952148
      A : 3.1906 0.0 1.9536790386797724e-16
      B : -1.5952999999999993 2.7631406533146303 1.9536790386797724e-16
      C : 0.0 0.0 21.3183
PeriodicSite: S (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
PeriodicSite: S (0.0000, 0.0000, 3.1306) [0.0000, 0.0000, 0.1469]
PeriodicSite: Mo (0.0000, 0.0000, 8.6714) [0.0000, 0.0000, 0.4068]

'160_a_a_a'

********************************************************************************


Structure Summary
Lattice
    abc : 3.19064344 3.1906434400000006 21.31819994
 angles : 90.0 90.0 119.99999999999999
 volume : 187.9480006902036
      A : 3.19064344 0.0 1.95370563800825e-16
      B : -1.5953217199999994 2.763178273458171 1.95370563800825e-16
      C : 0.0 0.0 21.31819994
PeriodicSite: Mo (1.5953, 0.9211, 7.1013) [0.6667, 0.3333, 0.3331]
PeriodicSite: Mo (-0.0000, 1.8421, 14.2074) [0.3333, 0.6667, 0.6664]
PeriodicSite: Mo (0.0000, 0.0000, 21.3134) [0.0000, 0.0000, 0.9998]
PeriodicSite: S (-0.0000, 1.8421, 1.5605) [0.3333, 0.6667, 0.0732]
PeriodicSite: S (0.0000, 0.0000, 5.5361) [0.0000, 0.0000, 0.2597]
PeriodicSite: S (0.0000, 0.0000, 8.6666) [0.0000, 0.0000, 0.4065]
PeriodicSite: S (1.5953, 0.9211, 12.6421) [0.6667, 0.3333, 0.5930]
PeriodicSite: S (1.5953, 0.9211, 15.7727) [0.6667, 0.3333, 0.7399]
PeriodicSite: S (-0.0000, 1.8421, 19.7482) [0.3333, 0.6667, 0.9264]

Structure Summary
Lattice
    abc : 3.1906 3.1906 21.3182
 angles : 90.0 90.0 119.99999999999999
 volume : 187.94288350186463
      A : 3.1906 0.0 1.9536790386797724e-16
      B : -1.5952999999999993 2.7631406533146303 1.9536790386797724e-16
      C : 0.0 0.0 21.3182
PeriodicSite: S (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
PeriodicSite: S (0.0000, 0.0000, 3.1306) [0.0000, 0.0000, 0.1469]
PeriodicSite: Mo (0.0000, 0.0000, 8.6712) [0.0000, 0.0000, 0.4068]

'160_a_a_a'

********************************************************************************


Structure Summary
Lattice
    abc : 7.3409526 7.3409526 7.3409526
 angles : 25.10311911999998 25.10311911999998 25.103119119999967
 volume : 62.64933359662721
      A : 3.1143897463222805 0.0 6.647568095435307
      B : 1.4800076695142357 2.7402556067229202 6.647568095435307
      C : 0.0 0.0 7.3409526
PeriodicSite: Mo (4.5934, 2.7396, 20.6315) [0.9998, 0.9998, 0.9998]
PeriodicSite: S (1.8678, 1.1140, 8.3893) [0.4065, 0.4065, 0.4065]
PeriodicSite: S (1.1931, 0.7116, 5.3589) [0.2597, 0.2597, 0.2597]

Structure Summary
Lattice
    abc : 3.1906 3.1906 21.3183
 angles : 90.0 90.0 119.99999999999999
 volume : 187.94376510952148
      A : 3.1906 0.0 1.9536790386797724e-16
      B : -1.5952999999999993 2.7631406533146303 1.9536790386797724e-16
      C : 0.0 0.0 21.3183
PeriodicSite: S (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
PeriodicSite: S (0.0000, 0.0000, 3.1306) [0.0000, 0.0000, 0.1469]
PeriodicSite: Mo (0.0000, 0.0000, 8.6714) [0.0000, 0.0000, 0.4068]

'160_a_a_a'

********************************************************************************


Structure Summary
Lattice
    abc : 3.19064343 3.19064343 21.31819994
 angles : 90.0 90.0 119.99999999999999
 volume : 187.94799951208387
      A : 3.19064343 0.0 1.953705631885016e-16
      B : -1.5953217149999994 2.763178264797917 1.953705631885016e-16
      C : 0.0 0.0 21.31819994
PeriodicSite: Mo (0.0000, 0.0000, 0.0048) [0.0000, 0.0000, 0.0002]
PeriodicSite: Mo (1.5953, 0.9211, 7.1108) [0.6667, 0.3333, 0.3336]
PeriodicSite: Mo (0.0000, 1.8421, 14.2169) [0.3333, 0.6667, 0.6669]
PeriodicSite: S (0.0000, 0.0000, 12.6516) [0.0000, 0.0000, 0.5935]
PeriodicSite: S (1.5953, 0.9211, 19.7577) [0.6667, 0.3333, 0.9268]
PeriodicSite: S (0.0000, 1.8421, 5.5455) [0.3333, 0.6667, 0.2601]
PeriodicSite: S (0.0000, 0.0000, 15.7821) [0.0000, 0.0000, 0.7403]
PeriodicSite: S (1.5953, 0.9211, 1.5700) [0.6667, 0.3333, 0.0736]
PeriodicSite: S (0.0000, 1.8421, 8.6761) [0.3333, 0.6667, 0.4070]

Structure Summary
Lattice
    abc : 3.1906 3.1906 21.3182
 angles : 90.0 90.0 119.99999999999999
 volume : 187.94288350186463
      A : 3.1906 0.0 1.9536790386797724e-16
      B : -1.5952999999999993 2.7631406533146303 1.9536790386797724e-16
      C : 0.0 0.0 21.3182
PeriodicSite: S (0.0000, 0.0000, 0.0000) [0.0000, 0.0000, 0.0000]
PeriodicSite: S (0.0000, 0.0000, 3.1306) [0.0000, 0.0000, 0.1469]
PeriodicSite: Mo (0.0000, 0.0000, 8.6712) [0.0000, 0.0000, 0.4068]

'160_a_a_a'

********************************************************************************
