# Wyckoff from CSD Entry

12th May - A notebook for converting a CSD entry to a .cif file and then to an AFLOW label.

Read CSD entry and write to .cif

In [10]:
from ccdc import io, utilities
import re 

# Open the CSD database, summarise one entry, and dump its crystal to tmp.cif.
# NOTE: this is a machine-specific absolute path; adjust it for your CSD installation.
CSD_DATABASE_PATH = '/Applications/CCDC/CSD_2022/DATA/CSD_543/as543be_ASER.sqlite'
csd_reader = io.EntryReader(CSD_DATABASE_PATH)
# entry = csd_reader.entry('CAXZEG')
# entry = csd_reader.entry('ADAGES')

entry_name = 'CAXZEG'
entry = csd_reader.entry(entry_name)
crystal = entry.crystal

print(f'SMILES: {crystal.molecule.smiles}')
print(f'Crystal System: {crystal.crystal_system}')
print(f'Spacegroup Symbol: {crystal.spacegroup_symbol}')
print(f'Spacegroup Number: {crystal.spacegroup_number_and_setting}')
print(f'Has disorder: {crystal.has_disorder}')
print(f'Disorder details: {entry.disorder_details}')

# An element symbol is one capital letter optionally followed by one lowercase
# letter. The previous pattern kept only the capital, so two-letter symbols
# such as Cl or Br were truncated to C / B. The set removes repeated symbols.
elems = sorted(set(re.findall(r"[A-Z][a-z]?", crystal.formula)))
print(f'Sorted elements: {elems}')

# One line per symmetry operator, followed by the atoms sitting on its special positions.
print('\n'.join(
    '%-17s %s' % (op, utilities.print_set(crystal.atoms_on_special_positions(op)))
    for op in crystal.symmetry_operators
))

# Keep the CIF text in memory (used by later cells) and also write it to disk.
cif_string = crystal.to_string(format='cif')
with open('tmp.cif', 'w') as f:
    f.write(cif_string)


SMILES: CCCC1=C2N=C(NC(=O)C2=NN1C)c1cc(ccc1OCC)S(=O)(=O)N1CCN(C)CC1
Crystal System: monoclinic
Spacegroup Symbol: P21/n
Spacegroup Number: (14, 2)
Has disorder: False
Disorder details: None
Sorted elements: ['C', 'H', 'N', 'O', 'S']
x,y,z             {Atom(C1), Atom(C10), Atom(C11), Atom(C12), Atom(C13), Atom(C14), Atom(C15), Atom(C16), Atom(C17), Atom(C18), Atom(C19), Atom(C2), Atom(C20), Atom(C21), Atom(C22), Atom(C3), Atom(C4), Atom(C5), Atom(C6), Atom(C7), Atom(C8), Atom(C9), Atom(H1), Atom(H10), Atom(H11), Atom(H12), Atom(H13), Atom(H14), Atom(H15), Atom(H16), Atom(H17), Atom(H18), Atom(H19), Atom(H2), Atom(H20), Atom(H21), Atom(H22), Atom(H23), Atom(H24), Atom(H25), Atom(H26), Atom(H27), Atom(H28), Atom(H29), Atom(H3), Atom(H30), Atom(H4), Atom(H5), Atom(H6), Atom(H7), Atom(H8), Atom(H9), Atom(N1), Atom(N2), Atom(N3), Atom(N4), Atom(N5), Atom(N6), Atom(O1), Atom(O2), Atom(O3), Atom(O4), Atom(S1)}
1/2-x,1/2+y,1/2-z set()
-x,-y,-z          set()
1/2+x,1/2-y,1/2+z set()


Load pymatgen structure from .cif

In [64]:
from wren_code import utils
from pymatgen.core import Composition, Structure
from pymatgen.io.cif import CifParser, CifFile

class CifStringParser(CifParser):
    """CifParser variant that is constructed from CIF text held in memory.

    Initialisation is delegated to ``CifParser.__init__`` instead of
    re-implementing its internals here, so the parser state (feature flags,
    warnings, sanitised data blocks) is set up by the installed pymatgen
    version itself.
    """

    def __init__(self, cif_string, occupancy_tolerance=1.0, site_tolerance=1e-4):
        """
        Args:
            cif_string (str): Complete text of a CIF file.
            occupancy_tolerance (float): If total occupancy of a site is between 1
                and occupancy_tolerance, the occupancies will be scaled down to 1.
            site_tolerance (float): This tolerance is used to determine if two
                sites are sitting in the same position, in which case they will be
                combined to a single disordered site. Defaults to 1e-4.

        Raises:
            TypeError: If ``cif_string`` is not a ``str``.
        """
        if not isinstance(cif_string, str):
            raise TypeError('cif_string needs to be a string!')

        # Local import: elsewhere in this notebook the name `io` is bound to
        # `ccdc.io`, so the standard-library module is not imported at top level.
        from io import StringIO

        # CifParser reads from a text stream when it is not handed a path
        # (see the pymatgen CifParser documentation).
        super().__init__(
            StringIO(cif_string),
            occupancy_tolerance=occupancy_tolerance,
            site_tolerance=site_tolerance,
        )
            
# Parse the in-memory CIF text and take the first structure it yields.
# occupancy_tolerance=10 widens the range in which site occupancies above 1
# are scaled down to 1 (see the CifStringParser docstring).
struct = CifStringParser(
    cif_string,
    occupancy_tolerance=10,
).get_structures()[0]
# Alternative loaders, kept for reference:
# struct = CifParser('tmp.cif', occupancy_tolerance=10).get_structures()[0]
# struct = Structure.from_str(cif_string, fmt="cif", merge_tol=0.1)

# Derive the label through the spglib-based helper and show it.
wyckoff_label = utils.get_aflow_label_spglib(struct)
print(wyckoff_label)

A20BC6D16E4F2G_tP200_130_5g_a_2cg_4g_g_f_c:C-Cu-F-H-N-O-Ti


17th May - A notebook for converting a CSD entry to an AFLOW label using AFLOW directly instead of spglib.

In [25]:
import pandas as pd

# Load the spglib-vs-AFLOW comparison table and reduce `csd_spacegroup` to an
# integer: take the text before the first comma and drop its first character
# (presumably the column holds text such as "(14, 2)" — verify against the CSV).
df = pd.read_csv('aflow_vs_spg.csv').assign(
    csd_spacegroup=lambda frame: [
        int(fields[0][1:])
        for fields in frame['csd_spacegroup'].str.split(',')
    ]
)
df

Unnamed: 0,index,identifier,smiles,csd_spacegroup,wyckoff_spg,wyckoff_aflow
0,0,AABHTZ,CC(=O)NN1C=NN=C1N(N=Cc1c(Cl)cccc1Cl)C(C)=O,2,A13B2C12D6E2_aP70_2_13i_2i_12i_6i_2i:C-Cl-H-N-O,A13B2C12D6E2_aP70_2_13i_2i_12i_6i_2i:C-Cl-H-N-O
1,1,AACFAZ10,COC1=C(C(OC1=O)c1ccccc1Cl)C(C)=NN=C(C)C1=C(OC)...,60,A13BC11DE3_oP232_60_13d_d_11d_d_3d:C-Cl-H-N-O,A13BC11DE3_oP232_60_13d_d_11d_d_3d:C-Cl-H-N-O
2,2,AACMHX10,CC(=O)OC(=C1CCCCC1c1ccccc1)c1ccccc1,61,A21B22C2_oP360_61_21c_22c_2c:C-H-O,A21B22C2_oP360_61_21c_22c_2c:C-H-O
3,3,AADAMC,[Br-].[NH3+]C1(C2CC3CC(C2)CC1C3)C(O)=O,14,AB11C18DE2_mP132_14_e_11e_18e_e_2e:Br-C-H-N-O,AB11C18DE2_mP132_14_e_11e_18e_e_2e:Br-C-H-N-O
4,4,AADMPY10,Cc1[nH+]c(N)nc(N)c1C12CC3CC(CC(C3)C1)C2.CCS(=O...,2,A17B28C4D3E_aP106_2_17i_28i_4i_3i_i:C-H-N-O-S,A17B28C4D3E_aP106_2_17i_28i_4i_3i_i:C-H-N-O-S
...,...,...,...,...,...,...
853,103,ABZCTD,O=C1CCc2ccccc2C2C1C1C2c2ccccc2CCC1=O,60,A11B10C_oP176_60_11d_10d_d:C-H-O,A11B10C_oP176_60_11d_10d_d:C-H-O
854,104,ABZEHX10,C(OCc1ccccc1)C1OC=CC(OCc2ccccc2)C1OCc1ccccc1,4,A27B28C4_mP118_4_27a_28a_4a:C-H-O,A27B28C4_mP118_4_27a_28a_4a:C-H-O
855,105,ABZNPS,CN(C)c1ccc(CS(=O)(=O)c2ccc(cc2)N(=O)=O)cc1,2,A15B16C2D4E_aP76_2_15i_16i_2i_4i_i:C-H-N-O-S,A15B16C2D4E_aP76_2_15i_16i_2i_4i_i:C-H-N-O-S
856,106,ABZSLM,[Cl-].[NH3+]c1cccc(c1)S(N)(=O)=O,19,A6BC9D2E2F_oP84_19_6a_a_9a_2a_2a_a:C-Cl-H-N-O-S,A6BC9D2E2F_oP84_19_6a_a_9a_2a_2a_a:C-Cl-H-N-O-S


In [34]:
# from smi2wyk.wren_code.utils import get_aflow_label_with_aflow_from_ccdc_crystal
from string import digits
# Look up a single entry whose spglib and AFLOW labels disagree.
entry_name = 'ABEDOA'
entry = csd_reader.entry(entry_name)
crystal = entry.crystal

# Translation table that deletes every decimal digit from a string.
remove_digits = str.maketrans("", "", digits)

# print(crystal.formula)
# Strip digits and '.' from each space-separated formula token, then sort.
elems = sorted(
    token.translate(remove_digits).replace('.', '')
    for token in crystal.formula.split(" ")
)
print(elems)
print(entry.molecule.smiles)
print(crystal.to_string('cif'))
# wyckoff_label_aflow = get_aflow_label_with_aflow_from_ccdc_crystal(crystal)

['C', 'H', 'N', 'O', 'S']
OCCCOCC1=CSC(=N1)C1=NC(=C(S1)C1=C(COCCCO)N=C(S1)C1=NC(=CS1)COCCCO)COCCCO

#######################################################################
#
#                 Cambridge Crystallographic Data Centre
#                                CCDC 
#
#######################################################################
#
# If this CIF has been generated from an entry in the Cambridge 
# Structural Database, then it will include bibliographic, chemical, 
# crystal, experimental, refinement or atomic coordinate data resulting 
# from the CCDC's data processing and validation procedures.
#
#######################################################################

data_ABEDOA
_symmetry_cell_setting           triclinic
_symmetry_space_group_name_H-M   'P -1'
_symmetry_Int_Tables_number      2
_space_group_name_Hall           '-P 1'
loop_
_symmetry_equiv_pos_site_id
_symmetry_equiv_pos_as_xyz
1 x,y,z
2 -x,-y,-z
_cell_length_a                   4.99020(10)
_cell_length_b 

In [26]:
# Rows where the spglib-derived and AFLOW-derived labels are not identical.
# Uncomment the two lines below to compare only the part before ':'
# (i.e. ignoring the trailing element list):
# df.wyckoff_spg = df.wyckoff_spg.str.split(':').str[0]
# df.wyckoff_aflow = df.wyckoff_aflow.str.split(':').str[0]
df[df['wyckoff_spg'].ne(df['wyckoff_aflow'])]

Unnamed: 0,index,identifier,smiles,csd_spacegroup,wyckoff_spg,wyckoff_aflow
8,8,AANHOX,COc1ccc(cc1)C=NO.COc1ccc(cc1)C=NO,33,A8BC2_oP88_33_16a_2a_4a:C-N-O,A8BC2_oP88_33_16a_2a_4a:C-H-N-O
10,10,AAPYPE,CC(=O)C=C(C)Nc1cncnc1N,14,A9B4C_mP56_14_9e_4e_e:C-N-O,A9B4C_mP56_14_9e_4e_e:C-H-N-O
130,22,ABATRG,COC(=O)C1CC(CC2(C)C1CCC13CC(CCC21)C(Br)(CBr)C3...,19,A2B22C5_oP116_19_2a_22a_5a:Br-C-O,A2B22C5_oP116_19_2a_22a_5a:Br-C-H-O
151,43,ABAYIN,CCOC(CP(=O)(c1ccccc1C)c1ccccc1C)OCC.O,2,A20B29C4D_aP108_2_20i_29i_4i_i:C-H-O-P,A20B29C4D_aP108_2_20i_29i_4i_i:C-H-O.-P
159,51,ABCMHP,CC(=O)OC1C(OC(C)=O)C(O)(CBr)OC(CCl)(CBr)C1OC(C)=O,14,A2B14CD8_mP100_14_2e_14e_e_8e:Br-C-Cl-O,A2B14CD8_mP100_14_2e_14e_e_8e:Br-C-Cl-H-O
164,56,ABEBAK,COC(=O)C1=C(C)NC(=C(C1c1ccc(cc1)C1=CC(=O)c2ccc...,14,A26BC6_mP132_14_26e_e_6e:C-N-O,A26BC6_mP132_14_26e_e_6e:C-H-N-O
177,69,ABECEP,O=C1OC(N2C3C4CC(C=C4)C3C2=O)c2ccccc12,61,A16BC3_oP160_61_16c_c_3c:C-N-O,A16BC3_oP160_61_16c_c_3c:C-H-N-O
181,73,ABEDIU,C=CCOCC1=CSC(=N1)C1=NC(=C(S1)C1=C(COCC=C)N=C(S...,2,A7BCD_aP80_2_28i_4i_4i_4i:C-N-O-S,A7BCD_aP80_2_28i_4i_4i_4i:C-H-N-O-S
184,76,ABEDOA,OCCCOCC1=CSC(=N1)C1=NC(=C(S1)C1=C(COCCCO)N=C(S...,2,A7BC2D_aP44_2_14i_2i_4i_2i:C-N-O-S,A7BC2D_aP44_2_14i_2i_4i_2i:C-H-N-O-S
260,48,ABELOL,CCOC(=O)C1CC2(C(=C)CN1S(=O)(=O)c1ccc(cc1)N(=O)...,2,AB23C3D21E3F7G_aP118_2_i_23i_3i_21i_3i_7i_i:Br...,AB23C3D21E3F7G_aP118_2_i_23i_3i_21i_3i_7i_i:Br...


In [22]:
from ccdc import io, utilities
import re 

# Open the CSD with the reader's default arguments and summarise one entry.
csd_reader = io.EntryReader()

entry_name = 'ABIMUW'
entry = csd_reader.entry(entry_name)
crystal = entry.crystal

print(f'SMILES: {crystal.molecule.smiles}')
print(f'Crystal System: {crystal.crystal_system}')
print(f'Spacegroup Symbol: {crystal.spacegroup_symbol}')
print(f'Spacegroup Number: {crystal.spacegroup_number_and_setting}')
print(f'Has disorder: {crystal.has_disorder}')
print(f'Disorder details: {entry.disorder_details}')

# An element symbol is one capital letter optionally followed by one lowercase
# letter. The previous pattern kept only the capital, so two-letter symbols
# such as Cl or Br were truncated to C / B. The set removes repeated symbols.
elems = sorted(set(re.findall(r"[A-Z][a-z]?", crystal.formula)))
print(f'Sorted elements: {elems}')

# One line per symmetry operator, followed by the atoms sitting on its special positions.
print('\n'.join(
    '%-17s %s' % (op, utilities.print_set(crystal.atoms_on_special_positions(op)))
    for op in crystal.symmetry_operators
))

# cif_string = crystal.to_string(format='cif')
# with open('tmp.cif', 'w') as f:
#     f.write(cif_string)


SMILES: C1CCC2N=Cc3cc4C=NC5CCCCC5N=Cc5cc6C=NC7CCCCC7N=Cc7cc(C=NC2C1)cc(c7)c1ccc(cc1)c1cc2C=NC7CCCCC7N=Cc7cc(C=NC8CCCCC8N=Cc8cc(C=NC9CCCCC9N=Cc(c2)c1)cc(c8)c1ccc(cc1)c(c6)c5)cc(c7)c1ccc(cc1)c(c3)c4.O.O
Crystal System: rhombohedral
Spacegroup Symbol: R-3
Spacegroup Number: (148, 1)
Has disorder: False
Disorder details: None
Sorted elements: ['C', 'H', 'N', 'O']
x,y,z             {Atom(C1), Atom(C10), Atom(C10A), Atom(C10B), Atom(C11), Atom(C11A), Atom(C11B), Atom(C12), Atom(C12A), Atom(C12B), Atom(C13), Atom(C13A), Atom(C13B), Atom(C14), Atom(C14A), Atom(C14B), Atom(C15), Atom(C15A), Atom(C15B), Atom(C16), Atom(C16A), Atom(C16B), Atom(C17), Atom(C17A), Atom(C17B), Atom(C18), Atom(C18A), Atom(C18B), Atom(C19), Atom(C19A), Atom(C19B), Atom(C1A), Atom(C1B), Atom(C2), Atom(C20), Atom(C20A), Atom(C20B), Atom(C21), Atom(C21A), Atom(C21B), Atom(C22), Atom(C22A), Atom(C22B), Atom(C23), Atom(C23A), Atom(C23B), Atom(C24), Atom(C24A), Atom(C24B), Atom(C25), Atom(C25A), Atom(C25B), Atom(C26), Atom(C