# Search CSD entries and make a random sample

12th May - A notebook for running CSD searches and randomly sampling the search results.

In [3]:
from ccdc.search import Search, SMARTSSubstructure, SubstructureSearch

# Build the substructure query.
# NOTE(review): the search object is called `carbon_search` but the SMARTS
# pattern is "[X]" -- confirm this is the intended query.
carbon_search = SubstructureSearch()
substructure = SMARTSSubstructure("[X]")
carbon_search.add_substructure(substructure)

# Filters and hit limits, applied to the search settings in this order.
search_options = {
    "has_3d_coordinates": True,
    "no_disorder": True,
    "no_errors": True,
    "only_organic": True,
    "max_hit_structures": 1000,
    "max_hits_per_structure": 1,
}
for option_name, option_value in search_options.items():
    setattr(carbon_search.settings, option_name, option_value)

# Run the query and report how many hits came back.
hits = carbon_search.search()
print(len(hits))

1000


In [22]:
from wren_code import utils
from pymatgen.core import Composition, Structure
from pymatgen.io.cif import CifParser, CifFile

class CifStringParser(CifParser):
    """CifParser variant that takes the CIF contents as a string rather than a path."""

    def __init__(self, cif_string, occupancy_tolerance=1.0, site_tolerance=1e-4):
        """
        Args:
            cif_string (str): The full text of a CIF file, held in memory.
            occupancy_tolerance (float): If total occupancy of a site is between 1
                and occupancy_tolerance, the occupancies will be scaled down to 1.
            site_tolerance (float): This tolerance is used to determine if two
                sites are sitting in the same position, in which case they will be
                combined to a single disordered site. Defaults to 1e-4.

        Raises:
            TypeError: If cif_string is not a str.
        """
        # Local import so this cell does not depend on imports made elsewhere.
        from io import StringIO

        if not isinstance(cif_string, str):
            raise TypeError('cif_string needs to be a string!')

        # Hand the text to the parent initialiser as a stream instead of
        # re-implementing its body here (magCIF feature detection, warnings
        # list, per-block sanitising). A copied initialiser silently goes out
        # of sync when the library changes; delegating keeps whatever state
        # the installed CifParser expects.
        super().__init__(
            StringIO(cif_string),
            occupancy_tolerance=occupancy_tolerance,
            site_tolerance=site_tolerance,
        )
            

In [24]:
import pandas as pd
from tqdm import tqdm

# Featurise every search hit: CIF text -> pymatgen structure -> Wyckoff label.
# Hits whose processing raises TypeError are skipped (best effort), but they
# are recorded and reported instead of being dropped silently.
entry_identifiers = []
entry_compounds = []
entry_wyckoffs = []
failed_hits = []  # (position in `hits`, error message) for every skipped hit
for position, hit in enumerate(tqdm(hits)):
    try:
        cif_string = hit.crystal.to_string(format='cif')
        # occupancy_tolerance=10 is far above the parser default of 1.0.
        # NOTE(review): presumably chosen so the occupancy sums seen in the
        # parser warnings are rescaled rather than rejected -- confirm.
        struct = CifStringParser(cif_string, occupancy_tolerance=10).get_structures()[0]
        wyckoff_label = utils.get_aflow_label_spglib(struct)
        identifier = hit.entry.identifier
        compound_name = hit.entry.chemical_name
    except TypeError as err:
        failed_hits.append((position, str(err)))
        continue

    # Append only once every value is available, so the three lists can never
    # end up with different lengths.
    entry_identifiers.append(identifier)
    entry_compounds.append(compound_name)
    entry_wyckoffs.append(wyckoff_label)

if failed_hits:
    print(f'Skipped {len(failed_hits)} of {len(hits)} hits that raised TypeError')

df_wyckoff = pd.DataFrame({'identifier': entry_identifiers,
                           'compound_name': entry_compounds,
                           'wyckoff': entry_wyckoffs})

Some occupancies ([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) sum to > 1! If they are within the occupancy_tolerance, they will be rescaled. The current occupancy_tolerance is set to: 10
Some occupancies ([2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]) sum to > 1! If they are within the occupancy_tolerance, they will be rescaled. The current occupancy_tolerance is set to: 10
Some occupancies ([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 1, 1]) sum to > 1! If they are within the occupancy_tolerance, they will be rescaled. The current occupancy_tolerance is set to: 10
Some occupancies ([3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 

In [25]:
# Both percentages are relative to the total number of search hits.
n_hits = len(hits)
n_featurised = len(df_wyckoff)
n_unique_identifiers = len(df_wyckoff['identifier'].unique())
print(f'Successful featurisations: {n_featurised / n_hits * 100:.2f}%')
print(f'Unique featurisations: {n_unique_identifiers / n_hits * 100:.2f}%')

# Persist the table without the index column.
df_wyckoff.to_csv('sample.csv', index=False)

Successful featurisations: 96.18%
Unique featurisations: 11.48%


Count the number of Wyckoff positions (number of atoms) and the number of parameters using native Wren code.

In [1]:
from wren_code.utils import count_wyks, count_params, count_distinct_wyckoff_letters, return_spacegroup_number
import pandas as pd
from tqdm import tqdm

# Register progress_apply on pandas objects.
tqdm.pandas()

# Reload the saved sample and derive one column per helper from the Wyckoff
# label; the mapping order fixes the order of the new columns.
df_wyckoff = pd.read_csv('sample.csv')
derived_columns = {
    'n_atoms': count_wyks,
    'n_wyk': count_distinct_wyckoff_letters,
    'spg': return_spacegroup_number,
    'n_param': count_params,
}
for column_name, featurise in derived_columns.items():
    df_wyckoff[column_name] = df_wyckoff['wyckoff'].progress_apply(featurise)


100%|██████████| 7962/7962 [00:00<00:00, 55497.84it/s]
100%|██████████| 7962/7962 [00:00<00:00, 268233.32it/s]
100%|██████████| 7962/7962 [00:00<00:00, 561572.78it/s]
100%|██████████| 7962/7962 [00:00<00:00, 43460.90it/s]


In [2]:
# Show the sample table together with the derived n_atoms, n_wyk, spg and n_param columns.
df_wyckoff

Unnamed: 0,identifier,compound_name,wyckoff,n_atoms,n_wyk,spg,n_param
0,AABHTZ,"4-Acetoamido-3-(1-acetyl-2-(2,6-dichlorobenzyl...",A13B2C12D6E2_aP70_2_13i_2i_12i_6i_2i:C-Cl-H-N-O,35,1,2,111
1,AABHTZ,"4-Acetoamido-3-(1-acetyl-2-(2,6-dichlorobenzyl...",A13B2C12D6E2_aP70_2_13i_2i_12i_6i_2i:C-Cl-H-N-O,35,1,2,111
2,AACFAZ10,"N,N'-bis(3-Acetyl-4-(2-chlorophenyl)-4-hydroxy...",A13BC11DE3_oP232_60_13d_d_11d_d_3d:C-Cl-H-N-O,29,1,60,90
3,AACFAZ10,"N,N'-bis(3-Acetyl-4-(2-chlorophenyl)-4-hydroxy...",A13BC11DE3_oP232_60_13d_d_11d_d_3d:C-Cl-H-N-O,29,1,60,90
4,AACFAZ10,"N,N'-bis(3-Acetyl-4-(2-chlorophenyl)-4-hydroxy...",A13BC11DE3_oP232_60_13d_d_11d_d_3d:C-Cl-H-N-O,29,1,60,90
...,...,...,...,...,...,...,...
7957,ACATOQ,"N-(1-(3,5-dimethyl-2,4-dioxo-6-(2-phenylethyl)...",A22B28CDE5_aP114_1_44a_56a_2a_2a_10a:C-H-I-N-O,114,1,1,348
7958,ACATOQ,"N-(1-(3,5-dimethyl-2,4-dioxo-6-(2-phenylethyl)...",A22B28CDE5_aP114_1_44a_56a_2a_2a_10a:C-H-I-N-O,114,1,1,348
7959,ACATOQ,"N-(1-(3,5-dimethyl-2,4-dioxo-6-(2-phenylethyl)...",A22B28CDE5_aP114_1_44a_56a_2a_2a_10a:C-H-I-N-O,114,1,1,348
7960,ACATOQ,"N-(1-(3,5-dimethyl-2,4-dioxo-6-(2-phenylethyl)...",A22B28CDE5_aP114_1_44a_56a_2a_2a_10a:C-H-I-N-O,114,1,1,348


In [4]:
# How often each n_wyk value occurs in the sample.
df_wyckoff['n_wyk'].value_counts()

1    7834
2     100
3      28
Name: n_wyk, dtype: int64

In [3]:
# How often each space group number occurs in the sample.
df_wyckoff['spg'].value_counts()

14     1775
4      1640
19     1522
2      1502
1       425
61      219
5       113
18      112
29       92
33       90
7        63
9        62
76       38
60       38
43       31
148      24
78       19
152      19
182      18
56       17
82       16
144      16
12       15
86       15
62       14
88       13
146      13
92       12
23        8
80        7
169       7
54        4
13        2
161       1
Name: spg, dtype: int64