# Get CIF files from CSD

**Instruction**  
The aim of this notebook is to obtain crystal structures from the Cambridge Structural Database (CSD).  
Obtaining CIF files with the following code requires a CSD license, which is needed to use the CSD Python API.

In [1]:
# Import packages
import pandas as pd
import numpy as np
import pathlib, os
from ccdc import io
from rdkit import Chem

In [2]:
%%time
entry_reader = io.EntryReader('CSD')
csd_mol_reader = io.MoleculeReader('CSD')

#######################################
## Get crystal structure's information ##
#######################################

# Parallel lists: position k in every list describes the same accepted entry.
smiles_list = []
refcode_list = []
density_list = []
temperature_list = []
spacegroup_list = []
# Indices of entries that passed the filters but could not be converted.
failed_indices = []

for i in range(len(entry_reader)):
    entry = entry_reader[i]
    if i % 100000 == 0:
        print('Processing:', i)
    try:
        # Guard clauses in the same order as the original combined condition, so
        # later attributes are only read when the earlier checks pass.
        if entry.is_organic is not True:
            continue
        density = entry.calculated_density
        # Explicit None checks: comparing None with a number raises TypeError,
        # which used to be hidden by a blanket except.
        if density is None or not density > 0:
            continue
        temperature = entry.temperature
        if temperature is None:
            continue
        if entry.has_3d_structure is not True:
            continue
        r_factor = entry.r_factor
        if r_factor is None or not r_factor < 10:
            continue
        if entry.pressure is not None or entry.disorder_details is not None:
            continue

        # Compute every value BEFORE touching the output lists, so a failure
        # half-way through cannot leave the lists with different lengths.
        refcode = entry.identifier
        mol_data = csd_mol_reader.molecule(refcode)
        mol = Chem.MolFromSmiles(mol_data.smiles)
        if mol is None:
            # RDKit could not parse the SMILES string; skip this entry.
            failed_indices.append(i)
            continue
        smiles = Chem.MolToSmiles(mol)
        spacegroup = entry.crystal.spacegroup_symbol
    except Exception:
        # Best-effort extraction: skip entries the APIs cannot handle, but do not
        # swallow KeyboardInterrupt/SystemExit as a bare except would.
        failed_indices.append(i)
        continue

    smiles_list.append(smiles)
    refcode_list.append(refcode)
    density_list.append(density)
    temperature_list.append(temperature)
    spacegroup_list.append(spacegroup)

print('Entries skipped because of conversion errors:', len(failed_indices))

Processing: 0
Processing: 100000
Processing: 200000
Processing: 300000
Processing: 400000
Processing: 500000
Processing: 600000
Processing: 700000
Processing: 800000
Processing: 900000
Processing: 1000000
Processing: 1100000
Wall time: 33min 36s


In [3]:
# Assemble the per-entry lists into one table, one row per accepted entry.
column_names = ['SMILES', 'refcode', 'spacegroup', 'density', 'temperature']
column_values = [smiles_list, refcode_list, spacegroup_list,
                 density_list, temperature_list]
df = pd.DataFrame(dict(zip(column_names, column_values)))
df.head()

Unnamed: 0,SMILES,refcode,spacegroup,density,temperature
0,CC1NC(=O)CNC(=O)C(C)NC(=O)C(C)NC(=O)CNC(=O)CNC...,AAGAGG10,P212121,1.348,at -135 deg.C
1,CC1NC(=O)CNC(=O)CNC(=O)C(C)NC(=O)C(C)NC(=O)CNC...,AAGGAG10,P21,1.396,at -135 deg.C
2,COc1ccc(C=NO)cc1.COc1ccc(C=NO)cc1,AANHOX01,Pna21,1.318,at 105 K
3,COc1ccc(C2OC(=O)C(C)C2(C)CC(C)C)cc1,ABABAH,Pbca,1.163,at 193 K
4,Cc1ccc(-c2nc3ccc4ccccc4c3cc2CCO)cc1,ABABAI,P-1,1.277,at 296 K


In [4]:
# Cleaning temperature column
# Offset used for deg.C -> K. Kept at this notebook's original value of 273
# (not 273.15) so the later 273 < T < 313 filter selects the same entries.
CELSIUS_TO_KELVIN = 273


def _temperature_to_kelvin(text):
    """Convert one temperature string from the 'temperature' column to kelvin.

    Accepted forms (an optional leading 'at' is ignored):
      * '<number> deg.C'   -> number + CELSIUS_TO_KELVIN
      * '<low>-<high> K'   -> midpoint of the range
      * '<number> K'       -> number

    Raises ValueError for non-string input or any other format.
    """
    if not isinstance(text, str):
        raise ValueError('not a string: %r' % (text,))
    value = text.strip()
    # Remove the literal prefix/suffix; str.strip('at') / strip('deg.C') would
    # strip *sets of characters*, not these exact substrings.
    if value.startswith('at'):
        value = value[len('at'):].strip()
    if value.endswith('deg.C'):
        return float(value[:-len('deg.C')]) + CELSIUS_TO_KELVIN
    if value.endswith('K'):
        kelvin = value[:-len('K')].strip()
        # A range requires BOTH the K unit and a dash; the former test
        # `'K' and '-' in s` only checked for the dash.
        if '-' in kelvin:
            low, high = kelvin.split('-')
            return (float(low) + float(high)) / 2
        return float(kelvin)
    raise ValueError('unrecognised temperature: %r' % (text,))


parsed_temperatures = []
error_list = []  # positional row numbers whose temperature could not be parsed
for i, raw in enumerate(df['temperature']):
    try:
        parsed_temperatures.append(_temperature_to_kelvin(raw))
    except ValueError:
        print('Error at', i)
        error_list.append(i)
        # Placeholder keeps the Series aligned with df; these rows are dropped later.
        parsed_temperatures.append(float('nan'))

# Float Series aligned with df, ready to be written back to the 'temperature' column.
data = pd.Series(parsed_temperatures, index=df.index, dtype=float)

Error at 28611
Error at 34366
Error at 40417
Error at 61881
Error at 61882
Error at 61887
Error at 134741
Error at 141880
Error at 162016
Error at 176330
Error at 199866
Error at 263549


In [5]:
# Write the cleaned temperatures back, then discard the rows that failed to parse.
df['temperature'] = data
unparsed_rows = df.index[error_list]  # error_list holds positional row numbers
df = df.drop(index=unparsed_rows).reset_index(drop=True)

In [6]:
# Keep only rows whose temperature lies strictly between 273 and 313.
temperature = df['temperature']
in_window = (temperature > 273) & (temperature < 313)
df = df[in_window].reset_index(drop=True)
df.head()

Unnamed: 0,SMILES,refcode,spacegroup,density,temperature
0,Cc1ccc(-c2nc3ccc4ccccc4c3cc2CCO)cc1,ABABAI,P-1,1.277,296.0
1,O=c1[nH]c2ccccc2c(Cl)c1-c1ccccc1,ABABEL,P21/c,1.422,295.0
2,O=c1sc(=O)n2n1CCCC2.O=c1sc(=O)n2n1CCCC2,ABABEM,P-1,1.507,293.0
3,O=c1[nH]c2ccccc2c2c1CCc1ccccc1-2,ABABIP,P21/c,1.335,293.0
4,COc1c(N2CC3CCC[NH2+]C3C2)c(F)cc2c(=O)c(C(=O)O)...,ABABIQ,P1,1.408,296.0


In [7]:
# Size of the filtered table as (rows, columns).
df.shape

(111939, 5)

In [8]:
## Download cif file to a local folder
def ref2cif(reflist, path):
    """Write one CIF file per refcode into a local folder.

    Parameters
    ----------
    reflist : iterable of str
        Refcodes to look up through the module-level ``entry_reader``.
    path : str
        Output directory; created if it does not exist. A trailing path
        separator is optional.

    Returns
    -------
    list
        Refcodes that were skipped because the entry's ``has_3d_structure``
        is not True.
    """
    os.makedirs(path, exist_ok=True)
    exclude_list = []
    for ref in reflist:
        entry = entry_reader.entry(ref)
        if entry.has_3d_structure is not True:
            exclude_list.append(ref)
            continue
        cif_text = entry.crystal.to_string(format='cif')
        # os.path.join keeps files inside `path` even without a trailing separator;
        # the with-block guarantees the file is flushed and closed.
        cif_path = os.path.join(path, ref + '.cif')
        with open(cif_path, 'w', encoding='utf_8') as cif:
            cif.write(cif_text)
    return exclude_list

In [9]:
%%time
# Export a CIF for every remaining refcode; the result lists the refcodes ref2cif skipped.
cif_dir = 'D:datasets/NNP_modulus_rev/'
exclude_list = ref2cif(df['refcode'], cif_dir)
exclude_list

Wall time: 7min 36s


[]