# Get chiral and racemic pairs from CSD

**Instruction**  
This notebook aims to get molecules whose chiral and racemic crystals have been reported in the Cambridge Structural Database (CSD).  
Running the following code requires a CSD licence (https://www.ccdc.cam.ac.uk/solutions/csd-licence/).  
  
**Search procedure and conditions**  
1. collect racemic and chiral crystals of the same molecule (chiral is identified by the space group) 
1. include only organic
1. exclude NaN in density, smiles, temperature
1. save racemic and chiral data

In [1]:
## Import packages
import pandas as pd
import numpy as np
import pathlib
from ccdc import io
from rdkit import Chem

In [2]:
# Open the CSD twice: once as crystal entries (metadata such as space group
# and density) and once as molecules (used for the SMILES strings).
CSD_DATABASE = 'CSD'
entry_reader = io.EntryReader(CSD_DATABASE)
csd_mol_reader = io.MoleculeReader(CSD_DATABASE)

In [3]:
%%time
space_group_list = [entry_reader[i].crystal.spacegroup_symbol for i in range(len(entry_reader))]
print(len(space_group_list))
unique_space_group = list(set(space_group_list))
print(len(unique_space_group))

1153251
599
Wall time: 44min 34s


In [4]:
# Exclude data if '*' is included in space group
# Keep only the symbols without a '*' and report how many were discarded.
unique_space_group_rev = [symbol for symbol in unique_space_group if '*' not in symbol]
count = len(unique_space_group) - len(unique_space_group_rev)
print(count)

94


In [5]:
# Set chiral space groups
# Space-group symbols (written without spaces, in the form compared against
# `crystal.spacegroup_symbol`) that this notebook classifies as "chiral".
# An entry whose symbol is in this list goes into the chiral set; an entry with
# any other symbol except 'Unknown' is a candidate for the racemic set.
# NOTE(review): besides the standard symbols, the list holds what look like
# alternative axis settings ('I2', 'P1121', 'I1', 'P22121'), and 'P412121' does
# not look like a valid symbol at all (it probably never matches) — confirm
# against the symbols actually present in the CSD before editing the list.
chiral_space_group = ['P1', 'P2', 'P21', 'C2', 'I2', 'P222', 'P2221', 'P21212', 'P212121', 'C2221', 'P1121',
                     'C222', 'F222', 'I222', 'I212121', 'P4', 'P41', 'P42', 'P43', 'I4', 'I41',
                     'P422', 'P4212', 'P4122', 'P41212', 'P412121', 'P4222', 'P42212', 'P4322', 'P43212',
                     'I422', 'I4122', 'P3', 'P31', 'P32', 'R3', 'P312', 'P321', 'P3112', 'P3121',
                     'P3212', 'P3221', 'R32', 'P6', 'P61', 'P65', 'P63', 'P62', 'P64', 'P622',
                     'P6122', 'P6522', 'P6222', 'P6422', 'P6322', 'P23', 'F23', 'I23', 'P213',
                     'I213', 'P432', 'P4232', 'F432', 'F4132', 'I432', 'P4332', 'P4132', 'I4132',
                     'I1', 'P22121']
# Displayed as the cell output: number of symbols in the list.
len(chiral_space_group)

70

In [6]:
%%time
#######################################
## Make a dataset of chiral crystals ##
#######################################

smiles_chiral_list = []
refcode_chiral_list = []
density_chiral_list = []
temperature_chiral_list = []
pressure_chiral_list = []
spacegroup_chiral_list = []

for i in range(len(entry_reader)):
    data = entry_reader[i]
    if i%100000==0:
        print('Processing:', i)
    try:
        if (data.is_organic==True and
            data.calculated_density>0 and
            data.temperature!=None and
            data.has_3d_structure==True and
            data.r_factor<10 and
            data.crystal.spacegroup_symbol in chiral_space_group):

            # Data aquisition
            mol_data = csd_mol_reader.molecule(data.identifier)
            mol = Chem.MolFromSmiles(mol_data.smiles)
            refcode = data.identifier
            smiles_chiral_list.append(Chem.MolToSmiles(mol))
            refcode_chiral_list.append(refcode)
            density_chiral_list.append(data.calculated_density)
            temperature_chiral_list.append(data.temperature)
            pressure_chiral_list.append(data.pressure)
            spacegroup_chiral_list.append(data.crystal.spacegroup_symbol)
    except:
        continue

print(len(refcode_chiral_list))

Processing: 0
Processing: 100000
Processing: 200000
Processing: 300000
Processing: 400000
Processing: 500000
Processing: 600000
Processing: 700000
Processing: 800000
Processing: 900000
Processing: 1000000
Processing: 1100000
83447
Wall time: 34min


In [7]:
# Assemble the chiral harvest into one table, one row per CSD refcode.
chiral_columns = {
    'SMILES': smiles_chiral_list,
    'refcode': refcode_chiral_list,
    'spacegroup': spacegroup_chiral_list,
    'density': density_chiral_list,
    'temperature': temperature_chiral_list,
    'pressure': pressure_chiral_list,
}
df = pd.DataFrame(chiral_columns)
df.head()

Unnamed: 0,SMILES,refcode,spacegroup,density,temperature,pressure
0,CC1NC(=O)CNC(=O)C(C)NC(=O)C(C)NC(=O)CNC(=O)CNC...,AAGAGG10,P212121,1.348,at -135 deg.C,
1,CC1NC(=O)CNC(=O)CNC(=O)C(C)NC(=O)C(C)NC(=O)CNC...,AAGGAG10,P21,1.396,at -135 deg.C,
2,COc1c(N2CC3CCC[NH2+]C3C2)c(F)cc2c(=O)c(C(=O)O)...,ABABIQ,P1,1.408,at 296 K,
3,c1ccc2c(c1)COc1nc3c(cc1-2)CCC3.c1ccc2c(c1)COc1...,ABACEM,P212121,1.307,at 295 K,
4,CC(=O)OCC1OC(=O)C(O)C1O,ABACEN,P21,1.507,at 293 K,


In [8]:
%%time
#######################################
## get racemic information           ##
#######################################
smiles_rac_list = []
refcode_rac_list = []
density_rac_list = []
temperature_rac_list = []
pressure_rac_list = []
spacegroup_rac_list = []

for i in range(len(entry_reader)):
    data = entry_reader[i]
    if i%100000==0:
        print('Processing:', i)
    try:
        # Data aquisition
        mol_data = csd_mol_reader.molecule(data.identifier)
        mol = Chem.MolFromSmiles(mol_data.smiles)
        smiles = Chem.MolToSmiles(mol)
        refcode = data.identifier
        spacegroup = data.crystal.spacegroup_symbol

        if (smiles in df['SMILES'].tolist() and 
            refcode not in df['refcode'].tolist() and 
            spacegroup not in chiral_space_group and
            spacegroup != 'Unknown' and
            data.calculated_density>0):
            smiles_rac_list.append(Chem.MolToSmiles(mol))        
            refcode_rac_list.append(refcode)
            density_rac_list.append(data.calculated_density)
            temperature_rac_list.append(data.temperature)
            pressure_rac_list.append(data.pressure)
            spacegroup_rac_list.append(data.crystal.spacegroup_symbol)
    except:
        continue

print(len(refcode_rac_list))

Processing: 0
Processing: 100000
Processing: 200000
Processing: 300000
Processing: 400000
Processing: 500000
Processing: 600000
Processing: 700000
Processing: 800000
Processing: 900000
Processing: 1000000
Processing: 1100000
2627
Wall time: 4h 5min


In [9]:
# Assemble the racemic harvest into one table, one row per CSD refcode.
racemic_columns = {
    'SMILES': smiles_rac_list,
    'refcode': refcode_rac_list,
    'spacegroup': spacegroup_rac_list,
    'density': density_rac_list,
    'temperature': temperature_rac_list,
    'pressure': pressure_rac_list,
}
df_rac = pd.DataFrame(racemic_columns)
df_rac.head()

Unnamed: 0,SMILES,refcode,spacegroup,density,temperature,pressure
0,O=C(O)c1ccc(Cn2cnc3ccccc32)cc1,ABADIS,P21/c,1.358,at 293 K,
1,CC1=CCCC2(C)OC2CC(C)(C)C2OC2C1,ABEQAZ,P21/n,1.109,,
2,C[Si]1(C)c2ccc3ccccc3c2-c2c(ccc3ccccc23)[Si]1(C)C,ABIBAO,Fdd2,1.179,at 296 K,
3,C1=NC2CCCCC2N=Cc2cc3cc(c2)-c2ccc(cc2)-c2cc4cc(...,ABIMEG,C2/c,1.136,100 K,
4,C1=NC2CCCCC2N=Cc2cc3cc(c2)-c2ccc(cc2)-c2cc4cc(...,ABINAD,R-3,1.047,350 K,


In [10]:
# Cleaning temperature column
# Convert the free-text temperature strings of the chiral table into numbers
# in kelvin. Three spellings are understood:
#   '<value> deg.C'   -> value + 273
#   '<low>-<high> K'  -> midpoint of the range
#   '<value> K'       -> value
# Rows that cannot be parsed are reported and collected in `error_list`
# (row labels); their entries in `data` are left as the stripped text.
data = df['temperature']
# NOTE: str.strip('at') removes any leading/trailing 'a' or 't' characters (it
# is a character set, not a prefix); this is how the leading "at " is dropped.
data = data.str.strip('at')
error_list = []
for i in range(len(data)):
    raw = data[i]
    try:
        if 'deg.C' in raw:
            # Integer offset 273 (not 273.15) is kept so the values stay
            # identical to the previously exported dataset.
            kelvin = float(raw.strip('deg.C').lstrip()) + 273
        elif 'K' in raw and '-' in raw:
            # Was `'K' and '-' in ...`, which only tested for '-'.
            ti, tf = raw.strip('K').lstrip().split('-')
            kelvin = (float(ti) + float(tf)) / 2
        elif 'K' in raw:
            kelvin = float(raw.strip('K').lstrip())
        else:
            raise ValueError('unrecognised temperature format')
    except (TypeError, ValueError):
        # TypeError: missing value (None/NaN); ValueError: unparsable text.
        print('Error at', i)
        error_list.append(i)
    else:
        data[i] = kelvin

Error at 5667
Error at 10300
Error at 18199
Error at 42061
Error at 50169


In [11]:
# Preview the first converted temperatures.
data.head(5)

0    138.0
1    138.0
2    296.0
3    295.0
4    293.0
Name: temperature, dtype: object

In [12]:
# Reflect to dataframe
# Write the cleaned temperatures back, then discard the rows whose temperature
# could not be parsed and renumber the remaining rows from 0.
df['temperature'] = data
unparsed_rows = df.index[error_list]
df = df.drop(unparsed_rows).reset_index(drop=True)

In [14]:
# Cleaning temperature column
# Convert the free-text temperature strings of the racemic table into numbers
# in kelvin. Three spellings are understood:
#   '<value> deg.C'   -> value + 273
#   '<low>-<high> K'  -> midpoint of the range
#   '<value> K'       -> value
# Rows that cannot be parsed (including missing temperatures) are collected in
# `error_list` (row labels); their entries in `data` are left untouched.
data = df_rac['temperature']
# NOTE: str.strip('at') removes any leading/trailing 'a' or 't' characters (it
# is a character set, not a prefix); this is how the leading "at " is dropped.
data = data.str.strip('at')
error_list = []
for i in range(len(data)):
    raw = data[i]
    try:
        if 'deg.C' in raw:
            # Integer offset 273 (not 273.15) is kept so the values stay
            # identical to the previously exported dataset.
            kelvin = float(raw.strip('deg.C').lstrip()) + 273
        elif 'K' in raw and '-' in raw:
            # Was `'K' and '-' in ...`, which only tested for '-'.
            ti, tf = raw.strip('K').lstrip().split('-')
            kelvin = (float(ti) + float(tf)) / 2
        elif 'K' in raw:
            kelvin = float(raw.strip('K').lstrip())
        else:
            raise ValueError('unrecognised temperature format')
    except (TypeError, ValueError):
        # TypeError: missing value (None/NaN); ValueError: unparsable text.
        error_list.append(i)
    else:
        data[i] = kelvin
print(error_list)

[1, 5, 6, 20, 21, 33, 34, 43, 49, 50, 52, 53, 55, 59, 62, 85, 89, 102, 112, 114, 115, 123, 126, 140, 141, 143, 144, 160, 161, 162, 169, 178, 185, 186, 188, 221, 226, 227, 229, 230, 238, 239, 242, 260, 261, 266, 269, 270, 284, 286, 287, 300, 305, 308, 313, 319, 322, 323, 324, 329, 365, 367, 368, 369, 370, 373, 381, 391, 394, 395, 396, 399, 401, 402, 403, 404, 412, 426, 427, 428, 447, 448, 449, 451, 455, 456, 457, 458, 460, 463, 464, 465, 472, 473, 474, 480, 481, 484, 485, 486, 512, 513, 514, 515, 516, 517, 520, 521, 525, 539, 540, 546, 547, 550, 551, 557, 567, 583, 585, 587, 591, 592, 593, 594, 596, 600, 629, 654, 658, 662, 665, 666, 677, 678, 679, 701, 705, 707, 708, 739, 740, 741, 742, 743, 746, 804, 805, 825, 838, 859, 880, 906, 909, 911, 918, 922, 924, 979, 1037, 1043, 1044, 1047, 1049, 1054, 1056, 1057, 1062, 1069, 1078, 1089, 1117, 1118, 1128, 1141, 1146, 1154, 1178, 1187, 1188, 1189, 1190, 1195, 1230, 1232, 1238, 1239, 1241, 1265, 1267, 1268, 1269, 1291, 1292, 1293, 1307, 1309, 1

In [15]:
# Preview the first converted temperatures.
data.head(5)

0    293.0
1     None
2    296.0
3    100.0
4    350.0
Name: temperature, dtype: object

In [16]:
# Reflect to dataframe
# Write the cleaned temperatures back, then discard the rows whose temperature
# could not be parsed and renumber the remaining rows from 0.
df_rac['temperature'] = data
unparsed_rows = df_rac.index[error_list]
df_rac = df_rac.drop(unparsed_rows).reset_index(drop=True)

In [17]:
# Set label: tag every row with its crystal class so the two tables can be
# told apart once they are combined.
df['label'] = 'chiral'
df_rac['label'] = 'racemic'

In [18]:
%%time
df_docking = pd.DataFrame(columns=df_rac.columns)
for i in range(df_rac.shape[0]):
    df_docking = df_docking.append(df_rac.iloc[i,:], ignore_index=True)
    for j in range(df.shape[0]):
        if df['SMILES'][j] == df_rac['SMILES'][i]:
            df_docking = df_docking.append(df.iloc[j,:], ignore_index=True)

Wall time: 34min 43s


In [19]:
# Preview the interleaved racemic/chiral rows.
df_docking.head(5)

Unnamed: 0,SMILES,refcode,spacegroup,density,temperature,pressure,label
0,O=C(O)c1ccc(Cn2cnc3ccccc32)cc1,ABADIS,P21/c,1.358,293.0,,racemic
1,O=C(O)c1ccc(Cn2cnc3ccccc32)cc1,ABADIS01,P212121,1.32,293.0,,chiral
2,C[Si]1(C)c2ccc3ccccc3c2-c2c(ccc3ccccc23)[Si]1(C)C,ABIBAO,Fdd2,1.179,296.0,,racemic
3,C[Si]1(C)c2ccc3ccccc3c2-c2c(ccc3ccccc23)[Si]1(C)C,LUMJAE,I41,1.145,296.2,,chiral
4,C1=NC2CCCCC2N=Cc2cc3cc(c2)-c2ccc(cc2)-c2cc4cc(...,ABIMEG,C2/c,1.136,100.0,,racemic


In [20]:
# Remove rows that are identical in every column, keeping the first occurrence.
df_docking = df_docking.drop_duplicates(keep='first')
# df_docking.to_csv('./dataset/chiral-racemic-pairs.csv') # Please uncomment out when executing.

# [Notice] Cleaning the exported csv
Data in the exported csv was manually checked one by one according to the following conditions for easier analysis.  
1. If one SMILES corresponded to one chiral crystal code and one racemic crystal code, the data were used as is.
1. If a SMILES corresponded to one chiral crystal code and several racemic crystal codes, the racemic crystal data were partially deleted. In this case, if the same space group had different measurement temperatures, only the data close to the measurement temperature of the chiral crystal was kept. If there were racemic crystals in different space groups, only one data set in each space group was kept.
1. If a SMILES corresponded to codes for several chiral crystals and one racemic crystal, the data for the chiral crystals were partially removed in the same way as described above.
1. The above data shaping left only one code per space group when multiple data were reported. Polymorphs of chiral or racemic crystals with different space groups are kept in the revised dataset.
1. On the other hand, if the polymorphs are in the same space group, they are not all included in the formatted dataset, because all but one of them were deleted during the data formatting process.

The revised dataset was renamed as `chiral-racemic-pairs_cleaned.csv`.