# Dataset Preparation for PU learning
**Instruction**  
The aim of this notebook is to build the positive and unlabeled datasets.  
Preparation of the positive dataset requires `PositiveDataset_public_YYMMDD.xlsx`, which we collected manually.  
Preparation of the unlabeled dataset requires a Cambridge Structural Database (CSD) license (https://www.ccdc.cam.ac.uk/solutions/csd-licence/).

## Preparation of positive dataset

In [3]:
# Import libraries
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import pandas as pd
import sys

In [4]:
# Load the 'PT' sheet of the manually collected positive dataset,
# using the first spreadsheet column as the row index.
df = pd.read_excel(
    io='../datasets/PositiveDataset_public_221014.xlsx',
    sheet_name='PT',
    index_col=0,
)
df.head()

Unnamed: 0_level_0,SMILES,CCDC,Phase,CCDC.1,Phase.1,T_endo (K),dH_endo (kJ/mol),T_exo (K),dH_exo (kJ/mol),T_melt (K),dH_melt (kJ/mol),Acquisition,Acquisition memo,Ref.,Unnamed: 15
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1,CC(NC(=O)c1ccc(cc1)N=Nc1ccc(cc1)N(C)C)c1ccccc1,"1556537, 1556538, 1556539, 1556540",beta,1556541,gamma,418.2,1.31,,,,,0,,https://doi.org/10.1038/s41467-017-02549-2,
2,CC(NC(=O)c1ccc(cc1)N=Nc1ccc(cc1)N(C)C)c1ccccc1,1556541,gamma,"1556537, 1556538, 1556539, 1556540",beta,,,416.0,-1.29,,,0,,https://doi.org/10.1038/s41467-017-02549-2,
3,CC(NC(=O)c1ccc(cc1)N=Nc1ccc(cc1)N(C)C)c1ccccc1,1556541,gamma,,melt,,,,,493.0,36.2,0,,https://doi.org/10.1038/s41467-017-02549-2,
4,COC(=O)c1ccc2cc1C1=CN(CCOCCOCCOCCOc3ccc(cc3)C#...,969125,alpha,969126,beta,333.0,2.4,,,,,0,,https://doi.org/10.1002/anie.201402560,
5,COC(=O)c1ccc2cc1C1=CN(CCOCCOCCOCCOc3ccc(cc3)C#...,969126,beta,969125,alpha,,,329.0,-3.1,,,0,,https://doi.org/10.1002/anie.201402560,


In [5]:
# SMILES -> MOL -> SMILES_rdkit
# Canonicalise every SMILES of the positive dataset with RDKit and keep only
# molecules that parse and are built exclusively from the permitted elements.
permitted_atoms = ['H', 'B', 'C', 'N', 'O', 'F', 'Si', 'P', 'S', 'Cl', 'Br', 'I']
smiles_list = []

for smiles in df['SMILES']:
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles signals a parse failure by returning None, not by raising.
        if mol is None:
            continue
        # Explicit check instead of `assert`: assertions are removed under
        # `python -O`, which would silently disable the element filter.
        if any(atom.GetSymbol() not in permitted_atoms for atom in mol.GetAtoms()):
            continue
        smiles_list.append(Chem.MolToSmiles(mol))
    except Exception:
        # Best-effort: skip rows RDKit cannot handle (e.g. a non-string cell
        # value), but let KeyboardInterrupt / SystemExit propagate.
        continue
print(len(smiles_list))

299


In [6]:
# Delete duplicates
# sorted() fixes the order: iterating a set of strings depends on the
# per-process hash seed, so list(set(...)) would reorder the saved CSV
# rows from one kernel session to the next.
smiles_list = sorted(set(smiles_list))
print(len(smiles_list))

88


In [7]:
# Write the positive SMILES to CSV as a single 'SMILES' column
# (the default integer index is written as the first CSV column).
positive_smiles = pd.DataFrame(smiles_list, columns=['SMILES'])
positive_smiles.to_csv(path_or_buf='../datasets/positive_smiles.csv')

## Preparation of unlabeled dataset

The following code requires a Cambridge Structural Database (CSD) license.  
Please see the documentation of the `CSD Python API` for details (https://downloads.ccdc.cam.ac.uk/documentation/API/).

In [8]:
# CSD Python API imports (a CSD license is required) plus warning control.
import warnings

import ccdc
from ccdc import io

# Suppress all Python warnings from here on.
warnings.simplefilter('ignore')
# Show the installed CSD Python API version.
print(ccdc.__version__)

3.0.14


In [9]:
%%time

# Make unlabeled dataset
#
# Walk over every CSD entry and keep those that are organic, have a 3D
# structure, an R-factor <= 5, no disorder, are not polymeric and are not
# powder studies.  The CSD SMILES of each kept entry is canonicalised with
# RDKit and restricted to the permitted elements.  Entries without a
# phase-transition record go to `unlabeled_list`, the others to `exclude_list`.
entry_reader = io.EntryReader('CSD')
csd_mol_reader = io.MoleculeReader('CSD')

unlabeled_list = []
exclude_list = []
# Loop-invariant set so the per-atom element check is a constant-time lookup.
permitted_symbols = frozenset(permitted_atoms)

for i in range(len(entry_reader)):
    data = entry_reader[i]
    if i % 100000 == 0:
        print('Processing:', i)
    try:
        if not (data.is_organic is True and data.has_3d_structure is True):
            continue

        # NOTE(review): r_factor is presumably None when no value is recorded;
        # `None <= 5` raises TypeError, so the missing case is tested explicitly.
        r_factor = data.r_factor
        if r_factor is None or not (r_factor <= 5):
            continue

        if not (data.has_disorder is False and
                data.is_polymeric is False and
                data.is_powder_study is False):
            continue

        mol_data = csd_mol_reader.molecule(data.identifier)
        csd_smiles = mol_data.smiles
        if csd_smiles is None:
            continue

        mol = Chem.MolFromSmiles(csd_smiles)
        # MolFromSmiles returns None (and logs to stderr) when RDKit rejects
        # the structure, e.g. the "Explicit valence ..." messages.
        if mol is None:
            continue
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if any(atom.GetSymbol() not in permitted_symbols for atom in mol.GetAtoms()):
            continue

        smiles = Chem.MolToSmiles(mol)
        if data.phase_transition is None:
            unlabeled_list.append(smiles)
        else:
            exclude_list.append(smiles)
    except Exception:
        # Best-effort scan: skip entries the CSD API or RDKit cannot process.
        # `Exception` (not a bare `except`) keeps the long-running loop
        # interruptible, because KeyboardInterrupt is no longer swallowed.
        continue

Processing: 0


[10:24:53] Explicit valence for atom # 5 Ca, 3, is greater than permitted
[10:24:53] Explicit valence for atom # 8 C, 6, is greater than permitted
[10:24:54] Explicit valence for atom # 9 B, 4, is greater than permitted
[10:24:54] Explicit valence for atom # 2 O, 3, is greater than permitted
[10:24:54] Explicit valence for atom # 2 O, 3, is greater than permitted
[10:24:56] Explicit valence for atom # 0 O, 3, is greater than permitted
[10:24:56] Explicit valence for atom # 0 B, 4, is greater than permitted
[10:24:57] Explicit valence for atom # 6 B, 4, is greater than permitted
[10:24:57] Explicit valence for atom # 7 B, 4, is greater than permitted
[10:24:57] Explicit valence for atom # 1 Te, 8, is greater than permitted
[10:24:57] Explicit valence for atom # 12 N, 4, is greater than permitted
[10:24:57] Explicit valence for atom # 11 C, 6, is greater than permitted
[10:24:57] Explicit valence for atom # 11 C, 6, is greater than permitted
[10:24:57] Explicit valence for atom # 7 C, 6,

Processing: 100000


[10:28:03] Explicit valence for atom # 2 O, 3, is greater than permitted
[10:28:03] Explicit valence for atom # 1 O, 3, is greater than permitted
[10:28:03] Explicit valence for atom # 17 N, 4, is greater than permitted
[10:28:03] Explicit valence for atom # 8 H, 2, is greater than permitted
[10:28:04] Explicit valence for atom # 1 Li, 5, is greater than permitted
[10:28:04] Explicit valence for atom # 5 N, 4, is greater than permitted
[10:28:05] Explicit valence for atom # 15 C, 6, is greater than permitted
[10:28:05] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:28:05] Explicit valence for atom # 8 O, 3, is greater than permitted
[10:28:05] Explicit valence for atom # 21 B, 6, is greater than permitted
[10:28:06] Explicit valence for atom # 25 B, 4, is greater than permitted
[10:28:06] Explicit valence for atom # 31 B, 4, is greater than permitted
[10:28:06] Explicit valence for atom # 0 O, 3, is greater than permitted
[10:28:06] Explicit valence for atom # 16 Li,

Processing: 200000


[10:31:11] Explicit valence for atom # 15 O, 3, is greater than permitted
[10:31:11] Explicit valence for atom # 1 O, 3, is greater than permitted
[10:31:11] Explicit valence for atom # 3 C, 6, is greater than permitted
[10:31:11] Explicit valence for atom # 1 B, 6, is greater than permitted
[10:31:11] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:31:11] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:31:11] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:31:11] Explicit valence for atom # 5 Na, 5, is greater than permitted
[10:31:12] Explicit valence for atom # 1 B, 6, is greater than permitted
[10:31:12] Explicit valence for atom # 9 B, 4, is greater than permitted
[10:31:12] Explicit valence for atom # 1 N, 5, is greater than permitted
[10:31:12] Explicit valence for atom # 9 O, 3, is greater than permitted
[10:31:13] Explicit valence for atom # 4 N, 4, is greater than permitted
[10:31:13] Explicit valence for atom # 1 N, 4, is

Processing: 300000


[10:34:20] Explicit valence for atom # 8 B, 6, is greater than permitted
[10:34:20] Explicit valence for atom # 8 C, 6, is greater than permitted
[10:34:20] Explicit valence for atom # 9 N, 4, is greater than permitted
[10:34:20] Explicit valence for atom # 17 N, 4, is greater than permitted
[10:34:21] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:34:22] Explicit valence for atom # 5 B, 4, is greater than permitted
[10:34:22] Explicit valence for atom # 7 Cl, 3, is greater than permitted
[10:34:23] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:34:23] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:34:23] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:34:23] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:34:23] Explicit valence for atom # 2 C, 5, is greater than permitted
[10:34:23] Explicit valence for atom # 14 B, 4, is greater than permitted
[10:34:23] Explicit valence for atom # 14 N, 4, 

Processing: 400000


[10:37:37] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:37:37] Explicit valence for atom # 1 Be, 6, is greater than permitted
[10:37:38] Explicit valence for atom # 10 Ca, 12, is greater than permitted
[10:37:38] Explicit valence for atom # 0 B, 4, is greater than permitted
[10:37:38] Explicit valence for atom # 1 N, 4, is greater than permitted
[10:37:38] Explicit valence for atom # 1 Mg, 5, is greater than permitted
[10:37:38] Explicit valence for atom # 1 Mg, 5, is greater than permitted
[10:37:38] Explicit valence for atom # 5 N, 4, is greater than permitted
[10:37:38] Explicit valence for atom # 6 B, 4, is greater than permitted
[10:37:38] Explicit valence for atom # 6 B, 4, is greater than permitted
[10:37:39] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:37:39] Explicit valence for atom # 2 O, 3, is greater than permitted
[10:37:39] Explicit valence for atom # 0 O, 3, is greater than permitted
[10:37:39] Can't kekulize mol.  Unkekulized a

Processing: 500000


[10:40:53] Explicit valence for atom # 2 Si, 8, is greater than permitted
[10:40:54] Explicit valence for atom # 0 O, 3, is greater than permitted
[10:40:54] Explicit valence for atom # 18 N, 4, is greater than permitted
[10:40:54] Explicit valence for atom # 8 C, 6, is greater than permitted
[10:40:54] Explicit valence for atom # 2 O, 3, is greater than permitted
[10:40:54] Explicit valence for atom # 2 O, 3, is greater than permitted
[10:40:54] Explicit valence for atom # 1 O, 3, is greater than permitted
[10:40:54] Can't kekulize mol.  Unkekulized atoms: 6 7 8 9 10 11 12 13 14
[10:40:54] Explicit valence for atom # 12 B, 5, is greater than permitted
[10:40:54] Explicit valence for atom # 6 N, 4, is greater than permitted
[10:40:55] Explicit valence for atom # 1 B, 6, is greater than permitted
[10:40:55] Explicit valence for atom # 14 B, 4, is greater than permitted
[10:40:55] Explicit valence for atom # 14 B, 4, is greater than permitted
[10:40:55] Explicit valence for atom # 1 Ba, 

Processing: 600000


[10:44:06] Explicit valence for atom # 3 O, 3, is greater than permitted
[10:44:07] Explicit valence for atom # 29 N, 4, is greater than permitted
[10:44:08] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[10:44:09] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8
[10:44:09] Explicit valence for atom # 6 N, 4, is greater than permitted
[10:44:09] Explicit valence for atom # 8 B, 4, is greater than permitted
[10:44:09] Explicit valence for atom # 8 B, 4, is greater than permitted
[10:44:09] Explicit valence for atom # 8 B, 4, is greater than permitted
[10:44:09] Explicit valence for atom # 4 Cl, 2, is greater than permitted
[10:44:09] Explicit valence for atom # 3 Cl, 2, is greater than permitted
[10:44:09] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[10:44:09] Explicit valence for atom # 2 Br, 2, is greater than permitted
[10:44:10] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 8 9 10 12 13 14 40
[10:44:10] Explicit valence for atom # 1 B, 4, is greater than pe

Processing: 700000


[10:47:22] Explicit valence for atom # 9 Mg, 5, is greater than permitted
[10:47:22] Explicit valence for atom # 7 Mg, 5, is greater than permitted
[10:47:22] Explicit valence for atom # 13 Ba, 12, is greater than permitted
[10:47:22] Explicit valence for atom # 1 Ca, 6, is greater than permitted
[10:47:23] Explicit valence for atom # 1 Ca, 6, is greater than permitted
[10:47:23] Explicit valence for atom # 7 Li, 4, is greater than permitted
[10:47:23] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:47:23] Explicit valence for atom # 3 O, 3, is greater than permitted
[10:47:23] Explicit valence for atom # 1 B, 4, is greater than permitted
[10:47:23] Explicit valence for atom # 1 B, 4, is greater than permitted
[10:47:23] Explicit valence for atom # 1 B, 4, is greater than permitted
[10:47:23] Explicit valence for atom # 1 B, 4, is greater than permitted
[10:47:23] Explicit valence for atom # 13 B, 7, is greater than permitted
[10:47:23] Explicit valence for atom # 2 O

Processing: 800000


[10:50:40] Explicit valence for atom # 9 N, 4, is greater than permitted
[10:50:41] Explicit valence for atom # 0 O, 3, is greater than permitted
[10:50:41] Explicit valence for atom # 12 N, 4, is greater than permitted
[10:50:42] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6
[10:50:42] Can't kekulize mol.  Unkekulized atoms: 4 5 6 7 8
[10:50:42] Explicit valence for atom # 2 O, 3, is greater than permitted
[10:50:42] Explicit valence for atom # 1 N, 4, is greater than permitted
[10:50:42] Explicit valence for atom # 20 B, 4, is greater than permitted
[10:50:43] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:50:43] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:50:43] Explicit valence for atom # 13 B, 4, is greater than permitted
[10:50:43] Explicit valence for atom # 7 N, 4, is greater than permitted
[10:50:43] Explicit valence for atom # 1 O, 3, is greater than permitted
[10:50:43] Explicit valence for atom # 2 N, 4, is greater than permitted

Processing: 900000


[10:54:00] Explicit valence for atom # 2 O, 3, is greater than permitted
[10:54:01] Explicit valence for atom # 4 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 4 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 4 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 4 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 14 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 15 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 4 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 8 N, 4, is greater than permitted
[10:54:01] Explicit valence for atom # 2 N, 4, is

Processing: 1000000


[10:57:16] Explicit valence for atom # 0 O, 3, is greater than permitted
[10:57:16] Explicit valence for atom # 1 N, 4, is greater than permitted
[10:57:16] Explicit valence for atom # 20 B, 4, is greater than permitted
[10:57:17] Explicit valence for atom # 7 Si, 8, is greater than permitted
[10:57:17] Explicit valence for atom # 1 O, 3, is greater than permitted
[10:57:17] Explicit valence for atom # 1 O, 3, is greater than permitted
[10:57:17] Explicit valence for atom # 103 N, 4, is greater than permitted
[10:57:18] Explicit valence for atom # 0 O, 3, is greater than permitted
[10:57:18] Explicit valence for atom # 9 B, 6, is greater than permitted
[10:57:18] Explicit valence for atom # 0 H, 2, is greater than permitted
[10:57:18] Explicit valence for atom # 9 B, 5, is greater than permitted
[10:57:18] Explicit valence for atom # 16 B, 5, is greater than permitted
[10:57:19] Explicit valence for atom # 4 O, 4, is greater than permitted
[10:57:19] Explicit valence for atom # 0 H, 2,

Processing: 1100000


[11:00:30] Explicit valence for atom # 2 Cl, 2, is greater than permitted
[11:00:31] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 8 9 13
[11:00:31] Can't kekulize mol.  Unkekulized atoms: 2 3 4 5 6 7 8 9 13
[11:00:31] Explicit valence for atom # 2 B, 6, is greater than permitted
[11:00:31] Explicit valence for atom # 2 B, 6, is greater than permitted
[11:00:31] Explicit valence for atom # 2 B, 6, is greater than permitted
[11:00:31] Explicit valence for atom # 0 B, 6, is greater than permitted
[11:00:32] Explicit valence for atom # 20 N, 4, is greater than permitted
[11:00:32] Explicit valence for atom # 22 Li, 2, is greater than permitted
[11:00:32] Explicit valence for atom # 3 N, 4, is greater than permitted
[11:00:32] Explicit valence for atom # 6 O, 3, is greater than permitted
[11:00:32] Explicit valence for atom # 3 N, 4, is greater than permitted
[11:00:32] Explicit valence for atom # 1 O, 3, is greater than permitted
[11:00:32] Explicit valence for atom # 8 Cl, 3, is gr

Wall time: 37min 38s


In [10]:
# Raw sizes before de-duplication: unlabeled candidates first, excluded entries second.
print(len(unlabeled_list), len(exclude_list), sep='\n')

199987
603


In [11]:
# Delete duplicates within each list
# sorted() makes the order deterministic: iterating a set of strings depends
# on the per-process hash seed, so list(set(...)) would change the row order
# of the saved unlabeled CSV between kernel sessions.
unlabeled_list = sorted(set(unlabeled_list))
exclude_list = sorted(set(exclude_list))
print(len(unlabeled_list))
print(len(exclude_list))

185215
352


In [12]:
# Remove from the unlabeled list every SMILES that also occurs in the exclude list
# (entries of the same molecule that do carry a phase-transition record).
# A single filtering pass replaces repeated list.remove() calls, each of which
# scans the whole list and drops only the first occurrence.
exclude_set = set(exclude_list)
unlabeled_list = [smiles for smiles in unlabeled_list if smiles not in exclude_set]
print(len(unlabeled_list))
print(len(exclude_list))

185094
352


In [13]:
# Remove from the unlabeled list every SMILES that is already in the positive dataset
# A single filtering pass replaces repeated list.remove() calls, each of which
# scans the whole list and drops only the first occurrence.
positive_set = set(smiles_list)
unlabeled_list = [smiles for smiles in unlabeled_list if smiles not in positive_set]
print(len(unlabeled_list))
print(len(smiles_list))

185043
88


In [14]:
# Manual deletion from the unlabeled list to prevent overlap with the positive dataset
# NOTE(review): these look like multi-component SMILES of compounds that are in
# the positive set under a different SMILES string - confirm with the curators.
del_list = [
    'O=C1CCC(C(=O)O)N1.O=C1CCC(C(=O)O)N1.O=C1CCC(C(=O)O)N1',
    '[N-]=[N+]=NC1OC2COC(c3ccccc3)OC2C(O)C1O.[N-]=[N+]=NC1OC2COC(c3ccccc3)OC2C(O)C1O.[N-]=[N+]=NC1OC2COC(c3ccccc3)OC2C(O)C1O.[N-]=[N+]=NC1OC2COC(c3ccccc3)OC2C(O)C1O.[N-]=[N+]=NC1OC2COC(c3ccccc3)OC2C(O)C1O.[N-]=[N+]=NC1OC2COC(c3ccccc3)OC2C(O)C1O',
    'Cc1c(C)c2c(c3c1CC[n+]1ccc4ccccc4c1-3)-c1c3ccccc3cc[n+]1CC2.Cc1c(C)c2c(c3c1CC[n+]1ccc4ccccc4c1-3)-c1c3ccccc3cc[n+]1CC2.O=S(=O)([O-])C(F)(F)F.O=S(=O)([O-])C(F)(F)F.O=S(=O)([O-])C(F)(F)F.O=S(=O)([O-])C(F)(F)F',
    'CC1=CC(=O)NS(=O)(=O)O1.CC1=CC(=O)NS(=O)(=O)O1',
    'CSCCC([NH3+])C(=O)[O-].CSCCC([NH3+])C(=O)[O-]',
    'NC(=O)c1ccc(N=Cc2cc(Cl)cc(Cl)c2O)cc1.NC(=O)c1ccc(N=Cc2cc(Cl)cc(Cl)c2O)cc1',
]
# A single filtering pass replaces repeated list.remove() calls, each of which
# scans the whole list and drops only the first occurrence.
del_set = set(del_list)
unlabeled_list = [smiles for smiles in unlabeled_list if smiles not in del_set]
print(len(unlabeled_list))
print(len(smiles_list))

185037
88


In [15]:
# Write the filtered unlabeled SMILES to CSV as a single 'SMILES' column
# (the default integer index is written as the first CSV column).
unlabeled_smiles = pd.DataFrame(unlabeled_list, columns=['SMILES'])
unlabeled_smiles.to_csv(path_or_buf='../datasets/unlabeled_smiles.csv')

In [16]:
# Record which CSD database version the unlabeled dataset was built from.
io.csd_version()

'542'