# Dataset preparation

**Instruction**  
The aim of this notebook is to prepare bandgap dataset where smiles, COD ID, and bandgap are corresponed.  
Four files are required to excecute the following code.  
1. `allcod.csv` (File containing SMILES and COD ID)
1. `structure.xyz` (File containg atom coordinates used for OMDB)
1. `CODids.csv` (File containing COD ID corresponding to structure.xyz)
1. `bandgaps.csv` (File containg calculated bandgap corresponding to structure.xyz)  

File 1 (.smi format) can be obtained from Crystallographic Open Database (http://www.crystallography.net/cod/smi/).  
Files 2-4 can be obtained from Organic Materials DataBase (https://omdb.mathub.io/dataset).  
(Please change .smi format to .csv format in your local enviroment)  

Output of this code is json files for making crystal graphs, and a csv file where SMILES, COD ID, and bandgap are corresponed.  
Due to the conversion error using RDKit module, the amount of data in the output csv is 10472.

In [94]:
import pandas as pd
import numpy as np
from rdkit import Chem
from ase.io import read
import warnings
warnings.simplefilter('ignore')

In [49]:
df_allcod = pd.read_csv('D:datasets/OMDB-GAP1_v1.1.tar/OMDB-GAP1_v1.1/allcod.csv', header=None)

In [59]:
%%time
all_smiles_list = []
all_codid_list = []
for i in range(df_allcod.shape[0]):
    data = df_allcod.iloc[i,:].iloc[-1].split('\t')
    all_smiles_list.append(data[0])
    all_codid_list.append(int(data[1]))

CPU times: total: 14.6 s
Wall time: 14.6 s


In [60]:
print(len(smiles_list))
print(len(codid_list))

210706
210706


In [38]:
path="D:datasets/OMDB-GAP1_v1.1.tar/OMDB-GAP1_v1.1/"
materials = read(path+'structures.xyz', index=':')
cods = np.loadtxt(path+"CODids.csv", dtype=int)
bandgaps = np.loadtxt(path+"bandgaps.csv")

In [66]:
print(len(materials))
print(cods.shape)
print(bandgaps.shape)

12500
(12500,)
(12500,)


In [97]:
smiles_list = []
for i in range(cods.shape[0]):
    try:
        smiles_list.append(all_smiles_list[all_codid_list.index(cods[i])])
    except:
        smiles_list.append('NaN')

In [98]:
print(len(smiles_list))

12500


In [99]:
df_curated = pd.DataFrame({
    'SMILES': smiles_list,
    'COD ID': cods,
    'Bandgap': bandgaps
})

In [100]:
# SMILES to Mol conversion
rdkit_smiles_list = []
for smiles in df_curated['SMILES']:
    try:
        mol = Chem.MolFromSmiles(smiles)
        smiles = Chem.MolToSmiles(mol)
    except:
        smiles = 'NaN'
    rdkit_smiles_list.append(smiles)

[16:04:25] Explicit valence for atom # 23 O, 3, is greater than permitted
[16:04:25] SMILES Parse Error: syntax error while parsing: NaN
[16:04:25] SMILES Parse Error: Failed parsing SMILES 'NaN' for input: 'NaN'
[16:04:25] SMILES Parse Error: syntax error while parsing: NaN
[16:04:25] SMILES Parse Error: Failed parsing SMILES 'NaN' for input: 'NaN'
[16:04:25] SMILES Parse Error: syntax error while parsing: NaN
[16:04:25] SMILES Parse Error: Failed parsing SMILES 'NaN' for input: 'NaN'
[16:04:25] SMILES Parse Error: syntax error while parsing: NaN
[16:04:25] SMILES Parse Error: Failed parsing SMILES 'NaN' for input: 'NaN'
[16:04:25] Explicit valence for atom # 22 O, 3, is greater than permitted
[16:04:25] SMILES Parse Error: syntax error while parsing: NaN
[16:04:25] SMILES Parse Error: Failed parsing SMILES 'NaN' for input: 'NaN'
[16:04:25] SMILES Parse Error: syntax error while parsing: NaN
[16:04:25] SMILES Parse Error: Failed parsing SMILES 'NaN' for input: 'NaN'
[16:04:25] SMILES 

In [101]:
df_curated['SMILES'] = rdkit_smiles_list

In [102]:
df_curated.head()

Unnamed: 0,SMILES,COD ID,Bandgap
0,,4074437,2.78762
1,COC(=O)[C@@H](O)[C@@H](Cl)[C@@H](Cl)c1ccc(Cl)c...,7226496,1.118755
2,O[C@@H]1Cc2cc3c(cc2[C@H]2NCc4c(ccc5c4OCO5)[C@H...,1507720,3.335133
3,ClCCNc1ccnc2cc(Cl)ccc12,7215594,2.854547
4,CCc1ccc2c(CC(=O)O)c[nH]c2c1,2101963,3.145922


In [108]:
df_curated = df_curated[~df_curated['SMILES'].str.contains('NaN')]

In [109]:
df_curated.head()

Unnamed: 0,SMILES,COD ID,Bandgap
1,COC(=O)[C@@H](O)[C@@H](Cl)[C@@H](Cl)c1ccc(Cl)c...,7226496,1.118755
2,O[C@@H]1Cc2cc3c(cc2[C@H]2NCc4c(ccc5c4OCO5)[C@H...,1507720,3.335133
3,ClCCNc1ccnc2cc(Cl)ccc12,7215594,2.854547
4,CCc1ccc2c(CC(=O)O)c[nH]c2c1,2101963,3.145922
5,CN1C(=O)C(O)c2ccccc21,7154039,3.467795


In [115]:
df_curated.shape

(10472, 3)

In [117]:
materials_rev = [materials[i] for i in df_curated.index]
print(len(materials_rev))

10472


In [118]:
df_curated = df_curated.reset_index(drop=True)
df_curated.head()

Unnamed: 0,SMILES,COD ID,Bandgap
0,COC(=O)[C@@H](O)[C@@H](Cl)[C@@H](Cl)c1ccc(Cl)c...,7226496,1.118755
1,O[C@@H]1Cc2cc3c(cc2[C@H]2NCc4c(ccc5c4OCO5)[C@H...,1507720,3.335133
2,ClCCNc1ccnc2cc(Cl)ccc12,7215594,2.854547
3,CCc1ccc2c(CC(=O)O)c[nH]c2c1,2101963,3.145922
4,CN1C(=O)C(O)c2ccccc21,7154039,3.467795


In [126]:
# xyz -> json
import os
from ase import io
path = 'D:datasets/Bandgap_rev/'
os.makedirs(path, exist_ok=True)
os.makedirs(path+'json/', exist_ok=True)

for i in range(len(materials_rev)):
    io.write(path+'json/{0}.json'.format(df_curated['COD ID'][i]), materials_rev[i])

In [127]:
df_curated.to_csv(path+'smiles-cod-bandgap.csv', index=None)