## Data Understanding
- load/observe mol files
- create csv files

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
# NOTE(review): no `category` is passed, so this hides every warning raised in
# the notebook (deprecations and pandas warnings included), not only noisy ones.
warnings.simplefilter(action='ignore')

In [2]:
from glob import glob
from tqdm import tqdm

In [3]:
# Folder that holds the CSV tables read below.
path = 'data/'

# Read the three tables in one pass; the tuple order matches the names on the left.
sample, train, test = (
    pd.read_csv(path + csv_name)
    for csv_name in (
        'sample_submission.csv',
        'train_set.ReorgE.csv',
        'test_set.csv',
    )
)

In [4]:
# Show the training table (columns: index, SMILES, Reorg_g, Reorg_ex).
train

Unnamed: 0,index,SMILES,Reorg_g,Reorg_ex
0,train_0,CC[C@H]1CCCCN1C(=O)[C@@H](C)OC(=O)c1c(C)oc(-n2...,0.631486,0.535060
1,train_1,O[C@@H](CNC1CC1)CN1CCc2sccc2C1,0.825901,1.116781
2,train_2,N#CCCNC(=O)[C@@]1(O)CCSC1,1.463943,0.964848
3,train_3,COC[C@H]1CN(c2ccc(OCC[C@@H](C)O)cc2)C(=O)O1,0.166669,0.161458
4,train_4,N#Cc1c(-c2ccccc2OCC(N)=O)[nH]c(C(N)=O)c1N,0.313820,0.338862
...,...,...,...,...
18152,train_18152,CC(=O)Nc1ccc2ccc3cccc4ccc1c2c34,0.146917,0.143084
18153,train_18153,CC(C)(C)c1ccccc1N(c1ccccc1)c1ccc(S(=O)(=O)c2cc...,0.612898,0.500668
18154,train_18154,CN(C)c1ccc(C(=O)Nc2ccccc2)cc1,1.218777,1.048954
18155,train_18155,c1ccc(N(c2ccccc2)c2ccc(-c3ncc(-c4ccc(-c5cnc(-c...,0.145292,0.182589


## Loading MOL Files
- train_set
- test_set

### train_set

In [5]:
# Directory with the train-set mol files.
path = 'data/mol_files/train_set/'

# Collect every *.mol path in that directory, then order the list in place.
# The order is lexicographic, so 'train_10000' comes before 'train_2'.
train_mol = glob(path + '*.mol')
train_mol.sort()

In [6]:
# Peek at the first ten train-set mol paths.
train_mol[:10]

['data/mol_files/train_set\\train_0_ex.mol',
 'data/mol_files/train_set\\train_0_g.mol',
 'data/mol_files/train_set\\train_10000_ex.mol',
 'data/mol_files/train_set\\train_10000_g.mol',
 'data/mol_files/train_set\\train_10001_ex.mol',
 'data/mol_files/train_set\\train_10001_g.mol',
 'data/mol_files/train_set\\train_10002_ex.mol',
 'data/mol_files/train_set\\train_10002_g.mol',
 'data/mol_files/train_set\\train_10003_ex.mol',
 'data/mol_files/train_set\\train_10003_g.mol']

### test_set

In [7]:
# Directory with the test-set mol files.
path = 'data/mol_files/test_set/'

# Collect every *.mol path in that directory, then order the list in place.
# The order is lexicographic, so 'test_100' comes before 'test_2'.
test_mol = glob(path + '*.mol')
test_mol.sort()

In [8]:
# Peek at the first ten test-set mol paths.
test_mol[:10]

['data/mol_files/test_set\\test_0_ex.mol',
 'data/mol_files/test_set\\test_0_g.mol',
 'data/mol_files/test_set\\test_100_ex.mol',
 'data/mol_files/test_set\\test_100_g.mol',
 'data/mol_files/test_set\\test_101_ex.mol',
 'data/mol_files/test_set\\test_101_g.mol',
 'data/mol_files/test_set\\test_102_ex.mol',
 'data/mol_files/test_set\\test_102_g.mol',
 'data/mol_files/test_set\\test_103_ex.mol',
 'data/mol_files/test_set\\test_103_g.mol']

## Observing MOL Files
- file structure
- length of an element

### File Structure

In [9]:
# Read the first train-set mol file and split it into lines. The context
# manager closes the file handle as soon as the text has been read.
with open(train_mol[0], 'r') as mol_handle:
    mol_file = mol_handle.read().split("\n")
mol_file

##### Anatomy of a mol file #####

# Counts Block (4th line of the file, e.g. ' 53 55  0  0 ...')
# 53: number of atoms
# 55: number of bonds

# Atoms Block
# 3.0263   -2.3512    0.5507: x, y, z coordinates
# C: atom symbol

# Bonds Block (e.g. '1  2  1  0')
# 1: first atom row number
# 2: second atom row number
# 1: bond type
# 0: bond stereochemistry

['',
 '     RDKit          3D',
 '',
 ' 53 55  0  0  0  0  0  0  0  0999 V2000',
 '    3.0263   -2.3512    0.5507 C   0  0  0  0  0  0  0  0  0  0  0  0',
 '    3.4398   -0.9997    1.1366 C   0  0  0  0  0  0  0  0  0  0  0  0',
 '    3.3570    0.1784    0.1392 C   0  0  1  0  0  0  0  0  0  0  0  0',
 '    1.9344    0.5133   -0.3583 C   0  0  0  0  0  0  0  0  0  0  0  0',
 '    1.0782    1.2336    0.6877 C   0  0  0  0  0  0  0  0  0  0  0  0',
 '    1.8074    2.4846    1.1839 C   0  0  0  0  0  0  0  0  0  0  0  0',
 '    3.2042    2.1347    1.6952 C   0  0  0  0  0  0  0  0  0  0  0  0',
 '    3.9824    1.3938    0.6950 N   0  0  0  0  0  0  0  0  0  0  0  0',
 '    5.3098    1.7286    0.5618 C   0  0  0  0  0  0  0  0  0  0  0  0',
 '    5.8180    2.6625    1.1588 O   0  0  0  0  0  0  0  0  0  0  0  0',
 '    6.1704    0.9343   -0.4511 C   0  0  1  0  0  0  0  0  0  0  0  0',
 '    6.1652    1.6104   -1.8172 C   0  0  0  0  0  0  0  0  0  0  0  0',
 '    7.5285    0.9327   -0.009

### Length of an Element
- atoms block
- bonds block

In [10]:
# atoms block
# Line 4 is the first atom record: drop the left padding, then cut on runs of spaces.
tmp = re.split(' +', mol_file[4].lstrip())

print(tmp)
len(tmp)

['3.0263', '-2.3512', '0.5507', 'C', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0']


16

In [11]:
# bonds block
# Fourth line from the end of the file: drop the left padding, then cut on runs of spaces.
tmp = re.split(' +', mol_file[-4].lstrip())

print(tmp)
len(tmp)

['27', '28', '3', '0']


4

In [12]:
# Number of space-separated fields on every line of the file.
length = [len(re.split(' +', line.lstrip())) for line in mol_file]

# Distinct field counts that occur in the file.
set(length)

{1, 2, 4, 11, 16}

## Creating CSV Files
- atoms block
 - x, y, z coordinates
 - atom symbol
- bonds block
 - bond type

### train_set

In [13]:
# convert mol files to the form of dataframe(csv files)
#
# For every V2000 mol file in train_mol two CSV files are written next to it:
#   <name>_a.csv : atoms block, one row per atom (x, y, z, atom symbol, remaining fields)
#   <name>_b.csv : bonds block, one row per bond (first atom, second atom, bond type, stereo)
#
# The blocks are located through the counts line instead of by counting the
# tokens of each line. Token counting drops bond rows whose 3-character atom
# numbers touch (e.g. '100101  1  0' is only 3 tokens) and files property rows
# such as 'M  CHG  1   5   1' under bonds or atoms.

path = 'data/mol_files/train_set/'
counts_row = 3  # three header lines precede the counts line

for i in tqdm(train_mol):

    # the context manager closes each of the many file handles right away
    with open(i, 'r') as mol_handle:
        mol_file = mol_handle.read().split("\n")

    # counts line: characters 0-2 hold the atom count, characters 3-5 the bond count
    try:
        n_atoms = int(mol_file[counts_row][0:3])
        n_bonds = int(mol_file[counts_row][3:6])
    except (IndexError, ValueError) as err:
        raise ValueError('cannot read the counts line of ' + i) from err

    first_atom = counts_row + 1
    first_bond = first_atom + n_atoms

    # atom rows: the fields are separated by blanks, so a whitespace split is enough
    atoms = [row.split() for row in mol_file[first_atom:first_bond]]

    # bond rows: fixed 3-character fields, cut by position so adjacent numbers stay apart
    bonds = []
    for row in mol_file[first_bond:first_bond + n_bonds]:
        row = row.rstrip()
        bonds.append([row[k:k + 3].strip() for k in range(0, len(row), 3)])

    # file name without directory and extension (ex: train_0_g);
    # glob yields '\\' separators on Windows and '/' elsewhere, so accept both
    file_name = re.split(r'[\\/]', i)[-1].split('.')[0]

    # create csv files
    pd.DataFrame(atoms).to_csv(path + file_name + '_a.csv', index=False)
    pd.DataFrame(bonds).to_csv(path + file_name + '_b.csv', index=False)

100%|███████████████████████████████████████████████████████████████████████████| 36314/36314 [01:11<00:00, 505.57it/s]


In [14]:
# Directory the converted CSV files were written to.
path = 'data/mol_files/train_set/'

# atoms block
# Load one converted atoms table back and show its first five rows.
df = pd.read_csv(f'{path}train_0_g_a.csv')
df.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,2.8972,-2.3256,0.4309,C,0,0,0,0,0,0,0,0,0,0,0,0
1,3.3467,-1.0151,1.0796,C,0,0,0,0,0,0,0,0,0,0,0,0
2,3.3365,0.2041,0.1293,C,0,0,1,0,0,0,0,0,0,0,0,0
3,1.9424,0.6155,-0.3895,C,0,0,0,0,0,0,0,0,0,0,0,0
4,1.087,1.3235,0.6658,C,0,0,0,0,0,0,0,0,0,0,0,0


In [15]:
# bonds block
# Load the matching bonds table back and show its first five rows.
df = pd.read_csv(f'{path}train_0_g_b.csv')
df.head(5)

Unnamed: 0,0,1,2,3
0,1,2,1,0
1,1,29,1,0
2,1,30,1,0
3,1,31,1,0
4,2,3,1,0


### test_set

In [16]:
# Convert every V2000 mol file in test_mol into two CSV files written next to it:
#   <name>_a.csv : atoms block, one row per atom (x, y, z, atom symbol, remaining fields)
#   <name>_b.csv : bonds block, one row per bond (first atom, second atom, bond type, stereo)
#
# The blocks are located through the counts line instead of by counting the
# tokens of each line. Token counting drops bond rows whose 3-character atom
# numbers touch (e.g. '100101  1  0' is only 3 tokens) and files property rows
# such as 'M  CHG  1   5   1' under bonds or atoms.

path = 'data/mol_files/test_set/'
counts_row = 3  # three header lines precede the counts line

for i in tqdm(test_mol):

    # the context manager closes each file handle right away
    with open(i, 'r') as mol_handle:
        mol_file = mol_handle.read().split("\n")

    # counts line: characters 0-2 hold the atom count, characters 3-5 the bond count
    try:
        n_atoms = int(mol_file[counts_row][0:3])
        n_bonds = int(mol_file[counts_row][3:6])
    except (IndexError, ValueError) as err:
        raise ValueError('cannot read the counts line of ' + i) from err

    first_atom = counts_row + 1
    first_bond = first_atom + n_atoms

    # atom rows: the fields are separated by blanks, so a whitespace split is enough
    atoms = [row.split() for row in mol_file[first_atom:first_bond]]

    # bond rows: fixed 3-character fields, cut by position so adjacent numbers stay apart
    bonds = []
    for row in mol_file[first_bond:first_bond + n_bonds]:
        row = row.rstrip()
        bonds.append([row[k:k + 3].strip() for k in range(0, len(row), 3)])

    # file name without directory and extension (ex: test_0_g);
    # glob yields '\\' separators on Windows and '/' elsewhere, so accept both
    file_name = re.split(r'[\\/]', i)[-1].split('.')[0]

    pd.DataFrame(atoms).to_csv(path + file_name + '_a.csv', index=False)
    pd.DataFrame(bonds).to_csv(path + file_name + '_b.csv', index=False)

100%|███████████████████████████████████████████████████████████████████████████████| 914/914 [00:01<00:00, 484.10it/s]
