## Data Preparation (2)
- derive variables from complete data

In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter(action='ignore')

In [2]:
from glob import glob
from tqdm import tqdm

In [3]:
# Folder that holds the competition CSV files.
path = 'data/'

# Submission template, training table and test table.
sample = pd.read_csv(f'{path}sample_submission.csv')
train = pd.read_csv(f'{path}train_set.ReorgE.csv')
test = pd.read_csv(f'{path}test_set.csv')

In [4]:
# Preview the training table (long frames are shown truncated).
train

Unnamed: 0,index,SMILES,Reorg_g,Reorg_ex
0,train_0,CC[C@H]1CCCCN1C(=O)[C@@H](C)OC(=O)c1c(C)oc(-n2...,0.631486,0.535060
1,train_1,O[C@@H](CNC1CC1)CN1CCc2sccc2C1,0.825901,1.116781
2,train_2,N#CCCNC(=O)[C@@]1(O)CCSC1,1.463943,0.964848
3,train_3,COC[C@H]1CN(c2ccc(OCC[C@@H](C)O)cc2)C(=O)O1,0.166669,0.161458
4,train_4,N#Cc1c(-c2ccccc2OCC(N)=O)[nH]c(C(N)=O)c1N,0.313820,0.338862
...,...,...,...,...
18152,train_18152,CC(=O)Nc1ccc2ccc3cccc4ccc1c2c34,0.146917,0.143084
18153,train_18153,CC(C)(C)c1ccccc1N(c1ccccc1)c1ccc(S(=O)(=O)c2cc...,0.612898,0.500668
18154,train_18154,CN(C)c1ccc(C(=O)Nc2ccccc2)cc1,1.218777,1.048954
18155,train_18155,c1ccc(N(c2ccccc2)c2ccc(-c3ncc(-c4ccc(-c5cnc(-c...,0.145292,0.182589


## Deriving Variables from Complete Data
- create derived variables
    - Euclidean distance from the origin
    - atomic information from the periodic table
    - bond information
- aggregation
    - np.mean

### Preparing Materials
- symbols list
- dictionaries for mapping
- bond energy table

#### symbols list

In [5]:
# Element symbols used to select rows of the periodic table below.
# Presumably every element that occurs in the molecule files -- TODO confirm.
symbols = ['N', 'C', 'Si', 'I', 'B', 'O', 'P', 'Br', 'S', 'F', 'H', 'Cl']

#### dictionaries for mapping

In [6]:
# Download the PubChem periodic table and index it by element symbol
# (the 'Symbol' column is kept as a regular column as well).
table = pd.read_csv('https://pubchem.ncbi.nlm.nih.gov/rest/pug/periodictable/CSV/?response_type=display')
table = table.set_index('Symbol', drop=False)

# One row per entry of `symbols`, in that order, renumbered from 0.
tmp = table.loc[symbols].reset_index(drop=True)

In [7]:
# List the periodic-table columns available for building the lookup dictionaries.
tmp.columns

Index(['AtomicNumber', 'Symbol', 'Name', 'AtomicMass', 'CPKHexColor',
       'ElectronConfiguration', 'Electronegativity', 'AtomicRadius',
       'IonizationEnergy', 'ElectronAffinity', 'OxidationStates',
       'StandardState', 'MeltingPoint', 'BoilingPoint', 'Density',
       'GroupBlock', 'YearDiscovered'],
      dtype='object')

In [8]:
# One {element symbol -> property value} dictionary per periodic-table column
# that is later mapped onto the atoms of each molecule.
# The dictionaries are built from the columns of `tmp` directly, so they cover
# every selected element without relying on a hard-coded row count.
at_number = dict(zip(tmp['Symbol'], tmp['AtomicNumber']))
at_weight = dict(zip(tmp['Symbol'], tmp['AtomicMass']))
electro_neg = dict(zip(tmp['Symbol'], tmp['Electronegativity']))
ion_energy = dict(zip(tmp['Symbol'], tmp['IonizationEnergy']))
electron_aff = dict(zip(tmp['Symbol'], tmp['ElectronAffinity']))
melting_p = dict(zip(tmp['Symbol'], tmp['MeltingPoint']))
boiling_p = dict(zip(tmp['Symbol'], tmp['BoilingPoint']))
density = dict(zip(tmp['Symbol'], tmp['Density']))

#### bond energy table

In [9]:
# Scrape the HTML tables of the bond energy/length page; the result is indexed
# by position in the next cell.
# NOTE: this rebinds `table`, which previously held the periodic-table frame.
table = pd.read_html('https://www.wiredchemist.com/chemistry/data/bond_energies_lengths.html')

In [10]:
# Pull the seven scraped tables out of the list by position.
t1, t2, t3, t4 = (table[k] for k in range(4))

# Only rows labelled 0..22 of the fifth table are kept, and the row labelled 4
# of the sixth table is removed -- presumably non-data rows; TODO confirm
# against the source page.
t5 = table[4].loc[0:22]
t6 = table[5].drop(index=4)

t7 = table[6]

In [11]:
# Stack the scraped tables row-wise, two at a time.
t12 = pd.concat([t1, t2], axis=0)
t34 = pd.concat([t3, t4], axis=0)
t56 = pd.concat([t5, t6], axis=0)

In [12]:
# Keep stacking row-wise until every scraped table sits in one frame.
t1234 = pd.concat([t12, t34], axis=0)
t567 = pd.concat([t56, t7], axis=0)
t_all = pd.concat([t1234, t567], axis=0)

In [13]:
# Bond-energy table: the first two columns of the stacked tables, without
# incomplete rows, renumbered from 0 so positions and labels coincide.
first_two = t_all.iloc[:, :2]
en_df = first_two.dropna().reset_index(drop=True)
en_df.columns = ['bond', 'energy']

In [14]:
# Placeholder columns that the next cell fills from the bond labels.
# 'type' is numeric (bond order, 1 by default).
en_df['type'] = 1

# The atom columns receive element symbols (strings) later, so they are created
# with object dtype; writing strings into an int64 column relies on silent
# upcasting, which recent pandas versions warn about and plan to reject.
# The placeholder value 1 itself is unchanged.
en_df['at_1'] = pd.Series(1, index=en_df.index, dtype=object)
en_df['at_2'] = pd.Series(1, index=en_df.index, dtype=object)

In [15]:
# Split every bond label into its two atom symbols and its bond order:
# '-' -> 1 (single), '=' -> 2 (double), '≡' -> 3 (triple).
# The separators are tried in that order and the first one found decides.
separators = (('-', 1), ('=', 2), ('≡', 3))

# Start from the current column contents so that labels without any separator
# keep their placeholder values.
bond_types = en_df['type'].tolist()
first_atoms = en_df['at_1'].tolist()
second_atoms = en_df['at_2'].tolist()

for idx, bond in enumerate(en_df['bond']):
    for sep, order in separators:
        if sep in bond:
            parts = bond.split(sep)
            bond_types[idx] = order
            first_atoms[idx] = parts[0]
            second_atoms[idx] = parts[1]
            break

# Assign whole columns at once instead of writing strings cell-by-cell into
# columns that were created as integers (incompatible-dtype item assignment).
en_df['type'] = bond_types
en_df['at_1'] = first_atoms
en_df['at_2'] = second_atoms

### Creating New Dataset
- train_set
    - train_g, train_ex
- test_set
    - test_g, test_ex

#### train_set

In [16]:
##### ground state #####

# Builds one row of molecule-level features per training molecule from its
# ground-state atom file (*_g_a.csv) and bond file (*_g_b.csv).

data_g = []
path = 'data/mol_files/train_set/'

# Columns averaged per molecule: atom-level first, then bond-level.
a_cols = ['dist', 'at_n', 'at_w', 'el_neg', 'ion_en', 'el_aff', 'mp', 'bp', 'den']
b_cols = ['len', 'bond_en']

# Output column -> periodic-table dictionary, keyed by the element symbol
# stored in column '3' of the atom file.
atom_maps = {
    'at_n': at_number,
    'at_w': at_weight,
    'el_neg': electro_neg,
    'ion_en': ion_energy,
    'el_aff': electron_aff,
    'mp': melting_p,
    'bp': boiling_p,
    'den': density,
}

# Bond-energy lookup keyed by (unordered atom pair, bond type). It does not
# depend on the molecule, so it is built once instead of scanning `en_df` for
# every bond. Later rows of `en_df` overwrite earlier ones with the same key,
# which matches a full scan that keeps the last hit.
bond_energy = {}
for en_at_1, en_at_2, en_type, en_val in zip(en_df['at_1'], en_df['at_2'], en_df['type'], en_df['energy']):
    bond_energy[(frozenset((en_at_1, en_at_2)), en_type)] = en_val

for name in tqdm(train['index']):

    g_a = pd.read_csv(path + name + '_g_a.csv')
    g_b = pd.read_csv(path + name + '_g_b.csv')

    # creating derived variables

    # (1) distance from the origin (columns '0', '1', '2' are the coordinates)
    g_a['dist'] = np.sqrt(g_a['0']**2 + g_a['1']**2 + g_a['2']**2)

    # (2) atomic information
    for col, mapping in atom_maps.items():
        g_a[col] = g_a['3'].map(mapping)

    # (3) bond information
    # The first two columns of the bond file are looked up as atom labels
    # counted from 1, so the atoms are re-labelled 1..n.
    g_a.index = range(1, len(g_a) + 1)

    # bond length: fetch both end atoms of every bond in one label lookup
    end_1 = g_a.loc[g_b.iloc[:, 0]]
    end_2 = g_a.loc[g_b.iloc[:, 1]]
    delta = end_1[['0', '1', '2']].to_numpy() - end_2[['0', '1', '2']].to_numpy()

    bond_df = pd.DataFrame()
    bond_df['at_1'] = end_1['3'].tolist()
    bond_df['at_2'] = end_2['3'].tolist()
    bond_df['type'] = g_b['2']
    bond_df['len'] = np.sqrt(delta[:, 0]**2 + delta[:, 1]**2 + delta[:, 2]**2)

    # bond energy
    # NOTE(review): bonds without an entry in the energy table keep the
    # placeholder value 1 and therefore enter the mean -- confirm intended.
    bond_df['bond_en'] = np.array(
        [
            bond_energy.get((frozenset((sym_1, sym_2)), b_type), 1)
            for sym_1, sym_2, b_type in zip(bond_df['at_1'], bond_df['at_2'], bond_df['type'])
        ],
        dtype=float,
    )

    # aggregation of created variables
    avg_dic = {col: np.mean(g_a[col]) for col in a_cols}
    avg_dic.update((col, np.mean(bond_df[col])) for col in b_cols)

    data_g.append(avg_dic)

100%|████████████████████████████████████████████████████████████████████████████| 18157/18157 [20:05<00:00, 15.06it/s]


In [17]:
df_g = pd.DataFrame(data_g)
df_g

Unnamed: 0,dist,at_n,at_w,el_neg,ion_en,el_aff,mp,bp,den,len,bond_en
0,7.421214,3.849057,7.234868,2.479811,12.726113,1.024340,1528.964906,1644.488302,0.898466,1.274554,416.872727
1,4.567176,3.675676,6.821243,2.412162,12.740162,1.001057,1366.060000,1476.818108,0.852533,1.289784,384.307692
2,7.972815,4.240000,8.010640,2.493600,12.796800,1.050043,1254.924000,1363.209200,0.808378,1.298106,409.160000
3,9.343431,3.761905,7.031786,2.492619,12.787667,1.026439,1380.237143,1486.291429,0.809888,1.263025,406.348837
4,6.102327,4.457143,8.551143,2.566286,12.798229,1.062233,1548.010286,1665.515429,0.907135,1.263479,435.333333
...,...,...,...,...,...,...,...,...,...,...,...
18152,6.682258,4.121212,7.857818,2.453939,12.351697,1.062406,2094.273939,2248.339394,1.236662,1.292820,435.972222
18153,6.650131,4.018868,7.595132,2.434906,12.452491,1.051346,1820.125849,1960.608113,1.108458,1.312258,424.123810
18154,6.644104,3.764706,7.067824,2.440294,12.622176,1.014688,1698.430000,1824.688235,1.000305,1.259218,425.971429
18155,13.431267,4.517647,8.670588,2.442353,12.262494,1.092062,2090.795294,2254.348824,1.299893,1.317754,426.408602


In [18]:
##### excited state #####

# Builds one row of molecule-level features per training molecule from its
# excited-state atom file (*_ex_a.csv) and bond file (*_ex_b.csv).

data_ex = []
path = 'data/mol_files/train_set/'

# Columns averaged per molecule: atom-level first, then bond-level.
a_cols = ['dist', 'at_n', 'at_w', 'el_neg', 'ion_en', 'el_aff', 'mp', 'bp', 'den']
b_cols = ['len', 'bond_en']

# Output column -> periodic-table dictionary, keyed by the element symbol
# stored in column '3' of the atom file.
atom_maps = {
    'at_n': at_number,
    'at_w': at_weight,
    'el_neg': electro_neg,
    'ion_en': ion_energy,
    'el_aff': electron_aff,
    'mp': melting_p,
    'bp': boiling_p,
    'den': density,
}

# Bond-energy lookup keyed by (unordered atom pair, bond type). It does not
# depend on the molecule, so it is built once instead of scanning `en_df` for
# every bond. Later rows of `en_df` overwrite earlier ones with the same key,
# which matches a full scan that keeps the last hit.
bond_energy = {}
for en_at_1, en_at_2, en_type, en_val in zip(en_df['at_1'], en_df['at_2'], en_df['type'], en_df['energy']):
    bond_energy[(frozenset((en_at_1, en_at_2)), en_type)] = en_val

for name in tqdm(train['index']):

    ex_a = pd.read_csv(path + name + '_ex_a.csv')
    ex_b = pd.read_csv(path + name + '_ex_b.csv')

    # creating derived variables

    # (1) distance from the origin (columns '0', '1', '2' are the coordinates)
    ex_a['dist'] = np.sqrt(ex_a['0']**2 + ex_a['1']**2 + ex_a['2']**2)

    # (2) atomic information
    for col, mapping in atom_maps.items():
        ex_a[col] = ex_a['3'].map(mapping)

    # (3) bond information
    # The first two columns of the bond file are looked up as atom labels
    # counted from 1, so the atoms are re-labelled 1..n.
    ex_a.index = range(1, len(ex_a) + 1)

    # bond length: fetch both end atoms of every bond in one label lookup
    end_1 = ex_a.loc[ex_b.iloc[:, 0]]
    end_2 = ex_a.loc[ex_b.iloc[:, 1]]
    delta = end_1[['0', '1', '2']].to_numpy() - end_2[['0', '1', '2']].to_numpy()

    bond_df = pd.DataFrame()
    bond_df['at_1'] = end_1['3'].tolist()
    bond_df['at_2'] = end_2['3'].tolist()
    bond_df['type'] = ex_b['2']
    bond_df['len'] = np.sqrt(delta[:, 0]**2 + delta[:, 1]**2 + delta[:, 2]**2)

    # bond energy
    # NOTE(review): bonds without an entry in the energy table keep the
    # placeholder value 1 and therefore enter the mean -- confirm intended.
    bond_df['bond_en'] = np.array(
        [
            bond_energy.get((frozenset((sym_1, sym_2)), b_type), 1)
            for sym_1, sym_2, b_type in zip(bond_df['at_1'], bond_df['at_2'], bond_df['type'])
        ],
        dtype=float,
    )

    # aggregation of created variables
    avg_dic = {col: np.mean(ex_a[col]) for col in a_cols}
    avg_dic.update((col, np.mean(bond_df[col])) for col in b_cols)

    data_ex.append(avg_dic)

100%|████████████████████████████████████████████████████████████████████████████| 18157/18157 [17:47<00:00, 17.01it/s]


In [19]:
df_ex = pd.DataFrame(data_ex)
df_ex

Unnamed: 0,dist,at_n,at_w,el_neg,ion_en,el_aff,mp,bp,den,len,bond_en
0,7.419384,3.849057,7.234868,2.479811,12.726113,1.024340,1528.964906,1644.488302,0.898466,1.277462,416.872727
1,4.480080,3.675676,6.821243,2.412162,12.740162,1.001057,1366.060000,1476.818108,0.852533,1.292906,384.307692
2,7.975893,4.240000,8.010640,2.493600,12.796800,1.050043,1254.924000,1363.209200,0.808378,1.303473,409.160000
3,9.344530,3.761905,7.031786,2.492619,12.787667,1.026439,1380.237143,1486.291429,0.809888,1.265426,406.348837
4,6.113824,4.457143,8.551143,2.566286,12.798229,1.062233,1548.010286,1665.515429,0.907135,1.265641,435.333333
...,...,...,...,...,...,...,...,...,...,...,...
18152,6.685352,4.121212,7.857818,2.453939,12.351697,1.062406,2094.273939,2248.339394,1.236662,1.295104,435.972222
18153,6.538838,4.018868,7.595132,2.434906,12.452491,1.051346,1820.125849,1960.608113,1.108458,1.312676,424.123810
18154,6.649195,3.764706,7.067824,2.440294,12.622176,1.014688,1698.430000,1824.688235,1.000305,1.263567,425.971429
18155,13.415828,4.517647,8.670588,2.442353,12.262494,1.092062,2090.795294,2254.348824,1.299893,1.318533,426.408602


In [21]:
# Persist the training feature tables (ground state, then excited state)
# without the row index.
for frame, file_name in ((df_g, 'train_g.csv'), (df_ex, 'train_ex.csv')):
    frame.to_csv(file_name, index=False)

#### test_set

In [22]:
##### ground state #####

# Builds one row of molecule-level features per test molecule from its
# ground-state atom file (*_g_a.csv) and bond file (*_g_b.csv).
# NOTE: `data_g` and `df_g` are rebound here and now refer to the test set.

data_g = []
path = 'data/mol_files/test_set/'

# Columns averaged per molecule: atom-level first, then bond-level.
a_cols = ['dist', 'at_n', 'at_w', 'el_neg', 'ion_en', 'el_aff', 'mp', 'bp', 'den']
b_cols = ['len', 'bond_en']

# Output column -> periodic-table dictionary, keyed by the element symbol
# stored in column '3' of the atom file.
atom_maps = {
    'at_n': at_number,
    'at_w': at_weight,
    'el_neg': electro_neg,
    'ion_en': ion_energy,
    'el_aff': electron_aff,
    'mp': melting_p,
    'bp': boiling_p,
    'den': density,
}

# Bond-energy lookup keyed by (unordered atom pair, bond type). It does not
# depend on the molecule, so it is built once instead of scanning `en_df` for
# every bond. Later rows of `en_df` overwrite earlier ones with the same key,
# which matches a full scan that keeps the last hit.
bond_energy = {}
for en_at_1, en_at_2, en_type, en_val in zip(en_df['at_1'], en_df['at_2'], en_df['type'], en_df['energy']):
    bond_energy[(frozenset((en_at_1, en_at_2)), en_type)] = en_val

for name in tqdm(test['index']):

    g_a = pd.read_csv(path + name + '_g_a.csv')
    g_b = pd.read_csv(path + name + '_g_b.csv')

    # creating derived variables

    # (1) distance from the origin (columns '0', '1', '2' are the coordinates)
    g_a['dist'] = np.sqrt(g_a['0']**2 + g_a['1']**2 + g_a['2']**2)

    # (2) atomic information
    for col, mapping in atom_maps.items():
        g_a[col] = g_a['3'].map(mapping)

    # (3) bond information
    # The first two columns of the bond file are looked up as atom labels
    # counted from 1, so the atoms are re-labelled 1..n.
    g_a.index = range(1, len(g_a) + 1)

    # bond length: fetch both end atoms of every bond in one label lookup
    end_1 = g_a.loc[g_b.iloc[:, 0]]
    end_2 = g_a.loc[g_b.iloc[:, 1]]
    delta = end_1[['0', '1', '2']].to_numpy() - end_2[['0', '1', '2']].to_numpy()

    bond_df = pd.DataFrame()
    bond_df['at_1'] = end_1['3'].tolist()
    bond_df['at_2'] = end_2['3'].tolist()
    bond_df['type'] = g_b['2']
    bond_df['len'] = np.sqrt(delta[:, 0]**2 + delta[:, 1]**2 + delta[:, 2]**2)

    # bond energy
    # NOTE(review): bonds without an entry in the energy table keep the
    # placeholder value 1 and therefore enter the mean -- confirm intended.
    bond_df['bond_en'] = np.array(
        [
            bond_energy.get((frozenset((sym_1, sym_2)), b_type), 1)
            for sym_1, sym_2, b_type in zip(bond_df['at_1'], bond_df['at_2'], bond_df['type'])
        ],
        dtype=float,
    )

    # aggregation of created variables
    avg_dic = {col: np.mean(g_a[col]) for col in a_cols}
    avg_dic.update((col, np.mean(bond_df[col])) for col in b_cols)

    data_g.append(avg_dic)

df_g = pd.DataFrame(data_g)

100%|████████████████████████████████████████████████████████████████████████████████| 457/457 [00:38<00:00, 11.98it/s]


In [23]:
##### excited state #####

# Builds one row of molecule-level features per test molecule from its
# excited-state atom file (*_ex_a.csv) and bond file (*_ex_b.csv).
# NOTE: `data_ex` and `df_ex` are rebound here and now refer to the test set.

data_ex = []
path = 'data/mol_files/test_set/'

# Columns averaged per molecule: atom-level first, then bond-level.
a_cols = ['dist', 'at_n', 'at_w', 'el_neg', 'ion_en', 'el_aff', 'mp', 'bp', 'den']
b_cols = ['len', 'bond_en']

# Output column -> periodic-table dictionary, keyed by the element symbol
# stored in column '3' of the atom file.
atom_maps = {
    'at_n': at_number,
    'at_w': at_weight,
    'el_neg': electro_neg,
    'ion_en': ion_energy,
    'el_aff': electron_aff,
    'mp': melting_p,
    'bp': boiling_p,
    'den': density,
}

# Bond-energy lookup keyed by (unordered atom pair, bond type). It does not
# depend on the molecule, so it is built once instead of scanning `en_df` for
# every bond. Later rows of `en_df` overwrite earlier ones with the same key,
# which matches a full scan that keeps the last hit.
bond_energy = {}
for en_at_1, en_at_2, en_type, en_val in zip(en_df['at_1'], en_df['at_2'], en_df['type'], en_df['energy']):
    bond_energy[(frozenset((en_at_1, en_at_2)), en_type)] = en_val

for name in tqdm(test['index']):

    ex_a = pd.read_csv(path + name + '_ex_a.csv')
    ex_b = pd.read_csv(path + name + '_ex_b.csv')

    # creating derived variables

    # (1) distance from the origin (columns '0', '1', '2' are the coordinates)
    ex_a['dist'] = np.sqrt(ex_a['0']**2 + ex_a['1']**2 + ex_a['2']**2)

    # (2) atomic information
    for col, mapping in atom_maps.items():
        ex_a[col] = ex_a['3'].map(mapping)

    # (3) bond information
    # The first two columns of the bond file are looked up as atom labels
    # counted from 1, so the atoms are re-labelled 1..n.
    ex_a.index = range(1, len(ex_a) + 1)

    # bond length: fetch both end atoms of every bond in one label lookup
    end_1 = ex_a.loc[ex_b.iloc[:, 0]]
    end_2 = ex_a.loc[ex_b.iloc[:, 1]]
    delta = end_1[['0', '1', '2']].to_numpy() - end_2[['0', '1', '2']].to_numpy()

    bond_df = pd.DataFrame()
    bond_df['at_1'] = end_1['3'].tolist()
    bond_df['at_2'] = end_2['3'].tolist()
    bond_df['type'] = ex_b['2']
    bond_df['len'] = np.sqrt(delta[:, 0]**2 + delta[:, 1]**2 + delta[:, 2]**2)

    # bond energy
    # NOTE(review): bonds without an entry in the energy table keep the
    # placeholder value 1 and therefore enter the mean -- confirm intended.
    bond_df['bond_en'] = np.array(
        [
            bond_energy.get((frozenset((sym_1, sym_2)), b_type), 1)
            for sym_1, sym_2, b_type in zip(bond_df['at_1'], bond_df['at_2'], bond_df['type'])
        ],
        dtype=float,
    )

    # aggregation of created variables
    avg_dic = {col: np.mean(ex_a[col]) for col in a_cols}
    avg_dic.update((col, np.mean(bond_df[col])) for col in b_cols)

    data_ex.append(avg_dic)

df_ex = pd.DataFrame(data_ex)

100%|████████████████████████████████████████████████████████████████████████████████| 457/457 [00:37<00:00, 12.04it/s]


In [24]:
# Persist the test feature tables (ground state, then excited state)
# without the row index.
for frame, file_name in ((df_g, 'test_g.csv'), (df_ex, 'test_ex.csv')):
    frame.to_csv(file_name, index=False)