In [1]:
import numpy as np # linear algebra
from scipy.stats.stats import pearsonr
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm import tqdm_notebook as tqdm
import seaborn as sns 
import matplotlib.pyplot as plt
import math
sns.set()
import os
import openbabel as ob
pd.set_option('display.max_columns', 500)

In [36]:
train = pd.read_csv('train.csv')
test = pd.read_csv(f'test.csv')
structures = pd.read_csv('structures.csv')
x = structures.groupby('molecule_name').atom_index.max().reset_index(drop=False)
x.columns = ['molecule_name','totalatoms']
x.totalatoms+=1
train = train.merge(x,on='molecule_name')
test = test.merge(x,on='molecule_name')
#train = train[train.molecule_name=='dsgdb9nsd_000001']

In [37]:
obConversion = ob.OBConversion()
obConversion.SetInFormat("xyz")
structdir='./Structures/'
mols=[]
mols_files=os.listdir(structdir)
mols_index=dict(map(reversed,enumerate(mols_files)))
for f in mols_index.keys():
    try:
        mol = ob.OBMol()
        obConversion.ReadFile(mol, structdir+f) 
        mols.append(mol)
    except:
        print("Wrong format")
        pass

In [38]:
# Get atom information for train
methods = [ "eem", "mmff94", "gasteiger", "qeq", "qtpie", 
               "eem2015ha", "eem2015hm", "eem2015hn", "eem2015ba", "eem2015bm", "eem2015bn" ]
def create_dipole_moments(df, mols):
    stats = []
    for m, groupdf in tqdm(df.groupby('molecule_name')):
        mol=mols[mols_index[m+'.xyz']]
        entrystats = {}
        entrystats['molecule_name'] = m
        for method in methods:
            # on the command-line, you can get partial charges like this
            # obabel CHF3.g03 -onul --partialcharge gasteiger --print
            cm = ob.OBChargeModel_FindType(method)
            cm.ComputeCharges(mol)
            # but in C++ or Python we can get the dipole moment vector
            dipole = cm.GetDipoleMoment(mol)
            entrystats[f'dipole_{method}_x'] = cm.GetDipoleMoment(mol).GetX()
            entrystats[f'dipole_{method}_y'] = cm.GetDipoleMoment(mol).GetY()
            entrystats[f'dipole_{method}_z'] = cm.GetDipoleMoment(mol).GetZ()         
            entrystats[f'moment_{method}'] = moment = math.sqrt(dipole.GetX()**2+dipole.GetY()**2+dipole.GetZ()**2)
            
        stats.append(entrystats)
        
    outfile = pd.DataFrame(stats)    
    return outfile
        #entrystats = {}
        #entrystats['molecule_name'] = m
        #entrystats['molecule_dipole_moment_x'] = openbabel.OBChargeModel.GetDipoleMoment(mol)[0]
        #entrystats['molecule_dipole_moment_y'] = openbabel.OBChargeModel.GetDipoleMoment(mol)[1]
        #entrystats['molecule_dipole_moment_z'] = openbabel.OBChargeModel.GetDipoleMoment(mol)[2]

In [39]:
dipole_train = create_dipole_moments(train, mols)

HBox(children=(IntProgress(value=0, max=85003), HTML(value='')))

In [42]:
dipole_train.shape

(85003, 45)

In [43]:
dipole_test = create_dipole_moments(test, mols)

HBox(children=(IntProgress(value=0, max=45772), HTML(value='')))

In [44]:
dipole_test.head()

Unnamed: 0,dipole_eem2015ba_x,dipole_eem2015ba_y,dipole_eem2015ba_z,dipole_eem2015bm_x,dipole_eem2015bm_y,dipole_eem2015bm_z,dipole_eem2015bn_x,dipole_eem2015bn_y,dipole_eem2015bn_z,dipole_eem2015ha_x,...,moment_eem2015ba,moment_eem2015bm,moment_eem2015bn,moment_eem2015ha,moment_eem2015hm,moment_eem2015hn,moment_gasteiger,moment_mmff94,moment_qeq,moment_qtpie
0,1.831868e-15,0.0,-2.775558e-17,-1.74305e-14,0.0,0.0,1.124101e-14,0.0,2.775558e-17,-6.106227e-16,...,1.832078e-15,1.74305e-14,1.124104e-14,6.106227e-16,6.106227e-16,2.581269e-15,0.0,0.0,8.716683e-16,1.121603e-16
1,0.2767103,0.378973,-0.4974225,0.2390778,0.327434,-0.429776,0.2512039,0.344042,-0.4515744,0.3190803,...,0.683826,0.5908292,0.620796,0.7885332,0.7043739,0.6488648,1.795077,1.687692,0.965406,1.005163
2,7.288001e-06,-1e-05,-1.032895e-07,9.306704e-05,-0.000132,-1e-06,8.646845e-05,-0.000119,-1.226044e-06,-1.64454e-06,...,1.242947e-05,0.0001612929,0.0001472976,2.820404e-06,0.0001574348,0.0001340853,0.000116,0.000359,0.0002170352,0.0003597267
3,0.8953699,0.533465,-0.005753728,0.4656249,0.277433,-0.00301,0.8072865,0.480979,-0.00521825,0.9406552,...,1.04226,0.5420191,0.9397232,1.094976,0.6428395,0.9980073,4.090892,3.342608,2.067661,2.559191
4,0.7068226,0.526501,-0.0680833,0.5732893,0.426672,-0.057323,0.6128178,0.466284,-0.08003477,0.8605125,...,0.8839888,0.7169346,0.7741912,1.057254,0.9024214,0.8075849,1.493993,1.780029,1.123998,1.488473


In [45]:
dipole_train.to_csv('df_dipole_train.csv', index = False)
dipole_test.to_csv('df_dipole_test.csv', index = False)

In [2]:
dipole_train = pd.read_csv('df_dipole_train.csv')
dipole_test = pd.read_csv('df_dipole_test.csv')

In [6]:
dipole_train.head()

Unnamed: 0,dipole_eem2015ba_x,dipole_eem2015ba_y,dipole_eem2015ba_z,dipole_eem2015bm_x,dipole_eem2015bm_y,dipole_eem2015bm_z,dipole_eem2015bn_x,dipole_eem2015bn_y,dipole_eem2015bn_z,dipole_eem2015ha_x,dipole_eem2015ha_y,dipole_eem2015ha_z,dipole_eem2015hm_x,dipole_eem2015hm_y,dipole_eem2015hm_z,dipole_eem2015hn_x,dipole_eem2015hn_y,dipole_eem2015hn_z,dipole_eem_x,dipole_eem_y,dipole_eem_z,dipole_gasteiger_x,dipole_gasteiger_y,dipole_gasteiger_z,dipole_mmff94_x,dipole_mmff94_y,dipole_mmff94_z,dipole_qeq_x,dipole_qeq_y,dipole_qeq_z,dipole_qtpie_x,dipole_qtpie_y,dipole_qtpie_z,molecule_name,moment_eem,moment_eem2015ba,moment_eem2015bm,moment_eem2015bn,moment_eem2015ha,moment_eem2015hm,moment_eem2015hn,moment_gasteiger,moment_mmff94,moment_qeq,moment_qtpie
0,5.257599e-07,-4.171933e-07,9.587587e-09,7e-06,-6e-06,2.25024e-07,7e-06,-6e-06,1.643369e-07,-1.185248e-07,9.461598e-08,-2.6384e-09,7e-06,-6e-06,2.338656e-07,6e-06,-5e-06,1.505996e-07,5.162559e-06,-4e-06,1.080148e-07,3.011039e-06,-3e-06,1.529568e-07,0.0,0.0,0.0,-2.9e-05,2.384273e-05,-9.043553e-07,5.7e-05,-4.6e-05,1.664848e-06,dsgdb9nsd_000001,7e-06,6.712419e-07,9e-06,9e-06,1.516815e-07,9e-06,8e-06,4e-06,0.0,3.8e-05,7.3e-05
1,0.1718592,-0.1150444,-0.3280418,0.173404,-0.116079,-0.3309895,0.152517,-0.102097,-0.2911221,0.1938536,-0.1297679,-0.3700241,0.194019,-0.129879,-0.3703379,0.158096,-0.105831,-0.3017706,0.1658959,-0.111053,-0.3166591,0.2198626,-0.146748,-0.4183036,0.741305,-0.496242,-1.414974,-0.749928,0.502012,1.431667,1.387155,-0.928579,-2.648194,dsgdb9nsd_000002,0.374336,0.3877915,0.391277,0.344147,0.4374206,0.437792,0.356735,0.509807,1.672704,1.692359,3.130397
2,0.4666781,-0.294188,-0.006023551,0.346085,-0.218167,-0.004467017,0.42296,-0.266629,-0.005459272,0.4950977,-0.3121034,-0.006390371,0.404522,-0.255005,-0.005221278,0.432155,-0.272425,-0.005577954,0.3200435,-0.201751,-0.004130896,0.8190548,-0.508684,-0.01037144,1.666745,-1.050695,-0.021513,-1.715619,1.081504,0.02214399,2.885833,-1.819193,-0.03724829,dsgdb9nsd_000003,0.37835,0.5516985,0.409135,0.500016,0.5852957,0.478218,0.510886,1.019033,1.970396,2.028174,3.411581
3,-0.01297169,0.9554743,0.005266079,-0.004135,0.304562,0.001678588,-0.005773,0.425199,0.002343475,-0.01597095,1.176395,0.00648368,-0.005771,0.42508,0.002342819,-0.006056,0.446091,0.002458624,-0.006489974,0.478041,0.002634716,-0.04744895,3.58026,0.02003849,-0.043466,3.201615,0.017646,0.01227,-0.9038,-0.004981276,-0.011307,0.832866,0.004590326,dsgdb9nsd_000005,0.478093,0.9555769,0.304595,0.425244,1.176521,0.425125,0.446139,3.368599,3.201959,0.903897,0.832956
4,-1.057558e-07,1.215925e-07,4.084135e-08,-1e-06,2e-06,6.015577e-07,-1e-06,2e-06,5.300113e-07,2.367787e-08,-2.86475e-08,-9.528736e-09,-1e-06,2e-06,6.020938e-07,-1e-06,1e-06,4.844108e-07,-9.445143e-07,1e-06,3.7488e-07,-6.343331e-07,1e-06,3.580509e-07,0.0,0.0,0.0,3e-06,4.681395e-07,-5.735078e-07,-5e-06,-1e-06,9.073841e-07,dsgdb9nsd_000007,2e-06,1.662439e-07,2e-06,2e-06,3.836819e-08,2e-06,2e-06,1e-06,0.0,3e-06,5e-06


In [5]:
methods = [ "eem", "mmff94", "gasteiger", "qeq", "qtpie", 
               "eem2015ha", "eem2015hm", "eem2015hn", "eem2015ba", "eem2015bm", "eem2015bn" ]

In [9]:
def dip_magn(df, methods):
    for m in methods:
        t_p_0 = df[[f'dipole_{m}_x', f'dipole_{m}_y', f'dipole_{m}_z']].values

        df[f'dipole_{m}_magnitude'] = np.linalg.norm(t_p_0, axis=1)
    return df


In [10]:
dipole_train = dip_magn(dipole_train, methods)

In [12]:
dipole_test = dip_magn(dipole_test, methods)

In [13]:
dipole_test.head()

Unnamed: 0,dipole_eem2015ba_x,dipole_eem2015ba_y,dipole_eem2015ba_z,dipole_eem2015bm_x,dipole_eem2015bm_y,dipole_eem2015bm_z,dipole_eem2015bn_x,dipole_eem2015bn_y,dipole_eem2015bn_z,dipole_eem2015ha_x,dipole_eem2015ha_y,dipole_eem2015ha_z,dipole_eem2015hm_x,dipole_eem2015hm_y,dipole_eem2015hm_z,dipole_eem2015hn_x,dipole_eem2015hn_y,dipole_eem2015hn_z,dipole_eem_x,dipole_eem_y,dipole_eem_z,dipole_gasteiger_x,dipole_gasteiger_y,dipole_gasteiger_z,dipole_mmff94_x,dipole_mmff94_y,dipole_mmff94_z,dipole_qeq_x,dipole_qeq_y,dipole_qeq_z,dipole_qtpie_x,dipole_qtpie_y,dipole_qtpie_z,molecule_name,moment_eem,moment_eem2015ba,moment_eem2015bm,moment_eem2015bn,moment_eem2015ha,moment_eem2015hm,moment_eem2015hn,moment_gasteiger,moment_mmff94,moment_qeq,moment_qtpie,dipole_eem_magnitude,dipole_mmff94_magnitude,dipole_gasteiger_magnitude,dipole_qeq_magnitude,dipole_qtpie_magnitude,dipole_eem2015ha_magnitude,dipole_eem2015hm_magnitude,dipole_eem2015hn_magnitude,dipole_eem2015ba_magnitude,dipole_eem2015bm_magnitude,dipole_eem2015bn_magnitude
0,1.831868e-15,0.0,-2.775558e-17,-1.74305e-14,0.0,0.0,1.124101e-14,0.0,2.775558e-17,-6.106227e-16,0.0,0.0,6.106227e-16,0.0,0.0,-2.581269e-15,0.0,0.0,-2.775558e-16,0.0,2.775558e-17,0.0,0.0,0.0,0.0,0.0,0.0,-8.699708e-16,0.0,5.4373170000000005e-17,-1.121603e-16,0.0,0.0,dsgdb9nsd_000004,2.789401e-16,1.832078e-15,1.74305e-14,1.124104e-14,6.106227e-16,6.106227e-16,2.581269e-15,0.0,0.0,8.716683e-16,1.121603e-16,2.789401e-16,0.0,0.0,8.716683e-16,1.121603e-16,6.106227e-16,6.106227e-16,2.581269e-15,1.832078e-15,1.74305e-14,1.124104e-14
1,0.2767103,0.378973,-0.4974225,0.2390778,0.327434,-0.429776,0.2512039,0.344042,-0.4515744,0.3190803,0.437001,-0.5735877,0.2850238,0.39036,-0.51237,0.2625622,0.359597,-0.471992,0.2070955,0.283632,-0.372283,0.738829,1.018077,-1.339192,0.682925,0.935309,-1.227645,-0.3906487,-0.535023,0.7022485,0.4067337,0.557056,-0.73117,dsgdb9nsd_000015,0.5117912,0.683826,0.5908292,0.620796,0.7885332,0.7043739,0.6488648,1.795077,1.687692,0.965406,1.005163,0.5117912,1.687692,1.83733,0.965406,1.005163,0.7885332,0.7043739,0.6488648,0.683826,0.5908292,0.620796
2,7.288001e-06,-1e-05,-1.032895e-07,9.306704e-05,-0.000132,-1e-06,8.646845e-05,-0.000119,-1.226044e-06,-1.64454e-06,2e-06,2.342386e-08,9.112539e-05,-0.000128,-1e-06,7.879315e-05,-0.000108,-1e-06,6.183242e-05,-8.5e-05,-8.729611e-07,5.5e-05,-9.4e-05,-8.727754e-07,0.000181,-0.00031,-3e-06,-0.0001417779,0.000164,1.908176e-06,0.0002422749,-0.000266,-3e-06,dsgdb9nsd_000016,0.000104796,1.242947e-05,0.0001612929,0.0001472976,2.820404e-06,0.0001574348,0.0001340853,0.000116,0.000359,0.0002170352,0.0003597267,0.000104796,0.000359,0.000109,0.0002170352,0.0003597267,2.820404e-06,0.0001574348,0.0001340853,1.242947e-05,0.0001612929,0.0001472976
3,0.8953699,0.533465,-0.005753728,0.4656249,0.277433,-0.00301,0.8072865,0.480979,-0.00521825,0.9406552,0.560449,-0.006033535,0.5522358,0.329037,-0.003556,0.8573564,0.510811,-0.005532,0.5435602,0.323863,-0.003508873,3.694611,2.276731,-0.0253627,2.871513,1.710877,-0.018582,-1.776195,-1.058407,0.01126997,2.198376,1.31012,-0.013799,dsgdb9nsd_000020,0.6327378,1.04226,0.5420191,0.9397232,1.094976,0.6428395,0.9980073,4.090892,3.342608,2.067661,2.559191,0.6327378,3.342608,4.33985,2.067661,2.559191,1.094976,0.6428395,0.9980073,1.04226,0.5420191,0.9397232
4,0.7068226,0.526501,-0.0680833,0.5732893,0.426672,-0.057323,0.6128178,0.466284,-0.08003477,0.8605125,0.61199,-0.05265603,0.7340641,0.52128,-0.061496,0.6424018,0.482808,-0.080061,0.4525469,0.348833,-0.06447504,1.106591,1.128717,-0.2829856,0.676756,1.135692,-1.191935,-0.512553,-0.931076,0.3657319,0.1146002,1.258317,-0.786802,dsgdb9nsd_000022,0.5750129,0.8839888,0.7169346,0.7741912,1.057254,0.9024214,0.8075849,1.493993,1.780029,1.123998,1.488473,0.5750129,1.780029,1.60581,1.123998,1.488473,1.057254,0.9024214,0.8075849,0.8839888,0.7169346,0.7741912


In [14]:
dipole_train.to_csv('df_dipole_train.csv', index = False)
dipole_test.to_csv('df_dipole_test.csv', index = False)