In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from tqdm import tqdm_notebook as tqdm
%matplotlib inline
%load_ext line_profiler

In [2]:
# List the entries of the data directory (path is relative to the notebook's working directory).
os.listdir('../../data/')

['test.csv',
 'potential_energy.csv',
 'structures.csv',
 'init',
 'train.csv',
 'sample_submission.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'scalar_coupling_contributions.csv',
 '.ipynb_checkpoints',
 'magnetic_shielding_tensors.csv']

In [3]:
# Load the atom-pair table (one row per coupled pair of atoms in a molecule)
# and the per-atom structure table (element symbol plus x/y/z coordinates).
train_df = pd.read_csv('../../data/train.csv')
structure_df = pd.read_csv('../../data/structures.csv')

In [4]:
# Preview the first rows of the atom-pair table.
train_df.head()


Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


In [70]:
# Load the per-element property table (keyed by element symbol) from a pickle.
# NOTE(review): unpickling can execute arbitrary code — only load this file
# when it comes from a trusted source.
radius = pd.read_pickle('../../pkl/atom_radius.pkl')

In [71]:
# Display the loaded element -> properties mapping.
radius

{'C': {'radius': 0.67, 'electrons': 6, 'atomic_weight': 12.0107},
 'H': {'radius': 0.53, 'electrons': 1, 'atomic_weight': 1.0079},
 'N': {'radius': 0.56, 'electrons': 7, 'atomic_weight': 14.0067},
 'O': {'radius': 0.48, 'electrons': 8, 'atomic_weight': 15.9994},
 'F': {'radius': 0.42, 'electrons': 9, 'atomic_weight': 18.9984}}

In [7]:
# Preview the first rows of the per-atom structure table.
structure_df.head()

Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [8]:
# Spot-check the nested lookup: element symbol first, then property name.
radius["C"]["radius"]

0.67

In [9]:
# Build a (n_atoms, 1) float32 column holding the radius of every atom listed
# in structure_df.
#
# The column is sized from structure_df itself; sizing it from train_df only
# worked because that table happens to have more rows. The lookup is done in a
# single vectorized pass instead of a per-row `iloc` loop, and it reads the
# `atom` column by name rather than by position.
radius_by_atom = {symbol: props["radius"] for symbol, props in radius.items()}
add = (
    structure_df["atom"]
    .map(radius_by_atom)
    .fillna(0.0)  # element symbols without a radius entry keep the 0.0 default
    .astype(np.float32)
    .values
    .reshape(-1, 1)
)
    

HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




In [10]:
# (rows, columns) of the atom-pair table.
train_df.shape

(4658147, 6)

In [11]:
# (rows, columns) of the per-atom structure table.
structure_df.shape

(2358657, 6)

In [12]:
# Keep exactly one radius row per structure_df row. The length is taken from
# the frame rather than a hard-coded row count, so the cell stays correct if
# the structures file changes.
add = add[:len(structure_df), :]

In [13]:
# Wrap the radius array in a DataFrame; no column names are given, so the
# single column gets the default integer label 0.
add_df = pd.DataFrame(add)

In [14]:
# Place the radius column next to the structure columns (rows are matched on
# the index of the two frames).
s_df = pd.concat(objs=[structure_df, add_df], axis="columns")

In [25]:
# def test():
#     a = np.zeros((len(train_df),2),dtype=np.float32)
#     stock = None
#     molecule_name_atom = None
#     for i,row in tqdm(train_df.head().iterrows(),total=len(train_df)):
#         if stock != row[1]:
#             stock = row[1]
#             molecule_name_atom = s_df[s_df['molecule_name']==stock].values
#         a[i,0] = molecule_name_atom[np.where(molecule_name_atom[:,1]==row[2])[0],6]
#         a[i,1] = molecule_name_atom[np.where(molecule_name_atom[:,1]==row[3])[0],6]

In [74]:
def make_atom_dic(structures=None, atom_props=None):
    """Build a nested lookup ``{molecule_name: {atom_index: properties}}``.

    Parameters
    ----------
    structures : pandas.DataFrame, optional
        Frame with ``molecule_name``, ``atom_index`` and ``atom`` columns.
        Defaults to the notebook-level ``structure_df``.
    atom_props : mapping, optional
        Maps an element symbol to its properties record.
        Defaults to the notebook-level ``radius``.

    Returns
    -------
    dict
        One inner dict per molecule, mapping each atom index to the
        properties record of that atom's element. The records are the same
        objects held by ``atom_props`` (they are not copied).

    Raises
    ------
    KeyError
        If an element symbol in ``structures`` is missing from ``atom_props``.
    """
    if structures is None:
        structures = structure_df
    if atom_props is None:
        atom_props = radius

    moleculer_dic = {}
    # Read the three columns by name and walk them in lockstep: this avoids
    # positional `row[n]` access on a label-indexed Series (deprecated in
    # recent pandas) and the per-row Series construction of `iterrows`.
    rows = zip(
        structures["molecule_name"],
        structures["atom_index"],
        structures["atom"],
    )
    for molecule_name, atom_index, atom in rows:
        moleculer_dic.setdefault(molecule_name, {})[atom_index] = atom_props[atom]
    return moleculer_dic

In [75]:
# Build the molecule -> atom index -> element properties lookup once; the
# feature functions below read it as a notebook-level global.
moleculer_dic = make_atom_dic()


HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))




In [76]:
# Spot-check the lookup for a single molecule.
moleculer_dic["dsgdb9nsd_000001"]

{0: {'radius': 0.67, 'electrons': 6, 'atomic_weight': 12.0107},
 1: {'radius': 0.53, 'electrons': 1, 'atomic_weight': 1.0079},
 2: {'radius': 0.53, 'electrons': 1, 'atomic_weight': 1.0079},
 3: {'radius': 0.53, 'electrons': 1, 'atomic_weight': 1.0079},
 4: {'radius': 0.53, 'electrons': 1, 'atomic_weight': 1.0079}}

In [85]:
def add_radius_feature(pairs_df=None, atom_lookup=None):
    """Return the radius of both atoms of every pair as an ``(n, 2)`` array.

    Parameters
    ----------
    pairs_df : pandas.DataFrame, optional
        Frame with ``molecule_name``, ``atom_index_0`` and ``atom_index_1``
        columns. Defaults to the notebook-level ``train_df``.
    atom_lookup : mapping, optional
        ``{molecule_name: {atom_index: {"radius": ...}}}``.
        Defaults to the notebook-level ``moleculer_dic``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(pairs_df), 2)``; column 0 belongs to
        ``atom_index_0`` and column 1 to ``atom_index_1``.

    Raises
    ------
    KeyError
        If a molecule name or atom index is missing from ``atom_lookup``.
    """
    if pairs_df is None:
        pairs_df = train_df
    if atom_lookup is None:
        atom_lookup = moleculer_dic

    feature = np.zeros((len(pairs_df), 2), dtype=np.float32)
    rows = zip(
        pairs_df["molecule_name"],
        pairs_df["atom_index_0"],
        pairs_df["atom_index_1"],
    )
    # Write by position, not by index label, so a filtered or re-indexed frame
    # still fills rows 0..n-1; columns are read by name instead of `row[n]`.
    for pos, (molecule_name, first_atom, second_atom) in enumerate(rows):
        atoms = atom_lookup[molecule_name]
        feature[pos, 0] = atoms[first_atom]["radius"]
        feature[pos, 1] = atoms[second_atom]["radius"]
    return feature

In [90]:
def add_electrons_feature(pairs_df=None, atom_lookup=None):
    """Return the electron count of both atoms of every pair as an ``(n, 2)`` array.

    Parameters
    ----------
    pairs_df : pandas.DataFrame, optional
        Frame with ``molecule_name``, ``atom_index_0`` and ``atom_index_1``
        columns. Defaults to the notebook-level ``train_df``.
    atom_lookup : mapping, optional
        ``{molecule_name: {atom_index: {"electrons": ...}}}``.
        Defaults to the notebook-level ``moleculer_dic``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(pairs_df), 2)``; column 0 belongs to
        ``atom_index_0`` and column 1 to ``atom_index_1``.

    Raises
    ------
    KeyError
        If a molecule name or atom index is missing from ``atom_lookup``.
    """
    if pairs_df is None:
        pairs_df = train_df
    if atom_lookup is None:
        atom_lookup = moleculer_dic

    feature = np.zeros((len(pairs_df), 2), dtype=np.float32)
    rows = zip(
        pairs_df["molecule_name"],
        pairs_df["atom_index_0"],
        pairs_df["atom_index_1"],
    )
    # Write by position, not by index label, so a filtered or re-indexed frame
    # still fills rows 0..n-1; columns are read by name instead of `row[n]`.
    for pos, (molecule_name, first_atom, second_atom) in enumerate(rows):
        atoms = atom_lookup[molecule_name]
        feature[pos, 0] = atoms[first_atom]["electrons"]
        feature[pos, 1] = atoms[second_atom]["electrons"]
    return feature

In [90]:
def add_atomic_weight_feature(pairs_df=None, atom_lookup=None):
    """Return the atomic weight of both atoms of every pair as an ``(n, 2)`` array.

    Parameters
    ----------
    pairs_df : pandas.DataFrame, optional
        Frame with ``molecule_name``, ``atom_index_0`` and ``atom_index_1``
        columns. Defaults to the notebook-level ``train_df``.
    atom_lookup : mapping, optional
        ``{molecule_name: {atom_index: {"atomic_weight": ...}}}``.
        Defaults to the notebook-level ``moleculer_dic``.

    Returns
    -------
    numpy.ndarray
        float32 array of shape ``(len(pairs_df), 2)``; column 0 belongs to
        ``atom_index_0`` and column 1 to ``atom_index_1``.

    Raises
    ------
    KeyError
        If a molecule name or atom index is missing from ``atom_lookup``.
    """
    if pairs_df is None:
        pairs_df = train_df
    if atom_lookup is None:
        atom_lookup = moleculer_dic

    feature = np.zeros((len(pairs_df), 2), dtype=np.float32)
    rows = zip(
        pairs_df["molecule_name"],
        pairs_df["atom_index_0"],
        pairs_df["atom_index_1"],
    )
    # Write by position, not by index label, so a filtered or re-indexed frame
    # still fills rows 0..n-1; columns are read by name instead of `row[n]`.
    for pos, (molecule_name, first_atom, second_atom) in enumerate(rows):
        atoms = atom_lookup[molecule_name]
        feature[pos, 0] = atoms[first_atom]["atomic_weight"]
        feature[pos, 1] = atoms[second_atom]["atomic_weight"]
    return feature

In [86]:
# Compute the two-column radius feature for every row of train_df.
radius_feature = add_radius_feature()

HBox(children=(IntProgress(value=0, max=4658147), HTML(value='')))




In [91]:
# Compute the two-column electron-count feature for every row of train_df.
electrons_feature = add_electrons_feature()

HBox(children=(IntProgress(value=0, max=4658147), HTML(value='')))




In [87]:
# Inspect the radius feature array.
radius_feature

array([[0.53, 0.67],
       [0.53, 0.53],
       [0.53, 0.53],
       ...,
       [0.53, 0.67],
       [0.53, 0.67],
       [0.53, 0.67]], dtype=float32)

In [93]:
# Inspect the electron-count feature array.
electrons_feature

array([[1., 6.],
       [1., 1.],
       [1., 1.],
       ...,
       [1., 6.],
       [1., 6.],
       [1., 6.]], dtype=float32)

In [28]:
# Line-profile `test` with the %lprun magic from the line_profiler extension.
# NOTE(review): no live cell visible in this notebook defines `test` (the only
# definition shown is commented out), so this cell presumably fails on a fresh
# kernel — confirm whether the function should be restored or this cell removed.
%lprun  -f test test()

HBox(children=(IntProgress(value=0, max=2358657), HTML(value='')))

molecule_name    dsgdb9nsd_000001
atom_index                      0
atom                            C
x                      -0.0126981
y                          1.0858
z                        0.008001
Name: 0, dtype: object
molecule_name    dsgdb9nsd_000001
atom_index                      1
atom                            H
x                      0.00215042
y                     -0.00603132
z                      0.00197612
Name: 1, dtype: object
molecule_name    dsgdb9nsd_000001
atom_index                      2
atom                            H
x                         1.01173
y                         1.46375
z                     0.000276575
Name: 2, dtype: object
molecule_name    dsgdb9nsd_000001
atom_index                      3
atom                            H
x                       -0.540815
y                         1.44753
z                       -0.876644
Name: 3, dtype: object
molecule_name    dsgdb9nsd_000001
atom_index                      4
atom                    

In [89]:
# Persist the radius feature array to the shared pickle directory.
pd.to_pickle(radius_feature, '../../pkl/radius_feature.pkl')

In [94]:
# Persist the electron-count feature array to the shared pickle directory.
pd.to_pickle(electrons_feature, '../../pkl/electrons_feature.pkl')