In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import pandas as pd
import numpy as np

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

DATA = '/home/ike/Data/Molecular/'

train = pd.read_csv(f"{DATA}train.csv")
test = pd.read_csv(f"{DATA}test.csv")
structure = pd.read_csv(f"{DATA}structures.csv")
data = pd.concat([train.drop(columns=['scalar_coupling_constant']), test], ignore_index=True)
data = data.merge(structure.add_suffix("_0"), how='left', left_on=['molecule_name', 'atom_index_0'], right_on=['molecule_name_0', 'atom_index_0'])\
.merge(structure.add_suffix("_1"), how='left', left_on=['molecule_name', 'atom_index_1'], right_on=['molecule_name_1', 'atom_index_1'])\
.drop(columns=['molecule_name_0', 'molecule_name_1'])

In [2]:
data['dist_x'] = abs(data['x_0']-data['x_1'])
data['dist_y'] = abs(data['y_0']-data['y_1'])
data['dist_z'] = abs(data['z_0']-data['z_1'])
data['dist'] = np.sqrt(data['dist_x']**2 + data['dist_y']**2 + data['dist_z']**2)

In [3]:
data.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,x_1,y_1,z_1,dist_x,dist_y,dist_z,dist
0,0,dsgdb9nsd_000001,1,0,1JHC,H,0.00215,-0.006031,0.001976,C,-0.012698,1.085804,0.008001,0.014849,1.091835,0.006025,1.091953
1,1,dsgdb9nsd_000001,1,2,2JHH,H,0.00215,-0.006031,0.001976,H,1.011731,1.463751,0.000277,1.00958,1.469782,0.0017,1.78312
2,2,dsgdb9nsd_000001,1,3,2JHH,H,0.00215,-0.006031,0.001976,H,-0.540815,1.447527,-0.876644,0.542965,1.453558,0.87862,1.783147
3,3,dsgdb9nsd_000001,1,4,2JHH,H,0.00215,-0.006031,0.001976,H,-0.523814,1.437933,0.906397,0.525964,1.443964,0.904421,1.783157
4,4,dsgdb9nsd_000001,2,0,1JHC,H,1.011731,1.463751,0.000277,C,-0.012698,1.085804,0.008001,1.024429,0.377947,0.007724,1.091952


In [4]:
neighbour = data.rename(columns={
    'atom_index_0': 'atom_index_1',
    'atom_index_1': 'atom_index_0',
    'atom_0': 'atom_1',
    'x_0': 'x_1',
    'y_0': 'y_1',
    'z_0': 'z_1',
    'atom_1': 'atom_0',
    'x_1': 'x_0',
    'y_1': 'y_0',
    'z_1': 'z_0'
}, )
neighbour = pd.concat([
    data[['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'dist']],
    neighbour[['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1', 'dist']]
], ignore_index=True)

In [5]:
center_atom = structure.groupby('molecule_name')[['x', 'y', 'z']].mean()

idx = neighbour.groupby(['molecule_name', 'atom_index_0'])['dist'].idxmax()
farthest_atom = neighbour.loc[idx, ['x_1', 'y_1', 'z_1']]
farthest_atom.index = idx.index
farthest_atom.rename(columns={
    'x_1': 'x',
    'y_1': 'y',
    'z_1': 'z'
}, inplace=True)

idx = neighbour.groupby(['molecule_name', 'atom_index_0'])['dist'].idxmin()
nearest_atom = neighbour.loc[idx, ['x_1', 'y_1', 'z_1']]
nearest_atom.index = idx.index
nearest_atom.rename(columns={
    'x_1': 'x',
    'y_1': 'y',
    'z_1': 'z'
}, inplace=True)

In [6]:
neighbour = neighbour\
.merge(center_atom.add_suffix("_c"), how='left', left_on='molecule_name', right_index=True)\
.merge(farthest_atom.add_suffix("_f0"), how='left', left_on=['molecule_name', 'atom_index_0'], right_index=True)\
.merge(nearest_atom.add_suffix("_n0"), how='left', left_on=['molecule_name', 'atom_index_0'], right_index=True)\
.merge(farthest_atom.add_suffix("_f1"), how='left', left_on=['molecule_name', 'atom_index_1'], right_index=True)\
.merge(nearest_atom.add_suffix("_n1"), how='left', left_on=['molecule_name', 'atom_index_1'], right_index=True)

In [7]:
neighbour['dist_0_n'] = np.sqrt(((neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_n0', 'y_n0', 'z_n0']].values)**2).sum(axis=1)) + 1e-9
neighbour['dist_1_n'] = np.sqrt(((neighbour[['x_1', 'y_1', 'z_1']].values - neighbour[['x_n1', 'y_n1', 'z_n1']].values)**2).sum(axis=1)) + 1e-9
neighbour['dist_0_f'] = np.sqrt(((neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_f0', 'y_f0', 'z_f0']].values)**2).sum(axis=1)) + 1e-9
neighbour['dist_1_f'] = np.sqrt(((neighbour[['x_1', 'y_1', 'z_1']].values - neighbour[['x_f1', 'y_f1', 'z_f1']].values)**2).sum(axis=1)) + 1e-9
neighbour['dist_0_c'] = np.sqrt(((neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_c', 'y_c', 'z_c']].values)**2).sum(axis=1)) + 1e-9
neighbour['dist_1_c'] = np.sqrt(((neighbour[['x_1', 'y_1', 'z_1']].values - neighbour[['x_c', 'y_c', 'z_c']].values)**2).sum(axis=1)) + 1e-9

In [8]:
neighbour['cos_a0n0_a1n1'] = ((neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_n0', 'y_n0', 'z_n0']].values) * (neighbour[['x_1', 'y_1', 'z_1']].values - neighbour[['x_n1', 'y_n1', 'z_n1']].values)).sum(axis=1) / neighbour['dist_0_n'].values / neighbour['dist_1_n'].values
neighbour['cos_a0f0_a1f1'] = ((neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_f0', 'y_f0', 'z_f0']].values) * (neighbour[['x_1', 'y_1', 'z_1']].values - neighbour[['x_f1', 'y_f1', 'z_f1']].values)).sum(axis=1) / neighbour['dist_0_f'].values / neighbour['dist_1_f'].values
neighbour['cos_a0c_a1c'] = ((neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_c', 'y_c', 'z_c']].values) * (neighbour[['x_1', 'y_1', 'z_1']].values - neighbour[['x_c', 'y_c', 'z_c']].values)).sum(axis=1) / neighbour['dist_0_c'].values / neighbour['dist_1_c'].values
neighbour['cos_a0n0_a0a1'] = ((neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_n0', 'y_n0', 'z_n0']].values) * (neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_1', 'y_1', 'z_1']].values)).sum(axis=1) / neighbour['dist_0_n'].values / neighbour['dist'].values
neighbour['cos_a1n1_a0a1'] = ((neighbour[['x_1', 'y_1', 'z_1']].values - neighbour[['x_n1', 'y_n1', 'z_n1']].values) * (neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_1', 'y_1', 'z_1']].values)).sum(axis=1) / neighbour['dist_1_n'].values / neighbour['dist'].values
neighbour['cos_a0f0_a0a1'] = ((neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_f0', 'y_f0', 'z_f0']].values) * (neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_1', 'y_1', 'z_1']].values)).sum(axis=1) / neighbour['dist_0_f'].values / neighbour['dist'].values
neighbour['cos_a1f1_a0a1'] = ((neighbour[['x_1', 'y_1', 'z_1']].values - neighbour[['x_f1', 'y_f1', 'z_f1']].values) * (neighbour[['x_0', 'y_0', 'z_0']].values - neighbour[['x_1', 'y_1', 'z_1']].values)).sum(axis=1) / neighbour['dist_1_f'].values / neighbour['dist'].values

In [9]:
neighbour.drop(columns=[
    'dist',
    'x_c',
    'y_c',
    'z_c',
    'x_f0',
    'y_f0',
    'z_f0',
    'x_n0',
    'y_n0',
    'z_n0',
    'x_f1',
    'y_f1',
    'z_f1',
    'x_n1',
    'y_n1',
    'z_n1'
], inplace=True)

In [10]:
angle_molecule_atom = neighbour.groupby(['molecule_name', 'atom_index_0'])[['cos_a0n0_a0a1', 'cos_a0f0_a0a1']].agg(['mean', 'std', 'min', 'max', 'median', 'skew'])
angle_molecule_atom.columns = [
    'cos_a0_n_a1_molecule_atom_mean',
    'cos_a0_n_a1_molecule_atom_std',
    'cos_a0_n_a1_molecule_atom_min',
    'cos_a0_n_a1_molecule_atom_max',
    'cos_a0_n_a1_molecule_atom_median',
    'cos_a0_n_a1_molecule_atom_skew',
    'cos_a0_f_a1_molecule_atom_mean',
    'cos_a0_f_a1_molecule_atom_std',
    'cos_a0_f_a1_molecule_atom_min',
    'cos_a0_f_a1_molecule_atom_max',
    'cos_a0_f_a1_molecule_atom_median',
    'cos_a0_f_a1_molecule_atom_skew'
]

In [11]:
neighbourghbour = neighbour.merge(angle_molecule_atom.add_suffix('_0'), how='left', left_on=['molecule_name', 'atom_index_0'], right_index=True)\
.merge(angle_molecule_atom.add_suffix('_1'), how='left', left_on=['molecule_name', 'atom_index_1'], right_index=True)

In [12]:
atom_property = neighbour.drop_duplicates(subset=['molecule_name', 'atom_index_0'])

In [13]:
dist_molecule_atom = atom_property.groupby('molecule_name')[['dist_0_n', 'dist_0_f']].agg(['mean', 'std', 'min', 'max', 'median', 'skew'])
dist_molecule_atom.columns = [
    'dist_n_molecule_mean',
    'dist_n_molecule_std',
    'dist_n_molecule_min',
    'dist_n_molecule_max',
    'dist_n_molecule_median',
    'dist_n_molecule_skew',
    'dist_f_molecule_mean',
    'dist_f_molecule_std',
    'dist_f_molecule_min',
    'dist_f_molecule_max',
    'dist_f_molecule_median',
    'dist_f_molecule_skew'
]
dist_molecule_atom_type = atom_property.groupby(['molecule_name', 'atom_0'])[['dist_0_n', 'dist_0_f']].agg(['mean', 'std', 'min', 'max', 'median', 'skew'])
dist_molecule_atom_type.columns = [
    'dist_n_molecule_type_mean',
    'dist_n_molecule_type_std',
    'dist_n_molecule_type_min',
    'dist_n_molecule_type_max',
    'dist_n_molecule_type_median',
    'dist_n_molecule_type_skew',
    'dist_f_molecule_type_mean',
    'dist_f_molecule_type_std',
    'dist_f_molecule_type_min',
    'dist_f_molecule_type_max',
    'dist_f_molecule_type_median',
    'dist_f_molecule_type_skew'
]

In [14]:
neighbour = neighbour.merge(dist_molecule_atom, how='left', left_on='molecule_name', right_index=True)\
.merge(dist_molecule_atom_type.add_suffix("_0"), how='left', left_on=['molecule_name', 'atom_0'], right_index=True)\
.merge(dist_molecule_atom_type.add_suffix("_1"), how='left', left_on=['molecule_name', 'atom_1'], right_index=True)

In [15]:
neighbour.drop(columns=['molecule_name', 'atom_index_0', 'atom_index_1', 'type', 'atom_0', 'x_0', 'y_0', 'z_0', 'atom_1', 'x_1', 'y_1', 'z_1'], inplace=True)
neighbour.fillna(0.0, inplace=True)
neighbour = reduce_mem_usage(neighbour)
data = neighbour.iloc[:7163689, :]
data.to_pickle("angle_feature.gz")

Mem. usage decreased to 2678.08 Mb (50.0% reduction)
