In [1]:
# Enable the autoreload extension in mode 2 so imported modules are reloaded
# before each cell runs.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

import os

import numpy as np
import pandas as pd

def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    inclusive range contains the column's minimum and maximum. Floating
    columns are cast to float32 when both extremes lie inside the float32
    range and to float64 otherwise (float32 keeps fewer significant digits,
    so values may be rounded).

    Columns whose dtype is not listed in ``numerics`` are left untouched.

    Args:
        df: DataFrame to shrink. It is modified in place.
        verbose: When true, print the final size and the relative reduction.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Ordered narrowest first so the first match is the smallest usable type.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_info = np.finfo(np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Bounds are inclusive: a column whose extreme equals the
                # dtype limit (e.g. 127 for int8) still fits in that dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NaN or infinite extremes fail this comparison and fall through
            # to float64.
            if float32_info.min <= c_min and c_max <= float32_info.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the percentage against a zero-sized starting frame.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [2]:
# Directory holding the input CSV files. Set the MOLECULAR_DATA_DIR environment
# variable to point elsewhere instead of editing this cell; the default keeps
# the original location.
DATA = os.environ.get('MOLECULAR_DATA_DIR', '/home/ike/Data/Molecular/')

# os.path.join works whether or not DATA ends with a path separator.
train = pd.read_csv(os.path.join(DATA, "train.csv"))
test = pd.read_csv(os.path.join(DATA, "test.csv"))
structure = pd.read_csv(os.path.join(DATA, "structures.csv"))


In [3]:
# Stack train (without the target column) on top of test, then attach the
# structure row of each atom in the pair; the _0 / _1 suffixes mark which atom
# the structure columns belong to.
data = (
    pd.concat(
        [train.drop(columns=['scalar_coupling_constant']), test],
        ignore_index=True,
    )
    .merge(
        structure.add_suffix("_0"),
        how='left',
        left_on=['molecule_name', 'atom_index_0'],
        right_on=['molecule_name_0', 'atom_index_0'],
    )
    .merge(
        structure.add_suffix("_1"),
        how='left',
        left_on=['molecule_name', 'atom_index_1'],
        right_on=['molecule_name_1', 'atom_index_1'],
    )
    # The suffixed molecule names duplicate `molecule_name` after the joins.
    .drop(columns=['molecule_name_0', 'molecule_name_1'])
)

In [56]:
# NOTE(review): this cell was executed out of order (In [56]); the saved output
# already shows the per-molecule distance aggregates that are only computed in
# later cells, so it does not reflect `data` at this point of a fresh run.
data.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,atom_0,x_0,y_0,z_0,atom_1,...,dist_molecule_mean,dist_molecule_std,dist_x_molecule_min,dist_x_molecule_max,dist_y_molecule_min,dist_y_molecule_max,dist_z_molecule_min,dist_z_molecule_max,dist_molecule_min,dist_molecule_max
0,0,dsgdb9nsd_000001,1,0,1JHC,H,0.00215,-0.006031,0.001976,C,...,1.506668,0.356932,0.014849,1.552546,0.009594,1.469782,0.0017,1.783041,1.091946,1.783158
1,1,dsgdb9nsd_000001,1,2,2JHH,H,0.00215,-0.006031,0.001976,H,...,1.506668,0.356932,0.014849,1.552546,0.009594,1.469782,0.0017,1.783041,1.091946,1.783158
2,2,dsgdb9nsd_000001,1,3,2JHH,H,0.00215,-0.006031,0.001976,H,...,1.506668,0.356932,0.014849,1.552546,0.009594,1.469782,0.0017,1.783041,1.091946,1.783158
3,3,dsgdb9nsd_000001,1,4,2JHH,H,0.00215,-0.006031,0.001976,H,...,1.506668,0.356932,0.014849,1.552546,0.009594,1.469782,0.0017,1.783041,1.091946,1.783158
4,4,dsgdb9nsd_000001,2,0,1JHC,H,1.011731,1.463751,0.000277,C,...,1.506668,0.356932,0.014849,1.552546,0.009594,1.469782,0.0017,1.783041,1.091946,1.783158


In [4]:
# Absolute coordinate gap between the two atoms of each pair, per axis.
data['dist_x'] = (data['x_0'] - data['x_1']).abs()
data['dist_y'] = (data['y_0'] - data['y_1']).abs()
data['dist_z'] = (data['z_0'] - data['z_1']).abs()
# Euclidean distance assembled from the three axis gaps.
data['dist'] = np.sqrt(
    data['dist_x'].pow(2) + data['dist_y'].pow(2) + data['dist_z'].pow(2)
)

In [5]:
# Broadcast summary statistics of every distance column back onto each row,
# first per molecule and then per (molecule, coupling type).
# Columns are created in the same order and with the same names as before:
#   <dist column>_<group label>_<statistic>
dist_columns = ['dist_x', 'dist_y', 'dist_z', 'dist']
dist_stats = ['mean', 'std', 'min', 'max', 'median', 'skew']
dist_groupings = [
    ('molecule_name', 'molecule'),
    (['molecule_name', 'type'], 'molecule_type'),
]

for group_keys, group_label in dist_groupings:
    for dist_col in dist_columns:
        # Build the groupby once per (grouping, column) rather than once per
        # statistic.
        grouped = data.groupby(group_keys)[dist_col]
        for stat in dist_stats:
            data[f'{dist_col}_{group_label}_{stat}'] = grouped.transform(stat)

# Replace every remaining NaN in the table with 0.0 (this covers statistics
# that are undefined for very small groups, as well as any other missing value).
data.fillna(0.0, inplace=True)

In [6]:
# Row-normalised contingency tables indexed by molecule: the share of each
# `atom` value among the structure rows, and the share of each coupling `type`
# among the pair rows.
atom_molecule = pd.crosstab(
    index=structure['molecule_name'],
    columns=structure['atom'],
    normalize='index',
)
bond_molecule = pd.crosstab(
    index=data['molecule_name'],
    columns=data['type'],
    normalize='index',
)

In [7]:
# Every pair is listed twice, once from each atom's point of view: the first
# half takes the columns as they are, the second half swaps the _0/_1 roles.
# Mapping: output column -> (source for first half, source for second half).
neighbour_sources = {
    'molecule_name': ('molecule_name', 'molecule_name'),
    'atom_index_0': ('atom_index_0', 'atom_index_1'),
    'atom_index_1': ('atom_index_1', 'atom_index_0'),
    'atom_0': ('atom_0', 'atom_1'),
    'atom_1': ('atom_1', 'atom_0'),
    'type': ('type', 'type'),
    'dist_x': ('dist_x', 'dist_x'),
    'dist_y': ('dist_y', 'dist_y'),
    'dist_z': ('dist_z', 'dist_z'),
    'dist': ('dist', 'dist'),
}
neighbour = pd.DataFrame(
    {
        target: np.hstack([data[first_half], data[second_half]])
        for target, (first_half, second_half) in neighbour_sources.items()
    },
    columns=list(neighbour_sources),
)

In [8]:
# Both tables are indexed by (molecule, atom) and row-normalised: the share of
# each coupling type, and of each partner-atom value, among the atom's pairs.
atom_key = [neighbour['molecule_name'], neighbour['atom_index_0']]
bond_molecule_atom = pd.crosstab(atom_key, neighbour['type'], normalize='index')
atom_molecule_atom = pd.crosstab(atom_key, neighbour['atom_1'], normalize='index')

In [9]:
# Per-(molecule, atom) summary of the atom's pairs: how many there are, plus
# mean/std/min/max/median/skew of each distance column.
# The groupby is built once and reused; column order and names are unchanged:
#   neighbour_<statistic>_<dist column>_molecule_atom
by_atom = neighbour.groupby(['molecule_name', 'atom_index_0'])
neighbour_dist_columns = ['dist', 'dist_x', 'dist_y', 'dist_z']
neighbour_stats = ['mean', 'std', 'min', 'max', 'median', 'skew']

atom_property = pd.concat(
    [by_atom['atom_index_1'].count().rename("neighbour_count_molecule_atom")]
    + [
        # getattr calls the same groupby method (.mean(), .std(), ...) that
        # the hand-written version called.
        getattr(by_atom[dist_col], stat)().rename(f'neighbour_{stat}_{dist_col}_molecule_atom')
        for dist_col in neighbour_dist_columns
        for stat in neighbour_stats
    ],
    axis=1,
)
# Replace NaN statistics (e.g. undefined for atoms with very few pairs) with 0.0.
atom_property.fillna(0.0, inplace=True)

In [10]:
# Append the per-atom coupling-type and partner-atom shares, prefixed with
# "neighbour_", to the per-atom statistics.
atom_property = pd.concat(
    [
        atom_property,
        bond_molecule_atom.add_prefix("neighbour_"),
        atom_molecule_atom.add_prefix("neighbour_"),
    ],
    axis=1,
)

In [11]:
# Join the molecule-level tables on the molecule name, then the atom-level
# table once for each atom of the pair (suffix _0 / _1).
data = (
    data
    .merge(atom_molecule, how='left', left_on='molecule_name', right_index=True)
    .merge(bond_molecule, how='left', left_on='molecule_name', right_index=True)
    .merge(
        atom_property.add_suffix("_0"),
        how='left',
        left_on=['molecule_name', 'atom_index_0'],
        right_index=True,
    )
    .merge(
        atom_property.add_suffix("_1"),
        how='left',
        left_on=['molecule_name', 'atom_index_1'],
        right_index=True,
    )
)

In [12]:
# Downcast the numeric columns of the merged feature table; reduce_mem_usage
# modifies the frame in place and returns it.
data = reduce_mem_usage(data)

Mem. usage decreased to 4501.08 Mb (47.5% reduction)


In [13]:
# Write to "basic.gz", the file name the loading cell reads back with
# pd.read_pickle("basic.gz"); compression is passed by keyword because
# positional use is deprecated in recent pandas.
data.to_pickle("basic.gz", compression='gzip')

Load the cached features from disk

In [3]:
# Columns of the angle-feature table that are discarded right after loading.
angle_columns_to_skip = [
    'dist',
    'x_c', 'y_c', 'z_c',
    'x_f0', 'y_f0', 'z_f0',
    'x_n0', 'y_n0', 'z_n0',
    'x_f1', 'y_f1', 'z_f1',
    'x_n1', 'y_n1', 'z_n1',
]
angle = pd.read_pickle("angle_feature.gz").drop(columns=angle_columns_to_skip)

# Place the angle features next to the basic features, column-wise.
# NOTE(review): assumes both pickles share the same row index — confirm.
data = pd.concat([pd.read_pickle("basic.gz"), angle], axis=1)

In [6]:
# Print every angle-feature column name on its own line.
for feature_name in angle.columns.tolist():
    print(feature_name)

dist_0_n
dist_1_n
dist_0_f
dist_1_f
dist_0_c
dist_1_c
cos_a0n0_a1n1
cos_a0f0_a1f1
cos_a0c_a1c
cos_a0n0_a0a1
cos_a1n1_a0a1
cos_a0f0_a0a1
cos_a1f1_a0a1
dist_n_molecule_mean
dist_n_molecule_std
dist_n_molecule_min
dist_n_molecule_max
dist_n_molecule_median
dist_n_molecule_skew
dist_f_molecule_mean
dist_f_molecule_std
dist_f_molecule_min
dist_f_molecule_max
dist_f_molecule_median
dist_f_molecule_skew
dist_c_molecule_mean
dist_c_molecule_std
dist_c_molecule_min
dist_c_molecule_max
dist_c_molecule_median
dist_c_molecule_skew
dist_n_molecule_type_mean_0
dist_n_molecule_type_std_0
dist_n_molecule_type_min_0
dist_n_molecule_type_max_0
dist_n_molecule_type_median_0
dist_n_molecule_type_skew_0
dist_f_molecule_type_mean_0
dist_f_molecule_type_std_0
dist_f_molecule_type_min_0
dist_f_molecule_type_max_0
dist_f_molecule_type_median_0
dist_f_molecule_type_skew_0
dist_c_molecule_type_mean_0
dist_c_molecule_type_std_0
dist_c_molecule_type_min_0
dist_c_molecule_type_max_0
dist_c_molecule_type_median_0
di

In [4]:
# Identifiers, atom/type labels and raw coordinates are removed before the
# table is split into model inputs.
non_feature_columns = [
    'id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
    'atom_0', 'x_0', 'y_0', 'z_0',
    'atom_1', 'x_1', 'y_1', 'z_1',
]
data.drop(columns=non_feature_columns, inplace=True)

In [5]:
# Remove every aggregate built from the `dist_c` distances: the per-molecule
# ones and the per-(molecule, type) ones for both atoms.
dist_c_aggregate_columns = [
    'dist_c_molecule_mean', 'dist_c_molecule_std',
    'dist_c_molecule_min', 'dist_c_molecule_max',
    'dist_c_molecule_median', 'dist_c_molecule_skew',
    'dist_c_molecule_type_mean_0', 'dist_c_molecule_type_std_0',
    'dist_c_molecule_type_min_0', 'dist_c_molecule_type_max_0',
    'dist_c_molecule_type_median_0', 'dist_c_molecule_type_skew_0',
    'dist_c_molecule_type_mean_1', 'dist_c_molecule_type_std_1',
    'dist_c_molecule_type_min_1', 'dist_c_molecule_type_max_1',
    'dist_c_molecule_type_median_1', 'dist_c_molecule_type_skew_1',
]
data.drop(columns=dist_c_aggregate_columns, inplace=True)

In [6]:
# Remove the per-(molecule, type) min/max aggregates of the `dist_n` and
# `dist_f` distances for both atoms.
type_min_max_columns = [
    'dist_n_molecule_type_min_1', 'dist_n_molecule_type_max_1',
    'dist_n_molecule_type_min_0', 'dist_n_molecule_type_max_0',
    'dist_f_molecule_type_min_1', 'dist_f_molecule_type_max_1',
    'dist_f_molecule_type_min_0', 'dist_f_molecule_type_max_0',
]
data.drop(columns=type_min_max_columns, inplace=True)

In [11]:
# Remove the per-molecule min/max aggregates of the `dist_n` and `dist_f`
# distances.
molecule_min_max_columns = [
    'dist_n_molecule_min', 'dist_n_molecule_max',
    'dist_f_molecule_min', 'dist_f_molecule_max',
]
data.drop(columns=molecule_min_max_columns, inplace=True)

In [12]:
# `data` is built as the train rows followed by the test rows, so the split
# point is the number of train rows. Deriving it from `train` replaces the
# hard-coded 4658147 and keeps X aligned with the `train['type']` masks that
# are applied to it during cross-validation.
n_train = len(train)
Xtest = data.iloc[n_train:, :]
X = data.iloc[:n_train, :]

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

# KFold only uses random_state when shuffle=True, and current scikit-learn
# raises a ValueError when a seed is passed without shuffling. The folds were
# never shuffled, so say so explicitly; the contiguous splits are unchanged.
folds = KFold(n_splits=5, shuffle=False)
# Regression target, taken from the train table.
y = train.scalar_coupling_constant

In [13]:
# Cross-validate one linear model per coupling type; `loss` collects the mean
# validation MAE of each type, in order of first appearance in train['type'].
loss = []
for bond in pd.unique(train['type']):
    is_bond = train['type'] == bond
    X_bond = X[is_bond]
    y_bond = y[is_bond]
    lr = LinearRegression()
    fold_mae = []
    for fit_idx, valid_idx in folds.split(X_bond, y=y_bond):
        # The same estimator object is refitted from scratch on every fold.
        lr.fit(X_bond.iloc[fit_idx, :], y_bond.iloc[fit_idx])
        predicted = lr.predict(X_bond.iloc[valid_idx, :])
        fold_mae.append(mean_absolute_error(y_bond.iloc[valid_idx], predicted))
    loss.append(np.mean(fold_mae))

In [10]:
# Mean of log(MAE) over the coupling types; divide by the number of entries
# actually in `loss` instead of a hard-coded 8.
print(f"Final loss: {sum(np.log(itm) for itm in loss) / len(loss)}")

Final loss: 0.560250778031026


In [14]:
# Mean of log(MAE) over the coupling types; divide by the number of entries
# actually in `loss` instead of a hard-coded 8.
print(f"Final loss: {sum(np.log(itm) for itm in loss) / len(loss)}")

Final loss: 0.5606243476724897
