In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import *
import gc

In [2]:
! ls ../data/*.csv

ls: cannot access '../data/*.csv': No such file or directory


# Data loading

In [3]:
from pathlib import Path
PATH = Path('../../data')

In [4]:
train = pd.read_csv(PATH/'train.csv')[::10]
test = pd.read_csv(PATH/'test.csv')[::10]

In [5]:
def feature_atomtype(df, s):
    # https://www.kaggle.com/jazivxt/all-this-over-a-dog
    df['atom1'] = df['type'].map(lambda x: str(x)[2])
    df['atom2'] = df['type'].map(lambda x: str(x)[3])
    lbl = preprocessing.LabelEncoder()
    for i in range(4):
        df['type'+str(i)] = lbl.fit_transform(df['type'].map(lambda x: str(x)[i]))

    df = pd.merge(df, s.rename(columns={'atom_index':'atom_index_0', 'x':'x0', 'y':'y0', 'z':'z0', 'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
    df = pd.merge(df, s.rename(columns={'atom_index':'atom_index_1', 'x':'x1', 'y':'y1', 'z':'z1', 'atom':'atom2'}), how='left', on=['molecule_name', 'atom_index_1', 'atom2'])
    return df

def feature_pair_geometry(df):
    p0 = df[['x0', 'y0', 'z0']].values
    p1 = df[['x1', 'y1', 'z1']].values
    r = np.linalg.norm(p0 - p1, axis=1)
    df['dist'] = r

    for agg in ['min', 'max', 'mean']:
        tmp = eval('df.groupby(["type"], as_index=False).dist.' + agg + '()')
        tmp.rename(columns={"dist":agg + "_dist"}, inplace=True)
        df = pd.merge(df, tmp, how='left', on=['type'])
    return df

In [6]:
def reduce_mem_usage(df, verbose=True):
    # somewhere from kaggle kernel
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem)
 / start_mem))
    
    return df

In [7]:
def feature_basic(df):
    structures = pd.read_csv(PATH/'structures.csv')
    df = feature_atomtype(df, structures)
    df = feature_pair_geometry(df)
    df = reduce_mem_usage(df)
    gc.collect()
    return df

In [8]:
train = feature_basic(train)
test = feature_basic(test)

Mem. usage decreased to 41.76 Mb (48.9% reduction)
Mem. usage decreased to 21.51 Mb (48.9% reduction)


In [9]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom1,atom2,type0,type1,...,x0,y0,z0,x1,y1,z1,dist,min_dist,max_dist,mean_dist
0,0,dsgdb9nsd_000001,1,0,1JHC,84.807602,H,C,0,0,...,0.00215,-0.006031,0.001976,-0.012698,1.085804,0.008001,1.091953,1.061272,1.121432,1.092919
1,10,dsgdb9nsd_000002,1,0,1JHN,32.6889,H,N,0,0,...,0.017257,0.012545,-0.027377,-0.040426,1.024108,0.062564,1.01719,1.002405,1.083587,1.012903
2,20,dsgdb9nsd_000007,2,1,2JHC,-2.37831,H,C,1,0,...,0.994873,1.939743,0.002941,0.002104,-0.003882,0.001999,2.182492,1.831791,2.52005,2.190124
3,30,dsgdb9nsd_000007,3,6,3JHH,3.25253,H,H,2,0,...,-0.542076,1.923611,-0.865117,-1.011477,-0.418034,0.009508,2.543345,2.07745,3.165045,2.703366
4,40,dsgdb9nsd_000007,5,7,2JHH,-11.6993,H,H,1,0,...,0.525487,-0.401908,0.877544,0.508626,-0.39247,-0.887601,1.765251,1.60663,1.96934,1.774909


In [10]:
# https://www.kaggle.com/adrianoavelar/bond-calculaltion-lb-0-82
def feature_neighbors(s):
    i_atom = s['atom_index'].values
    i_atom_type = s['atom'].values
    p = s[['x', 'y', 'z']].values
    m = s['molecule_name'].values
    t = np.empty((len(s)+1), dtype=np.object)
    t[:len(s)] = s['atom'].values
    p_compare = p
    m_compare = m
    t_compare = t
    source_row = np.arange(len(s))
    max_atoms = max(s.groupby('molecule_name').atom_index.max().values)
    bonds = np.zeros((len(s)+1, max_atoms+1), dtype=np.int8)
    bond_dists = np.zeros((len(s)+1, max_atoms+1), dtype=np.float32)
    bond_atoms = np.empty((len(s)+1, max_atoms+1), dtype=np.object)
    for i in tqdm(range(max_atoms-1)):
        p_compare = np.roll(p_compare, -1, axis=0)
        m_compare = np.roll(m_compare, -1, axis=0)
        t_compare = np.roll(t_compare, -1, axis=0)

        mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
        dists = np.linalg.norm(p - p_compare, axis=1) * mask
        r_bond = 3.0

        bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)

        source_row = source_row
        target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
        target_row = np.where(np.logical_or(target_row > len(s), mask==0), len(s), target_row) #If invalid target, write to dummy row

        source_atom = i_atom
        target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
        target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col

        target_atom_type = np.where(np.logical_or(target_row > len(s), mask==0), '', t[target_row]) #If invalid target, write to dummy row    
        source_atom_type = i_atom_type

        bonds[(source_row, target_atom)] = bond
        bonds[(target_row, source_atom)] = bond
        bond_dists[(source_row, target_atom)] = dists
        bond_dists[(target_row, source_atom)] = dists
        bond_atoms[(source_row, target_atom)] = target_atom_type
        bond_atoms[(target_row, source_atom)] = source_atom_type

    bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
    bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
    bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
    bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col
    bond_atoms = np.delete(bond_atoms, axis=0, obj=-1) #Delete dummy row
    bond_atoms = np.delete(bond_atoms, axis=1, obj=-1) #Delete dummy col
    
    mask = bonds == 1
    bond_lengths_mean = [np.mean(row[mask[j]]) for j,row in enumerate(tqdm(bond_dists))]
    n_bonds = np.sum(bonds, axis=1)
    bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean }
    bond_df = pd.DataFrame(bond_data)
    s = s.join(bond_df)
    
    atom_types = ['C', 'N', 'H', 'O', 'F']
    bond_data = {}
    for at in atom_types:
        bonds_at = np.empty((len(s), max_atoms), dtype=np.int8) 
        bonds_at[:] = -1
        for i in tqdm(range(len(bond_atoms))):
            mask = bond_atoms[i,:] == at
            atom_j_indices = np.argwhere(mask)
            dists = bond_dists[i, mask]
            atom_j_sorted = np.argsort(dists)
            bonds_at[i, :len(atom_j_sorted)] = atom_j_sorted
            
        maxatom = np.max(np.sum(bonds_at >= 0, axis=1))
        for i in range(maxatom):
            bond_data['bond_%s_%d' % (at, i)] = bonds_at[:, i]
    bond_df = pd.DataFrame(bond_data)
    s = s.join(bond_df)
    return s

In [11]:
def feature_bonds(df1, df2):
    structures = pd.read_csv(PATH/'structures.csv')
    structures = feature_neighbors(structures)
    df1 = pd.merge(df1, structures.drop(['x', 'y', 'z'], axis=1).rename(columns={'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
    df1 = pd.merge(df1, structures.drop(['x', 'y', 'z'], axis=1).rename(columns={'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
    df2 = pd.merge(df2, structures.drop(['x', 'y', 'z'], axis=1).rename(columns={'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
    df2 = pd.merge(df2, structures.drop(['x', 'y', 'z'], axis=1).rename(columns={'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])

    df1 = reduce_mem_usage(df1)
    df2 = reduce_mem_usage(df2)
    gc.collect()
    return df1, df2

In [None]:
train, test = feature_bonds(train, test)

100%|██████████| 27/27 [00:07<00:00,  3.72it/s]
100%|██████████| 2358657/2358657 [00:18<00:00, 127733.39it/s]
100%|██████████| 2358657/2358657 [00:25<00:00, 91266.23it/s]
 44%|████▍     | 1048253/2358657 [00:11<00:14, 93139.25it/s]

In [13]:
excluded = ['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'atom1', 'atom2'] + ['x0', 'y0', 'z0', 'x1', 'y1', 'z1']
col = [c for c in train.columns if c not in ['scalar_coupling_constant'] + excluded]
reg = ensemble.ExtraTreesRegressor(n_jobs=-1, n_estimators=20, random_state=4, verbose=1)

In [15]:
col

Index(['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'type',
       'scalar_coupling_constant', 'atom1', 'atom2', 'type0', 'type1',
       ...
       'bond_O_1_y', 'bond_O_2_y', 'bond_O_3_y', 'bond_O_4_y', 'bond_F_0_y',
       'bond_F_1_y', 'bond_F_2_y', 'bond_F_3_y', 'bond_F_4_y', 'bond_F_5_y'],
      dtype='object', length=124)

In [54]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[col], train['scalar_coupling_constant'], test_size=0.2)

In [55]:
reg.fit(X_train.drop(['type'], axis=1), y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   38.9s finished


ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
          oob_score=False, random_state=4, verbose=1, warm_start=False)

In [56]:
# https://www.kaggle.com/uberkinder/efficient-metric
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

In [57]:
y_pred = reg.predict(X_test.drop('type', axis=1))

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.3s finished


In [58]:
group_mean_log_mae(y_test, y_pred, X_test.type)

0.23199779434054785

In [59]:
for t in train.type.unique():
    idx = X_test.type == t
    print(t, (y_test[idx] - y_pred[idx]).abs().mean())

1JHC 3.13440698398015
1JHN 1.8348052993434065
2JHC 1.61181149884822
3JHH 0.9873188273634053
2JHH 0.7384456583758383
3JHC 1.5267124508881924
2JHN 1.047342654303434
3JHN 0.592053938633429


In [37]:
# train with all data
reg.fit(train[col].drop('type', axis=1), train['scalar_coupling_constant'])

  return umr_sum(a, axis, dtype, out, keepdims, initial)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.8s finished


ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
          oob_score=False, random_state=4, verbose=1, warm_start=False)

In [41]:
test['scalar_coupling_constant']  = reg.predict(test[col].drop('type', axis=1))
test[['id', 'scalar_coupling_constant']].to_csv('submission.csv', index=False) #float_format='%.9f'

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.1s finished


In [46]:
from IPython.display import FileLink
FileLink('submission.csv')

In [None]:
# score: ~0.7