In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import *
import gc

In [2]:
! ls ../data/*.csv

ls: cannot access '../data/*.csv': No such file or directory


# Data loading

In [3]:
from pathlib import Path
PATH = Path('../../data')

In [4]:
train = pd.read_csv(PATH/'train.csv')[::10]
test = pd.read_csv(PATH/'test.csv')[::10]

In [5]:
def feature_atomtype(df, s):
    # https://www.kaggle.com/jazivxt/all-this-over-a-dog
    df['atom1'] = df['type'].map(lambda x: str(x)[2])
    df['atom2'] = df['type'].map(lambda x: str(x)[3])
    lbl = preprocessing.LabelEncoder()
    for i in range(4):
        df['type'+str(i)] = lbl.fit_transform(df['type'].map(lambda x: str(x)[i]))

    df = pd.merge(df, s.rename(columns={'atom_index':'atom_index_0', 'x':'x0', 'y':'y0', 'z':'z0', 'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
    df = pd.merge(df, s.rename(columns={'atom_index':'atom_index_1', 'x':'x1', 'y':'y1', 'z':'z1', 'atom':'atom2'}), how='left', on=['molecule_name', 'atom_index_1', 'atom2'])
    return df

def feature_pair_geometry(df):
    p0 = df[['x0', 'y0', 'z0']].values
    p1 = df[['x1', 'y1', 'z1']].values
    r = np.linalg.norm(p0 - p1, axis=1)
    df['dist'] = r

    for agg in ['min', 'max', 'mean']:
        tmp = eval('df.groupby(["type"], as_index=False).dist.' + agg + '()')
        tmp.rename(columns={"dist":agg + "_dist"}, inplace=True)
        df = pd.merge(df, tmp, how='left', on=['type'])
    return df

In [6]:
def reduce_mem_usage(df, verbose=True):
    # somewhere from kaggle kernel
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem)
 / start_mem))
    
    return df

In [7]:
def feature_basic(df):
    structures = pd.read_csv(PATH/'structures.csv')
    df = feature_atomtype(df, structures)
    df = feature_pair_geometry(df)
    df = reduce_mem_usage(df)
    gc.collect()
    return df

In [8]:
train = feature_basic(train)
test = feature_basic(test)

Mem. usage decreased to 41.76 Mb (48.9% reduction)
Mem. usage decreased to 21.51 Mb (48.9% reduction)


In [9]:
train.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant,atom1,atom2,type0,type1,...,x0,y0,z0,x1,y1,z1,dist,min_dist,max_dist,mean_dist
0,0,dsgdb9nsd_000001,1,0,1JHC,84.807602,H,C,0,0,...,0.00215,-0.006031,0.001976,-0.012698,1.085804,0.008001,1.091953,1.061272,1.121432,1.092919
1,10,dsgdb9nsd_000002,1,0,1JHN,32.6889,H,N,0,0,...,0.017257,0.012545,-0.027377,-0.040426,1.024108,0.062564,1.01719,1.002405,1.083587,1.012903
2,20,dsgdb9nsd_000007,2,1,2JHC,-2.37831,H,C,1,0,...,0.994873,1.939743,0.002941,0.002104,-0.003882,0.001999,2.182492,1.831791,2.52005,2.190124
3,30,dsgdb9nsd_000007,3,6,3JHH,3.25253,H,H,2,0,...,-0.542076,1.923611,-0.865117,-1.011477,-0.418034,0.009508,2.543345,2.07745,3.165045,2.703366
4,40,dsgdb9nsd_000007,5,7,2JHH,-11.6993,H,H,1,0,...,0.525487,-0.401908,0.877544,0.508626,-0.39247,-0.887601,1.765251,1.60663,1.96934,1.774909


In [44]:
# https://www.kaggle.com/adrianoavelar/bond-calculaltion-lb-0-82
def feature_neighbors(s):
    i_atom = s['atom_index'].values
    i_atom_type = s['atom'].values
    p = s[['x', 'y', 'z']].values
    m = s['molecule_name'].values
    t = np.empty((len(s)+1), dtype=np.object)
    t[:len(s)] = s['atom'].values
    p_compare = p
    m_compare = m
    t_compare = t
    source_row = np.arange(len(s))
    max_atoms = max(s.groupby('molecule_name').atom_index.max().values)
    bonds = np.zeros((len(s)+1, max_atoms+1), dtype=np.int8)
    bond_dists = np.zeros((len(s)+1, max_atoms+1), dtype=np.float32)
    bond_atoms = np.empty((len(s)+1, max_atoms+1), dtype=np.object)
    for i in tqdm(range(max_atoms-1)):
        p_compare = np.roll(p_compare, -1, axis=0)
        m_compare = np.roll(m_compare, -1, axis=0)
        t_compare = np.roll(t_compare, -1, axis=0)

        mask = np.where(m == m_compare, 1, 0) #Are we still comparing atoms in the same molecule?
        dists = np.linalg.norm(p - p_compare, axis=1) * mask
        r_bond = 3.0

        bond = np.where(np.logical_and(dists > 0.0001, dists < r_bond), 1, 0)

        source_row = source_row
        target_row = source_row + i + 1 #Note: Will be out of bounds of bonds array for some values of i
        target_row = np.where(np.logical_or(target_row > len(s), mask==0), len(s), target_row) #If invalid target, write to dummy row

        source_atom = i_atom
        target_atom = i_atom + i + 1 #Note: Will be out of bounds of bonds array for some values of i
        target_atom = np.where(np.logical_or(target_atom > max_atoms, mask==0), max_atoms, target_atom) #If invalid target, write to dummy col

        target_atom_type = np.where(np.logical_or(target_row > len(s), mask==0), '', t[target_row]) #If invalid target, write to dummy row    
        source_atom_type = i_atom_type

        bonds[(source_row, target_atom)] = bond
        bonds[(target_row, source_atom)] = bond
        bond_dists[(source_row, target_atom)] = dists
        bond_dists[(target_row, source_atom)] = dists
        bond_atoms[(source_row, target_atom)] = target_atom_type
        bond_atoms[(target_row, source_atom)] = source_atom_type

    bonds = np.delete(bonds, axis=0, obj=-1) #Delete dummy row
    bonds = np.delete(bonds, axis=1, obj=-1) #Delete dummy col
    bond_dists = np.delete(bond_dists, axis=0, obj=-1) #Delete dummy row
    bond_dists = np.delete(bond_dists, axis=1, obj=-1) #Delete dummy col
    bond_atoms = np.delete(bond_atoms, axis=0, obj=-1) #Delete dummy row
    bond_atoms = np.delete(bond_atoms, axis=1, obj=-1) #Delete dummy col
    
    mask = bonds == 1
    bond_lengths_mean = [np.mean(row[mask[j]]) for j,row in enumerate(tqdm(bond_dists))]
    n_bonds = np.sum(bonds, axis=1)
    bond_data = {'n_bonds':n_bonds, 'bond_lengths_mean': bond_lengths_mean }
    bond_df = pd.DataFrame(bond_data)
    s = s.join(bond_df)
    
    atom_types = ['C', 'N', 'H', 'O', 'F']
    bond_data = {}
    for at in atom_types:
        bonds_at = np.empty((len(s), max_atoms), dtype=np.int8) 
        bonds_at[:] = -1
        for i in tqdm(range(len(bond_atoms))):
            mask = bond_atoms[i,:] == at
            atom_j_indices = np.argwhere(mask)
            dists = bond_dists[i, mask]
            atom_j_sorted = np.argsort(dists)
            bonds_at[i, :len(atom_j_sorted)] = atom_j_sorted
            
        maxatom = np.max(np.sum(bonds_at >= 0, axis=1))
        for i in range(maxatom):
            bond_data['bond_%s_%d' % (at, i)] = bonds_at[:, i]
    bond_df = pd.DataFrame(bond_data)
    s = s.join(bond_df)
    return s

In [45]:
structures = pd.read_csv(PATH/'structures.csv')
structures = feature_neighbors(structures)


  0%|          | 0/27 [00:00<?, ?it/s][A
  4%|▎         | 1/27 [00:00<00:11,  2.17it/s][A
  7%|▋         | 2/27 [00:00<00:10,  2.27it/s][A
 11%|█         | 3/27 [00:01<00:09,  2.47it/s][A
 15%|█▍        | 4/27 [00:01<00:08,  2.64it/s][A
 19%|█▊        | 5/27 [00:01<00:07,  2.79it/s][A
 22%|██▏       | 6/27 [00:02<00:07,  2.90it/s][A
 26%|██▌       | 7/27 [00:02<00:06,  2.97it/s][A
 30%|██▉       | 8/27 [00:02<00:06,  3.04it/s][A
 33%|███▎      | 9/27 [00:03<00:05,  3.10it/s][A
 37%|███▋      | 10/27 [00:03<00:05,  3.16it/s][A
 41%|████      | 11/27 [00:03<00:04,  3.21it/s][A
 44%|████▍     | 12/27 [00:03<00:04,  3.26it/s][A
 48%|████▊     | 13/27 [00:04<00:04,  3.30it/s][A
 52%|█████▏    | 14/27 [00:04<00:03,  3.34it/s][A
 56%|█████▌    | 15/27 [00:04<00:03,  3.37it/s][A
 59%|█████▉    | 16/27 [00:05<00:03,  3.41it/s][A
 63%|██████▎   | 17/27 [00:05<00:02,  3.45it/s][A
 67%|██████▋   | 18/27 [00:05<00:02,  3.50it/s][A
 70%|███████   | 19/27 [00:05<00:02,  3.54it/s]

 61%|██████    | 1438351/2358657 [00:10<00:06, 136828.26it/s][A
 62%|██████▏   | 1452119/2358657 [00:10<00:06, 137080.47it/s][A
 62%|██████▏   | 1465855/2358657 [00:10<00:06, 137161.99it/s][A
 63%|██████▎   | 1479572/2358657 [00:10<00:06, 137098.25it/s][A
 63%|██████▎   | 1493491/2358657 [00:10<00:06, 137718.79it/s][A
 64%|██████▍   | 1507280/2358657 [00:11<00:06, 137768.97it/s][A
 64%|██████▍   | 1521058/2358657 [00:11<00:06, 137670.41it/s][A
 65%|██████▌   | 1534826/2358657 [00:11<00:06, 137058.50it/s][A
 66%|██████▌   | 1548557/2358657 [00:11<00:05, 137133.69it/s][A
 66%|██████▌   | 1562272/2358657 [00:11<00:05, 136819.53it/s][A
 67%|██████▋   | 1575955/2358657 [00:11<00:05, 136805.68it/s][A
 67%|██████▋   | 1589637/2358657 [00:11<00:05, 136753.78it/s][A
 68%|██████▊   | 1603375/2358657 [00:11<00:05, 136938.35it/s][A
 69%|██████▊   | 1617070/2358657 [00:11<00:05, 136493.41it/s][A
 69%|██████▉   | 1630720/2358657 [00:11<00:05, 136233.02it/s][A
 70%|██████▉   | 1644362/

 24%|██▍       | 563594/2358657 [00:06<00:19, 92422.83it/s][A
 24%|██▍       | 572838/2358657 [00:06<00:19, 92256.39it/s][A
 25%|██▍       | 582091/2358657 [00:06<00:19, 92337.21it/s][A
 25%|██▌       | 591326/2358657 [00:06<00:19, 92167.05it/s][A
 25%|██▌       | 600544/2358657 [00:06<00:19, 92035.90it/s][A
 26%|██▌       | 609748/2358657 [00:06<00:19, 90365.44it/s][A
 26%|██▌       | 619065/2358657 [00:06<00:19, 91188.37it/s][A
 27%|██▋       | 628216/2358657 [00:06<00:18, 91282.63it/s][A
 27%|██▋       | 637349/2358657 [00:06<00:18, 91103.85it/s][A
 27%|██▋       | 646640/2358657 [00:06<00:18, 91637.92it/s][A
 28%|██▊       | 655911/2358657 [00:07<00:18, 91956.56it/s][A
 28%|██▊       | 665163/2358657 [00:07<00:18, 92123.59it/s][A
 29%|██▊       | 674397/2358657 [00:07<00:18, 92185.53it/s][A
 29%|██▉       | 683617/2358657 [00:07<00:18, 92088.21it/s][A
 29%|██▉       | 692904/2358657 [00:07<00:18, 92320.41it/s][A
 30%|██▉       | 702294/2358657 [00:07<00:17, 92787.52i

 74%|███████▍  | 1749562/2358657 [00:18<00:06, 92371.73it/s][A
 75%|███████▍  | 1758800/2358657 [00:18<00:06, 92121.56it/s][A
 75%|███████▍  | 1768063/2358657 [00:19<00:06, 92272.81it/s][A
 75%|███████▌  | 1777291/2358657 [00:19<00:06, 92197.91it/s][A
 76%|███████▌  | 1786512/2358657 [00:19<00:06, 92137.05it/s][A
 76%|███████▌  | 1795726/2358657 [00:19<00:06, 91860.96it/s][A
 77%|███████▋  | 1804963/2358657 [00:19<00:06, 92011.93it/s][A
 77%|███████▋  | 1814165/2358657 [00:19<00:05, 91975.56it/s][A
 77%|███████▋  | 1823363/2358657 [00:19<00:05, 91894.30it/s][A
 78%|███████▊  | 1832610/2358657 [00:19<00:05, 92065.83it/s][A
 78%|███████▊  | 1841817/2358657 [00:19<00:05, 91910.26it/s][A
 78%|███████▊  | 1851027/2358657 [00:19<00:05, 91966.79it/s][A
 79%|███████▉  | 1860308/2358657 [00:20<00:05, 92218.04it/s][A
 79%|███████▉  | 1869531/2358657 [00:20<00:05, 92157.72it/s][A
 80%|███████▉  | 1878747/2358657 [00:20<00:05, 92117.91it/s][A
 80%|████████  | 1887959/2358657 [00:20<

 25%|██▌       | 598291/2358657 [00:06<00:18, 96640.49it/s][A
 26%|██▌       | 607956/2358657 [00:06<00:18, 96631.21it/s][A
 26%|██▌       | 617620/2358657 [00:06<00:18, 95332.87it/s][A
 27%|██▋       | 627226/2358657 [00:06<00:18, 95548.04it/s][A
 27%|██▋       | 636784/2358657 [00:06<00:18, 95415.87it/s][A
 27%|██▋       | 646444/2358657 [00:06<00:17, 95765.77it/s][A
 28%|██▊       | 656029/2358657 [00:06<00:17, 95788.59it/s][A
 28%|██▊       | 665703/2358657 [00:06<00:17, 96070.29it/s][A
 29%|██▊       | 675312/2358657 [00:07<00:17, 95819.33it/s][A
 29%|██▉       | 684895/2358657 [00:07<00:17, 95338.84it/s][A
 29%|██▉       | 694483/2358657 [00:07<00:17, 95493.91it/s][A
 30%|██▉       | 704158/2358657 [00:07<00:17, 95866.85it/s][A
 30%|███       | 713895/2358657 [00:07<00:17, 96311.97it/s][A
 31%|███       | 723561/2358657 [00:07<00:16, 96413.12it/s][A
 31%|███       | 733276/2358657 [00:07<00:16, 96631.11it/s][A
 31%|███▏      | 742940/2358657 [00:07<00:16, 96630.76i

 78%|███████▊  | 1833129/2358657 [00:19<00:05, 93869.64it/s][A
 78%|███████▊  | 1842733/2358657 [00:19<00:05, 94508.01it/s][A
 79%|███████▊  | 1852199/2358657 [00:19<00:05, 94543.99it/s][A
 79%|███████▉  | 1861891/2358657 [00:19<00:05, 95242.09it/s][A
 79%|███████▉  | 1871538/2358657 [00:19<00:05, 95605.43it/s][A
 80%|███████▉  | 1881105/2358657 [00:19<00:05, 94644.85it/s][A
 80%|████████  | 1890668/2358657 [00:19<00:04, 94937.28it/s][A
 81%|████████  | 1900327/2358657 [00:19<00:04, 95427.08it/s][A
 81%|████████  | 1909887/2358657 [00:19<00:04, 95478.78it/s][A
 81%|████████▏ | 1919438/2358657 [00:20<00:04, 94026.03it/s][A
 82%|████████▏ | 1929018/2358657 [00:20<00:04, 94548.64it/s][A
 82%|████████▏ | 1938533/2358657 [00:20<00:04, 94724.63it/s][A
 83%|████████▎ | 1948010/2358657 [00:20<00:04, 94724.38it/s][A
 83%|████████▎ | 1957498/2358657 [00:20<00:04, 94770.91it/s][A
 83%|████████▎ | 1966978/2358657 [00:20<00:04, 93859.26it/s][A
 84%|████████▍ | 1976368/2358657 [00:20<

 28%|██▊       | 664961/2358657 [00:07<00:18, 91351.91it/s][A
 29%|██▊       | 674140/2358657 [00:07<00:18, 91481.10it/s][A
 29%|██▉       | 683337/2358657 [00:07<00:18, 91626.53it/s][A
 29%|██▉       | 692615/2358657 [00:07<00:18, 91966.97it/s][A
 30%|██▉       | 701813/2358657 [00:07<00:18, 91913.84it/s][A
 30%|███       | 711118/2358657 [00:07<00:17, 92247.61it/s][A
 31%|███       | 720344/2358657 [00:07<00:17, 92133.80it/s][A
 31%|███       | 729559/2358657 [00:07<00:17, 92134.80it/s][A
 31%|███▏      | 738834/2358657 [00:08<00:17, 92316.95it/s][A
 32%|███▏      | 748066/2358657 [00:08<00:17, 92252.57it/s][A
 32%|███▏      | 757292/2358657 [00:08<00:17, 92184.21it/s][A
 33%|███▎      | 766571/2358657 [00:08<00:17, 92363.50it/s][A
 33%|███▎      | 775808/2358657 [00:08<00:17, 91959.93it/s][A
 33%|███▎      | 785005/2358657 [00:08<00:17, 91352.81it/s][A
 34%|███▎      | 794142/2358657 [00:08<00:17, 91240.47it/s][A
 34%|███▍      | 803267/2358657 [00:08<00:17, 91172.93i

 78%|███████▊  | 1849057/2358657 [00:20<00:05, 92769.19it/s][A
 79%|███████▉  | 1858416/2358657 [00:20<00:05, 93013.75it/s][A
 79%|███████▉  | 1867718/2358657 [00:20<00:05, 92923.34it/s][A
 80%|███████▉  | 1877011/2358657 [00:20<00:05, 92862.27it/s][A
 80%|███████▉  | 1886316/2358657 [00:20<00:05, 92910.44it/s][A
 80%|████████  | 1895608/2358657 [00:20<00:04, 92863.32it/s][A
 81%|████████  | 1904895/2358657 [00:20<00:04, 92724.89it/s][A
 81%|████████  | 1914220/2358657 [00:20<00:04, 92880.97it/s][A
 82%|████████▏ | 1923509/2358657 [00:20<00:04, 92425.87it/s][A
 82%|████████▏ | 1932804/2358657 [00:20<00:04, 92581.46it/s][A
 82%|████████▏ | 1942063/2358657 [00:21<00:04, 92425.46it/s][A
 83%|████████▎ | 1951392/2358657 [00:21<00:04, 92677.50it/s][A
 83%|████████▎ | 1960661/2358657 [00:21<00:04, 92666.30it/s][A
 84%|████████▎ | 1970110/2358657 [00:21<00:04, 93203.63it/s][A
 84%|████████▍ | 1979432/2358657 [00:21<00:04, 92789.24it/s][A
 84%|████████▍ | 1988806/2358657 [00:21<

 29%|██▉       | 683054/2358657 [00:07<00:17, 94593.44it/s][A
 29%|██▉       | 692654/2358657 [00:07<00:17, 95008.33it/s][A
 30%|██▉       | 702161/2358657 [00:07<00:17, 95024.47it/s][A
 30%|███       | 711665/2358657 [00:07<00:17, 94735.99it/s][A
 31%|███       | 721140/2358657 [00:07<00:17, 94597.33it/s][A
 31%|███       | 730601/2358657 [00:07<00:17, 94508.36it/s][A
 31%|███▏      | 740053/2358657 [00:07<00:17, 94250.87it/s][A
 32%|███▏      | 749479/2358657 [00:07<00:17, 94136.56it/s][A
 32%|███▏      | 759017/2358657 [00:08<00:16, 94505.60it/s][A
 33%|███▎      | 768472/2358657 [00:08<00:16, 94517.86it/s][A
 33%|███▎      | 777925/2358657 [00:08<00:16, 94177.21it/s][A
 33%|███▎      | 787371/2358657 [00:08<00:16, 94261.56it/s][A
 34%|███▍      | 796806/2358657 [00:08<00:16, 94287.63it/s][A
 34%|███▍      | 806329/2358657 [00:08<00:16, 94565.85it/s][A
 35%|███▍      | 815786/2358657 [00:08<00:16, 94458.69it/s][A
 35%|███▍      | 825233/2358657 [00:08<00:16, 94208.30i

 80%|████████  | 1889865/2358657 [00:20<00:05, 93146.79it/s][A
 81%|████████  | 1899226/2358657 [00:20<00:04, 93283.57it/s][A
 81%|████████  | 1908555/2358657 [00:20<00:04, 93244.04it/s][A
 81%|████████▏ | 1917933/2358657 [00:20<00:04, 93401.48it/s][A
 82%|████████▏ | 1927274/2358657 [00:20<00:04, 93331.11it/s][A
 82%|████████▏ | 1936608/2358657 [00:20<00:04, 93072.86it/s][A
 83%|████████▎ | 1945916/2358657 [00:20<00:04, 92716.35it/s][A
 83%|████████▎ | 1955212/2358657 [00:20<00:04, 92787.94it/s][A
 83%|████████▎ | 1964492/2358657 [00:20<00:04, 92781.37it/s][A
 84%|████████▎ | 1973771/2358657 [00:20<00:04, 92768.59it/s][A
 84%|████████▍ | 1983134/2358657 [00:21<00:04, 93022.86it/s][A
 84%|████████▍ | 1992437/2358657 [00:21<00:03, 92922.48it/s][A
 85%|████████▍ | 2001765/2358657 [00:21<00:03, 93028.55it/s][A
 85%|████████▌ | 2011164/2358657 [00:21<00:03, 93312.88it/s][A
 86%|████████▌ | 2020496/2358657 [00:21<00:03, 93127.68it/s][A
 86%|████████▌ | 2029810/2358657 [00:21<

 25%|██▍       | 585704/2358657 [00:07<00:23, 75738.93it/s][A
 25%|██▌       | 593279/2358657 [00:07<00:23, 75606.55it/s][A
 25%|██▌       | 600880/2358657 [00:07<00:23, 75726.72it/s][A
 26%|██▌       | 608453/2358657 [00:08<00:23, 75725.94it/s][A
 26%|██▌       | 616048/2358657 [00:08<00:22, 75791.80it/s][A
 26%|██▋       | 623628/2358657 [00:08<00:22, 75559.74it/s][A
 27%|██▋       | 631185/2358657 [00:08<00:22, 75558.46it/s][A
 27%|██▋       | 638742/2358657 [00:08<00:22, 75518.69it/s][A
 27%|██▋       | 646306/2358657 [00:08<00:22, 75554.95it/s][A
 28%|██▊       | 653862/2358657 [00:08<00:22, 75373.25it/s][A
 28%|██▊       | 661400/2358657 [00:08<00:22, 75226.32it/s][A
 28%|██▊       | 668923/2358657 [00:08<00:22, 75025.20it/s][A
 29%|██▊       | 676427/2358657 [00:08<00:22, 75026.76it/s][A
 29%|██▉       | 684001/2358657 [00:09<00:22, 75237.15it/s][A
 29%|██▉       | 691566/2358657 [00:09<00:22, 75358.35it/s][A
 30%|██▉       | 699103/2358657 [00:09<00:22, 75202.94i

 67%|██████▋   | 1571063/2358657 [00:20<00:10, 76898.29it/s][A
 67%|██████▋   | 1578806/2358657 [00:20<00:10, 77056.46it/s][A
 67%|██████▋   | 1586548/2358657 [00:20<00:10, 77163.42it/s][A
 68%|██████▊   | 1594265/2358657 [00:20<00:09, 76974.77it/s][A
 68%|██████▊   | 1601963/2358657 [00:21<00:09, 76857.79it/s][A
 68%|██████▊   | 1609690/2358657 [00:21<00:09, 76980.74it/s][A
 69%|██████▊   | 1617389/2358657 [00:21<00:09, 76938.58it/s][A
 69%|██████▉   | 1625083/2358657 [00:21<00:09, 76644.59it/s][A
 69%|██████▉   | 1632748/2358657 [00:21<00:09, 76640.70it/s][A
 70%|██████▉   | 1640464/2358657 [00:21<00:09, 76793.21it/s][A
 70%|██████▉   | 1648144/2358657 [00:21<00:09, 76605.86it/s][A
 70%|███████   | 1655833/2358657 [00:21<00:09, 76690.85it/s][A
 71%|███████   | 1663536/2358657 [00:21<00:09, 76791.10it/s][A
 71%|███████   | 1671216/2358657 [00:21<00:08, 76759.97it/s][A
 71%|███████   | 1678893/2358657 [00:22<00:08, 76268.70it/s][A
 72%|███████▏  | 1686521/2358657 [00:22<

In [36]:
def feature_bonds(df1, df2):
    structures = pd.read_csv(PATH/'structures.csv')
    structures = feature_neighbors(structures)
    df1 = pd.merge(df1, structures.rename(columns={'atom_index':'atom_index_0', 'x':'x0', 'y':'y0', 'z':'z0', 'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
    df1 = pd.merge(df1, structures.rename(columns={'atom_index':'atom_index_1', 'x':'x1', 'y':'y1', 'z':'z1', 'atom':'atom2'}), how='left', on=['molecule_name', 'atom_index_1', 'atom2'])
    df2 = pd.merge(df2, structures.rename(columns={'atom_index':'atom_index_0', 'x':'x0', 'y':'y0', 'z':'z0', 'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
    df2 = pd.merge(df2, structures.rename(columns={'atom_index':'atom_index_1', 'x':'x1', 'y':'y1', 'z':'z1', 'atom':'atom2'}), how='left', on=['molecule_name', 'atom_index_1', 'atom2'])
    df1 = reduce_mem_usage(df1)
    df2 = reduce_mem_usage(df2)
    gc.collect()
    return df1, df2

array([[-1, -1, -1, ..., -1, -1, -1],
       [ 0, -1, -1, ..., -1, -1, -1],
       [ 0, -1, -1, ..., -1, -1, -1],
       ...,
       [ 5,  6,  4, ..., -1, -1, -1],
       [ 6,  2,  3, ..., -1, -1, -1],
       [-1, -1, -1, ..., -1, -1, -1]], dtype=int8)

In [None]:
train, test = feature_bonds(train, test)

In [52]:
excluded = ['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'atom1', 'atom2'] + ['x0', 'y0', 'z0', 'x1', 'y1', 'z1']
col = [c for c in train.columns if c not in ['scalar_coupling_constant'] + excluded]
reg = ensemble.ExtraTreesRegressor(n_jobs=-1, n_estimators=20, random_state=4, verbose=1)

In [53]:
col

['type',
 'type0',
 'type1',
 'type2',
 'type3',
 'dist',
 'min_dist',
 'max_dist',
 'mean_dist',
 'x_x',
 'y_x',
 'z_x',
 'n_bonds_x',
 'bond_lengths_mean_x',
 'bond_C_0_x',
 'bond_C_1_x',
 'bond_C_2_x',
 'bond_C_3_x',
 'bond_C_4_x',
 'bond_C_5_x',
 'bond_C_6_x',
 'bond_C_7_x',
 'bond_C_8_x',
 'bond_N_0_x',
 'bond_N_1_x',
 'bond_N_2_x',
 'bond_N_3_x',
 'bond_N_4_x',
 'bond_N_5_x',
 'bond_N_6_x',
 'bond_H_0_x',
 'bond_H_1_x',
 'bond_H_2_x',
 'bond_H_3_x',
 'bond_H_4_x',
 'bond_H_5_x',
 'bond_H_6_x',
 'bond_H_7_x',
 'bond_H_8_x',
 'bond_H_9_x',
 'bond_H_10_x',
 'bond_H_11_x',
 'bond_H_12_x',
 'bond_H_13_x',
 'bond_H_14_x',
 'bond_H_15_x',
 'bond_H_16_x',
 'bond_H_17_x',
 'bond_H_18_x',
 'bond_O_0_x',
 'bond_O_1_x',
 'bond_O_2_x',
 'bond_O_3_x',
 'bond_O_4_x',
 'bond_F_0_x',
 'bond_F_1_x',
 'bond_F_2_x',
 'bond_F_3_x',
 'bond_F_4_x',
 'bond_F_5_x',
 'x_y',
 'y_y',
 'z_y',
 'n_bonds_y',
 'bond_lengths_mean_y',
 'bond_C_0_y',
 'bond_C_1_y',
 'bond_C_2_y',
 'bond_C_3_y',
 'bond_C_4_y',
 'bo

In [54]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[col], train['scalar_coupling_constant'], test_size=0.2)

In [55]:
reg.fit(X_train.drop(['type'], axis=1), y_train)

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:   38.9s finished


ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
          oob_score=False, random_state=4, verbose=1, warm_start=False)

In [56]:
# https://www.kaggle.com/uberkinder/efficient-metric
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

In [57]:
y_pred = reg.predict(X_test.drop('type', axis=1))

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.3s finished


In [58]:
group_mean_log_mae(y_test, y_pred, X_test.type)

0.23199779434054785

In [59]:
for t in train.type.unique():
    idx = X_test.type == t
    print(t, (y_test[idx] - y_pred[idx]).abs().mean())

1JHC 3.13440698398015
1JHN 1.8348052993434065
2JHC 1.61181149884822
3JHH 0.9873188273634053
2JHH 0.7384456583758383
3JHC 1.5267124508881924
2JHN 1.047342654303434
3JHN 0.592053938633429


In [37]:
# train with all data
reg.fit(train[col].drop('type', axis=1), train['scalar_coupling_constant'])

  return umr_sum(a, axis, dtype, out, keepdims, initial)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.8s finished


ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
          oob_score=False, random_state=4, verbose=1, warm_start=False)

In [41]:
test['scalar_coupling_constant']  = reg.predict(test[col].drop('type', axis=1))
test[['id', 'scalar_coupling_constant']].to_csv('submission.csv', index=False) #float_format='%.9f'

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.1s finished


In [46]:
from IPython.display import FileLink
FileLink('submission.csv')

In [None]:
# score: ~0.7