In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
from tqdm import tqdm
import numpy as np
import pandas as pd
from sklearn import *
import gc

In [2]:
! ls ../data/*.csv

../data/1_average_coupling.csv
../data/dipole_moments.csv
../data/magnetic_shielding_tensors.csv
../data/mulliken_charges.csv
../data/potential_energy.csv
../data/sample_submission.csv
../data/scalar_coupling_contributions.csv
../data/structures.csv
../data/test_4_more_features.csv
../data/test_7_coupling_constant.csv
../data/test.csv
../data/test_tmp.csv
../data/train_4_more_features.csv
../data/train_7_coupling_constant.csv
../data/train.csv
../data/train_tmp.csv


# Data loading

In [3]:
from pathlib import Path
PATH = Path('../data')

In [4]:
train = pd.read_csv(PATH/'train.csv')[::10]
test = pd.read_csv(PATH/'test.csv')[::10]

In [9]:
def feature_atomtype(df, s):
    # https://www.kaggle.com/jazivxt/all-this-over-a-dog
    df['atom1'] = df['type'].map(lambda x: str(x)[2])
    df['atom2'] = df['type'].map(lambda x: str(x)[3])
    lbl = preprocessing.LabelEncoder()
    for i in range(4):
        df['type'+str(i)] = lbl.fit_transform(df['type'].map(lambda x: str(x)[i]))

    df = pd.merge(df, s.rename(columns={'atom_index':'atom_index_0', 'x':'x0', 'y':'y0', 'z':'z0', 'atom':'atom1'}), how='left', on=['molecule_name', 'atom_index_0', 'atom1'])
    df = pd.merge(df, s.rename(columns={'atom_index':'atom_index_1', 'x':'x1', 'y':'y1', 'z':'z1', 'atom':'atom2'}), how='left', on=['molecule_name', 'atom_index_1', 'atom2'])
    return df

def feature_pair_geometry(df):
    p0 = df[['x0', 'y0', 'z0']].values
    p1 = df[['x1', 'y1', 'z1']].values
    r = np.linalg.norm(p0 - p1, axis=1)
    df['dist'] = r

    for agg in ['min', 'max', 'mean']:
        tmp = eval('df.groupby(["type"], as_index=False).dist.' + agg + '()')
        tmp.rename(columns={"dist":agg + "_dist"}, inplace=True)
        df = pd.merge(df, tmp, how='left', on=['type'])
    return df

In [12]:
def reduce_mem_usage(df, verbose=True):
    # somewhere from kaggle kernel
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem)
 / start_mem))
    
    return df

In [16]:
def feature_basic(df):
    structures = pd.read_csv(PATH/'structures.csv')
    df = feature_atomtype(df, structures)
    df = feature_pair_geometry(df)
    df = reduce_mem_usage(df)
    gc.collect()
    return df

In [17]:
train = feature_basic(train)
test = feature_basic(test)

Mem. usage decreased to 31.98 Mb (60.9% reduction)
Mem. usage decreased to 16.73 Mb (60.2% reduction)


In [27]:
excluded = ['id', 'molecule_name', 'atom_index_0', 'atom_index_1', 'atom1', 'atom2'] + ['x0', 'y0', 'z0', 'x1', 'y1', 'z1']
col = [c for c in train.columns if c not in ['scalar_coupling_constant'] + excluded]
reg = ensemble.ExtraTreesRegressor(n_jobs=-1, n_estimators=20, random_state=4, verbose=1)

In [28]:
col

['type',
 'type0',
 'type1',
 'type2',
 'type3',
 'dist',
 'min_dist',
 'max_dist',
 'mean_dist']

In [29]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(train[col], train['scalar_coupling_constant'], test_size=0.2)

In [31]:
reg.fit(X_train.drop(['type'], axis=1), y_train)

  return umr_sum(a, axis, dtype, out, keepdims, initial)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.7s finished


ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
          oob_score=False, random_state=4, verbose=1, warm_start=False)

In [32]:
# https://www.kaggle.com/uberkinder/efficient-metric
def group_mean_log_mae(y_true, y_pred, types, floor=1e-9):
    maes = (y_true-y_pred).abs().groupby(types).mean()
    return np.log(maes.map(lambda x: max(x, floor))).mean()

In [33]:
y_pred = reg.predict(X_test.drop('type', axis=1))

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.0s finished


In [34]:
group_mean_log_mae(y_test, y_pred, X_test.type)

0.7587598234024385

In [35]:
for t in train.type.unique():
    idx = X_test.type == t
    print(t, (y_test[idx] - y_pred[idx]).abs().mean())

1JHC 5.268634709340875
1JHN 2.921645829396459
2JHC 2.502172447427435
3JHH 1.8745136356193908
2JHH 1.3325881808233813
3JHC 2.093548698203321
2JHN 2.504358464326092
3JHN 0.8578169430747304


In [37]:
# train with all data
reg.fit(train[col].drop('type', axis=1), train['scalar_coupling_constant'])

  return umr_sum(a, axis, dtype, out, keepdims, initial)
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.8s finished


ExtraTreesRegressor(bootstrap=False, criterion='mse', max_depth=None,
          max_features='auto', max_leaf_nodes=None,
          min_impurity_decrease=0.0, min_impurity_split=None,
          min_samples_leaf=1, min_samples_split=2,
          min_weight_fraction_leaf=0.0, n_estimators=20, n_jobs=-1,
          oob_score=False, random_state=4, verbose=1, warm_start=False)

In [41]:
test['scalar_coupling_constant']  = reg.predict(test[col].drop('type', axis=1))
test[['id', 'scalar_coupling_constant']].to_csv('submission.csv', index=False) #float_format='%.9f'

[Parallel(n_jobs=6)]: Using backend ThreadingBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  20 out of  20 | elapsed:    0.1s finished


In [46]:
from IPython.display import FileLink
FileLink('submission.csv')

In [None]:
# score: ~0.7