# About


This notebook will cover a basic model with a basic EDA about the competition.

## Imports

In [1]:
# Imports
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import tree
from sklearn import metrics

# Visualization
# Ploty Imports
from itertools import combinations
import plotly.graph_objs as go
from plotly.offline import init_notebook_mode, iplot

# Figures inline and set visualization style
%matplotlib inline
sns.set() #Different type of visualization

# import the necessary modelling algorithms

# Regression
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV
from sklearn.ensemble import RandomForestRegressor,BaggingRegressor,GradientBoostingRegressor,AdaBoostRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
import xgboost as xgb
from xgboost.sklearn import XGBRegressor

# Model selection
from sklearn.model_selection import train_test_split,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Preprocessing
from sklearn.preprocessing import MinMaxScaler,StandardScaler,Imputer,LabelEncoder,PolynomialFeatures

# Evaluation metrics
from sklearn.metrics import mean_squared_log_error,mean_squared_error, r2_score,mean_absolute_error # for regression
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score  # for classification

# Show multiple statements at once
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


numpy.core.umath_tests is an internal NumPy module and should not be imported. It will be removed in a future NumPy release.



## Load Data

In [2]:
# Train Data
train = pd.read_csv('input/train.csv')

# Test Data
test = pd.read_csv('input/test.csv')

sub = pd.read_csv('input/sample_submission.csv')
structures = pd.read_csv('input/structures.csv')

## EDA

In [3]:
# Lets take a look at the main csv files
train.head()
sub.head()
structures.head()

Unnamed: 0,id,molecule_name,atom_index_0,atom_index_1,type,scalar_coupling_constant
0,0,dsgdb9nsd_000001,1,0,1JHC,84.8076
1,1,dsgdb9nsd_000001,1,2,2JHH,-11.257
2,2,dsgdb9nsd_000001,1,3,2JHH,-11.2548
3,3,dsgdb9nsd_000001,1,4,2JHH,-11.2543
4,4,dsgdb9nsd_000001,2,0,1JHC,84.8074


Unnamed: 0,id,scalar_coupling_constant
0,4658147,0
1,4658148,0
2,4658149,0
3,4658150,0
4,4658151,0


Unnamed: 0,molecule_name,atom_index,atom,x,y,z
0,dsgdb9nsd_000001,0,C,-0.012698,1.085804,0.008001
1,dsgdb9nsd_000001,1,H,0.00215,-0.006031,0.001976
2,dsgdb9nsd_000001,2,H,1.011731,1.463751,0.000277
3,dsgdb9nsd_000001,3,H,-0.540815,1.447527,-0.876644
4,dsgdb9nsd_000001,4,H,-0.523814,1.437933,0.906397


In [4]:
print(f'{train.shape[0]} rows in the train data.')
print(f'{test.shape[0]} rows in the test data.')
print(f"{train['molecule_name'].nunique()} different molecules in the train data.")
print(f"There are {test['molecule_name'].nunique()} different molecules in the test data.")
print(f"There are {structures['atom'].nunique()} unique atoms.")
print(f"There are {train['type'].nunique()} unique types.")

4658147 rows in the train data.
2505542 rows in the test data.
85003 different molecules in the train data.
There are 45772 different molecules in the test data.
There are 5 unique atoms.
There are 8 unique types.


In [5]:
# We take a first look at the dataset
train.info()
print ('#################################################')
print ('#################################################')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4658147 entries, 0 to 4658146
Data columns (total 6 columns):
id                          int64
molecule_name               object
atom_index_0                int64
atom_index_1                int64
type                        object
scalar_coupling_constant    float64
dtypes: float64(1), int64(3), object(2)
memory usage: 213.2+ MB
#################################################
#################################################
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2505542 entries, 0 to 2505541
Data columns (total 5 columns):
id               int64
molecule_name    object
atom_index_0     int64
atom_index_1     int64
type             object
dtypes: int64(3), object(2)
memory usage: 95.6+ MB


In [6]:
sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2505542 entries, 0 to 2505541
Data columns (total 2 columns):
id                          int64
scalar_coupling_constant    int64
dtypes: int64(2)
memory usage: 38.2 MB


In [7]:
sub['scalar_coupling_constant'].unique()

array([0])

Lets take a look at a nicer visualization.

In [8]:
# Plotly notebook mode
init_notebook_mode(connected=True)

In [9]:
'''
This function will plot a mollecule
'''
def plot_molecule(molecule_name, structures_df):
    """Creates a 3D plot of the molecule"""
    
    atomic_radii = dict(C=0.68, F=0.64, H=0.23, N=0.68, O=0.68)  
    cpk_colors = dict(C='black', F='green', H='white', N='blue', O='red')

    molecule = structures[structures.molecule_name == molecule_name]
    coordinates = molecule[['x', 'y', 'z']].values
    x_coordinates = coordinates[:, 0]
    y_coordinates = coordinates[:, 1]
    z_coordinates = coordinates[:, 2]
    elements = molecule.atom.tolist()
    radii = [atomic_radii[element] for element in elements]
    
    def get_bonds():
        """Generates a set of bonds from atomic cartesian coordinates"""
        ids = np.arange(coordinates.shape[0])
        bonds = set()
        coordinates_compare, radii_compare, ids_compare = coordinates, radii, ids
        
        for i in range(len(ids)):
            coordinates_compare = np.roll(coordinates_compare, -1, axis=0)
            radii_compare = np.roll(radii_compare, -1, axis=0)
            ids_compare = np.roll(ids_compare, -1, axis=0)
            distances = np.linalg.norm(coordinates - coordinates_compare, axis=1)
            bond_distances = (radii + radii_compare) * 1.3
            mask = np.logical_and(distances > 0.1, distances <  bond_distances)
            bonds.update(map(frozenset, zip(ids[mask], ids_compare[mask])))
        
        return bonds            
            
    def atom_trace():
        """Creates an atom trace for the plot"""
        colors = [cpk_colors[element] for element in elements]
        markers = dict(color=colors, line=dict(color='lightgray', width=2), size=7, symbol='circle', opacity=0.8)
        trace = go.Scatter3d(x=x_coordinates, y=y_coordinates, z=z_coordinates, mode='markers', marker=markers,
                             text=elements)
        return trace

    def bond_trace():
        """"Creates a bond trace for the plot"""
        trace = go.Scatter3d(x=[], y=[], z=[], hoverinfo='none', mode='lines',
                             marker=dict(color='grey', size=7, opacity=1))
        for i, j in bonds:
            trace['x'] += (x_coordinates[i], x_coordinates[j], None)
            trace['y'] += (y_coordinates[i], y_coordinates[j], None)
            trace['z'] += (z_coordinates[i], z_coordinates[j], None)
        return trace
    
    bonds = get_bonds()
    atoms = zip(range(len(elements)), x_coordinates, y_coordinates, z_coordinates)
    annotations = [dict(text=num, x=x, y=y, z=z, showarrow=False, yshift=15)
                   for num, x, y, z in atoms]
    data = [atom_trace(), bond_trace()]
    axis_params = dict(showgrid=False, showticklabels=False, zeroline=False, titlefont=dict(color='white'))
    layout = go.Layout(scene=dict(xaxis=axis_params, yaxis=axis_params, zaxis=axis_params, annotations=annotations),
                       margin=dict(r=0, l=0, b=0, t=0), showlegend=False)
    fig = go.Figure(data=data, layout=layout)
    iplot(fig)

'\nThis function will plot a mollecule\n'

In [10]:
# Plots

plot_molecule('dsgdb9nsd_133885', structures)

plot_molecule('dsgdb9nsd_105227', structures)

plot_molecule('dsgdb9nsd_099964', structures)

### Duplicate Check

In [11]:
# We check if we have duplicates
train.duplicated().any()
test.duplicated().any()

False

False

### Label Encoding

Before modeling it would be a good idea to do a Label Encoding to variables which are still as objects in order to improve the model's performance.

In [12]:
categoricals = train.select_dtypes(include='object').columns
categoricals = test.select_dtypes(include='object').columns

In [14]:
for c in categoricals:
    lbl = LabelEncoder()
    lbl.fit(list(train[c].values))
    train[c] = lbl.transform(list(train[c].values))

LabelEncoder()

LabelEncoder()

In [15]:
for c in categoricals:
    lbl = LabelEncoder()
    lbl.fit(list(test[c].values))
    test[c] = lbl.transform(list(test[c].values))

LabelEncoder()

LabelEncoder()

In [16]:
# Check it has been done properly
train.dtypes
test.dtypes

id                            int64
molecule_name                 int64
atom_index_0                  int64
atom_index_1                  int64
type                          int64
scalar_coupling_constant    float64
dtype: object

id               int64
molecule_name    int64
atom_index_0     int64
atom_index_1     int64
type             int64
dtype: object

## Metric

In [None]:
def metric(df, preds):
    
    df["prediction"] = preds
    maes = []
    
    for t in df.type.unique():
        
        y_true = df[df.type==t].scalar_coupling_constant.values
        y_pred = df[df.type==t].prediction.values
        mae = np.log(metrics.mean_absolute_error(y_true, y_pred))
        maes.append(mae)
    
    return np.mean(maes)

# Modeling

In [17]:
X = train.drop(['id', 'molecule_name', 'scalar_coupling_constant'], axis=1)
X_test = test.drop(['id', 'molecule_name'], axis=1)

### Setting K-Folds

In [18]:
# Setting a 5-fold stratified cross-validation (note: shuffle=True)
skf = KFold(n_splits=5, shuffle=True, random_state=8)

In [19]:
params = {'booster' : 'gbtree',
          #'nthread' : 5,
          'objective' : 'reg:linear',
          'eval_metric' : 'mae', 
          'max_depth' : 8,
          'eta' : 0.3,
          'subsample' : 0.7, 
          'colsample_bytree' : 0.7 
        }

In [20]:
# We define de label
y = train['scalar_coupling_constant']

In [21]:
dtrain = xgb.DMatrix(train, label = y)

In [None]:
res = xgb.cv(params,
             dtrain,
             num_boost_round = 4000,
             folds=skf,
             seed=2019,
             early_stopping_rounds = 10,
             verbose_eval=True)

[14:07:13] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 308 extra nodes, 0 pruned nodes, max_depth=8
[14:07:20] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=8
[14:07:27] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=8
[14:07:34] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=8
[14:07:41] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=8
[0]	train-mae:12.7356+0.00539494	test-mae:12.7356+0.0215321
[14:07:47] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 310 extra nodes, 0 pruned nodes, max_depth=8
[14:07:52] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 310 extra nodes, 0 pruned nodes, max_depth=8
[14:07:57] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 306 extra nodes, 0 pruned nodes, max_depth=8
[14:08:02] src/tree/updater_prune.cc

[14:13:02] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 468 extra nodes, 0 pruned nodes, max_depth=8
[14:13:09] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 456 extra nodes, 0 pruned nodes, max_depth=8
[14:13:18] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 456 extra nodes, 0 pruned nodes, max_depth=8
[14:13:29] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 502 extra nodes, 0 pruned nodes, max_depth=8
[13]	train-mae:0.260043+0.0816703	test-mae:0.260723+0.0820469
[14:13:39] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 450 extra nodes, 0 pruned nodes, max_depth=8
[14:13:48] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 418 extra nodes, 0 pruned nodes, max_depth=8
[14:13:56] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 496 extra nodes, 0 pruned nodes, max_depth=8
[14:14:05] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 486 extra nodes, 0 pruned nodes, max_depth=8
[14:14:15] src/tree/updater_prune.

[14:19:55] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 490 extra nodes, 0 pruned nodes, max_depth=8
[14:19:59] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 508 extra nodes, 0 pruned nodes, max_depth=8
[14:20:03] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 456 extra nodes, 0 pruned nodes, max_depth=8
[26]	train-mae:0.0787372+0.0358865	test-mae:0.0793968+0.0362763
[14:20:08] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 486 extra nodes, 0 pruned nodes, max_depth=8
[14:20:12] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 484 extra nodes, 0 pruned nodes, max_depth=8
[14:20:17] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 478 extra nodes, 0 pruned nodes, max_depth=8
[14:20:21] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 504 extra nodes, 0 pruned nodes, max_depth=8
[14:20:26] src/tree/updater_prune.cc:74: tree pruning end, 1 roots, 462 extra nodes, 0 pruned nodes, max_depth=8
[27]	train-mae:0.077136+0.034751

In [None]:
best_round=[i for i, e in enumerate(res['test-mae-mean']) if e == min(res['test-mae-mean'])][0]

In [None]:
best_round

In [None]:
res.iloc[best_round]

In [None]:
model = xgb.train(params = params, dtrain = dtrain, num_boost_round = best_round)

In [None]:
test.info()

In [None]:
test.drop(columns=['id']);

In [None]:
dtest = xgb.DMatrix(test)

In [None]:
train.info()

In [None]:
train = train.drop(['id'], axis=1);

In [None]:
train.info()

In [None]:
ypred = model.predict(dtest)

# Predictions