# Machine Learning project CS-433: NMR spectroscopy supervised learning



___

## Schedules:

* Week 10 (18-24 November): 
 * Tests of various linear models/simple NN on a 10% subset of data
* Week 11 (25-1 December):
 * Feature selection: being able to come up with a good set of features
* Week 12 (2-8 December):
 * Start of big scale analysis with Spark, implementation of the models which perform well at small scale
* Week 13 (9-15 December):
 * Wrapping up
* Week 14 (16-22 December): 
 * 19th December: Deadline

___

## Table of contents

1. [Log Book](#log)
2. [Pipeline](#pipeline)
3. [Data Processing](#data_proc) <br>
&emsp;3.1 [Data Visualization](#data_viz) <br>
&emsp;3.2 [Outliers detection](#outliers) <br>
  &emsp;&emsp;3.2.1 [DBSCAN](#dbscan) <br>
  &emsp;&emsp;3.2.2 [Interquartile range method](#iqr) <br>
&emsp;3.3 [Scaling](#scaling) <br>
&emsp;&emsp;3.3.1 [Min max scaling](#minmax) <br>
&emsp;3.4 [Dimensionality reduction](#dim_red) <br>
  &emsp;&emsp;3.4.1 [PCA](#pca) <br>
&emsp;3.5 [Feature Selection](#feat_sel) <br>
  &emsp;&emsp;3.5.1 [Relative importance from linear regression](#rel_imp_lin) <br>
  &emsp;&emsp;3.5.2 [Random forest](#rand_for) <br>
  &emsp;&emsp;3.5.3 [Univariate feature selection](#un_feat_sel) <br>
  &emsp;&emsp;3.5.4 [Recursive feature selection](#rec_feat_sel) <br>
  &emsp;&emsp;3.5.5 [Lasso Regression](#lasso) <br>
  &emsp;&emsp;3.5.6 [Boruta](#boruta) <br>
&emsp;3.6 [Models](#models) <br>
  &emsp;&emsp;3.6.1 [Linear Models](#lin_mods) <br>
  &emsp;&emsp;3.6.2 [Neural Networks](#NN) <br>
4. [Main](#main) <br>
&emsp;4.1 [ANN implementation](#ann_imp) <br>
    

In [3]:
import os
import re
import pickle
import scipy.stats
import sklearn.metrics
import datetime
import json

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import norm
from itertools import combinations

#from boruta import BorutaPy
from IPython.core.debugger import set_trace

from sklearn.feature_selection import RFE
from sklearn.preprocessing import Normalizer

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.utils import resample
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
from sklearn.decomposition import KernelPCA

In [4]:
# For neural net part
import torch 
import torch.nn as nn
import torch.nn.functional as F
from torch import optim

#from keras.callbacks import ModelCheckpoint
#from keras.models import Sequential
#from keras.layers import Dense, Activation, Flatten

In [5]:
class Net_3(nn.Module):
    """Small fully connected regressor: ``n`` inputs -> 100 hidden units -> 1 output.

    A leaky ReLU is applied after both linear layers, including the output
    layer.

    Args:
        n: number of input features expected by the first linear layer.
    """

    def __init__(self, n):
        super().__init__()
        # Attribute names are part of the state-dict keys; keep them stable.
        self.fc1 = nn.Linear(n, 100)
        self.fc2 = nn.Linear(100, 1)

    def forward(self, x):
        """Return the network output for the batch ``x``."""
        hidden = F.leaky_relu(self.fc1(x))
        out = F.leaky_relu(self.fc2(hidden))
        return out

In [6]:
class NN3():
    """Ensemble of three regression networks, one per feature matrix.

    Each sub-model is trained independently on its own feature matrix against
    the same targets. Predictions are then merged either with a plain mean
    (``assemble_y == 'mean'``) or with the agreement-weighted mean implemented
    in ``droledemean`` (any other value, ``'custom'`` by default).

    Args:
        mod_1, mod_2, mod_3: torch modules trained on X1, X2 and X3 respectively.
        mini_batch_size: number of rows per optimisation step.
        apply_iqr: if True, ``fit`` drops training rows whose target is an
            IQR outlier.
        apply_scaler, apply_pca: stored on the instance but not used by any
            method of this class.
        assemble_y: how the three predictions are merged ('custom' or 'mean').
        nb_epochs: number of passes over the training data per sub-model.
        normalize: if True, every feature matrix is passed through a
            scikit-learn ``Normalizer`` in both ``fit`` and ``predict_indep``.
    """

    def __init__(self,mod_1,mod_2,mod_3,mini_batch_size = 10 ,
                 apply_iqr = True,apply_scaler = False,apply_pca = False,
                 assemble_y = 'custom',nb_epochs = 150,normalize = False):
        self.mod1 = mod_1
        self.mod2 = mod_2
        self.mod3 = mod_3
        self.minbatchsize = mini_batch_size
        self.assemble_y = assemble_y
        self.nb_epochs = nb_epochs
        self.apply_iqr = apply_iqr
        self.apply_pca = apply_pca
        self.apply_scaler = apply_scaler
        # Previously never stored, which made fit()/predict_indep() raise
        # AttributeError as soon as they read self.normalize.
        self.normalize = normalize

    def train_model(self,model, train_input, train_target, monitor_loss=False):
        """Train ``model`` with Adam on an MSE loss, in sequential mini-batches.

        Args:
            model: torch module to optimise in place.
            train_input: 2-D tensor of features.
            train_target: tensor of targets with the same number of rows.
            monitor_loss: if True, return the per-epoch loss history.

        Returns:
            A list with the summed mini-batch loss of every epoch when
            ``monitor_loss`` is True, otherwise None.
        """
        criterion = nn.MSELoss()  # regression task
        optimizer = optim.Adam(model.parameters(), lr = 1e-4)  # 1e-4 is the usual value

        losses = []
        N = train_input.size(0)

        for e in range(self.nb_epochs):
            sum_loss = 0
            for b in range(0, N, self.minbatchsize):
                # The last batch may be shorter than minbatchsize.
                batch_len = min(self.minbatchsize, N - b)
                output = model(train_input.narrow(0, b, batch_len))
                loss = criterion(output, train_target.narrow(0, b, batch_len))
                model.zero_grad()
                loss.backward()

                # Accumulate the mini-batch losses of the current epoch.
                sum_loss += loss.item()

                optimizer.step()

            losses.append(sum_loss)

            print('[epoch {:d}] loss: {:0.2f}'.format(e+1, sum_loss))

        if monitor_loss:
            return losses

    def IQR_y_outliers(self,X1,X2,X3,y_data):
        """Drop every row whose label (i.e. shielding) is an IQR outlier.

        A label is kept when it lies strictly inside
        ``(q1 - 1.5 * iqr, q3 + 1.5 * iqr)``.

        Args:
            X1, X2, X3: feature arrays indexed along their first axis.
            y_data: label array (assumed 1-D, row-aligned with the X arrays).

        Returns:
            The filtered ``X1, X2, X3, y_data``.
        """
        q1, q3 = np.percentile(y_data, [25, 75])
        # With a zero IQR both bounds collapse onto q1 and every row would go.
        assert q1 != q3, "IQR is zero: cannot filter outliers"

        iqr = q3 - q1
        lower_bound = q1 - (iqr * 1.5)
        upper_bound = q3 + (iqr * 1.5)

        idx = np.where((y_data > lower_bound) & (y_data < upper_bound))
        X1, X2, X3 = X1[idx], X2[idx], X3[idx]
        y_data = y_data[idx]

        assert(X1.shape[0] == y_data.shape[0] and X2.shape[0] == y_data.shape[0] and X3.shape[0] == y_data.shape[0])
        return X1, X2, X3, y_data

    def fit(self, X1,X2,X3,y):
        """Train the three sub-models on their respective feature matrices.

        Optional IQR filtering and normalisation are applied first. The
        per-epoch loss histories are kept in ``self.train_losses`` as a
        ``(loss1, loss2, loss3)`` tuple.
        """
        if self.apply_iqr:
            X1,X2,X3,y = self.IQR_y_outliers(X1,X2,X3,y)
        if self.normalize:
            self.trans1 = Normalizer().fit(X1)
            self.trans2 = Normalizer().fit(X2)
            self.trans3 = Normalizer().fit(X3)
            # Train on the same representation predict_indep() will feed in.
            X1 = self.trans1.transform(X1)
            X2 = self.trans2.transform(X2)
            X3 = self.trans3.transform(X3)
        X1 = torch.Tensor(X1)
        X2 = torch.Tensor(X2)
        X3 = torch.Tensor(X3)
        # Column vector so the target shape matches the (N, 1) model output.
        y = torch.Tensor(np.asarray(y).reshape(len(y), 1))
        print('#' * 30 + 'Training model 1'+ '#' * 30)
        loss1 = self.train_model(self.mod1, X1, y, monitor_loss=True)
        print('#' * 30 + 'Training model 2'+ '#' * 30)
        loss2 = self.train_model(self.mod2, X2, y, monitor_loss=True)
        print('#' * 30 + 'Training model 3'+ '#' * 30)
        loss3 = self.train_model(self.mod3, X3, y, monitor_loss=True)
        print('#' * 30 + 'TRAINING TERMINATED'+ '#' * 30)
        self.train_losses = (loss1, loss2, loss3)

    def droledemean(self,xs,eps = 1e-12):
        """Agreement-weighted mean of the values in ``xs``.

        Each value is weighted by the sum of the inverse absolute distances to
        all the other values, so values that agree with the rest count more.

        Args:
            xs: iterable of scalar predictions.
            eps: lower bound applied to pairwise distances so that identical
                predictions give a finite result (the shared value dominates)
                instead of a division by zero.

        Returns:
            The weighted mean as a float; for a single value, that value.
        """
        vals = np.asarray(list(xs), dtype=float)
        if vals.size < 2:
            # No pair to compare: the weighted mean degenerates to the mean.
            return float(np.mean(vals))
        dist = np.abs(vals[:, None] - vals[None, :])
        inv = 1.0 / np.maximum(dist, eps)
        # A value is never compared with itself.
        np.fill_diagonal(inv, 0.0)
        weights = inv.sum(axis=1)
        return float(np.sum(weights * vals) / weights.sum())

    def assemble_ys(self,y1,y2,y3):
        """Merge the three prediction arrays according to ``self.assemble_y``.

        'mean' returns the element-wise mean with the shape of the inputs;
        any other setting returns a 1-D array of ``droledemean`` values.
        """
        if self.assemble_y == 'mean':
            return np.mean([y1,y2,y3],axis = 0)
        # One row per sample, one column per sub-model.
        per_sample = np.column_stack([y1.reshape(y1.shape[0]),
                                      y2.reshape(y2.shape[0]),
                                      y3.reshape(y3.shape[0])])
        return np.array([self.droledemean(row) for row in per_sample])

    def predict_indep(self,X1,X2,X3):
        """Return the raw predictions of each sub-model as numpy arrays."""
        if self.normalize:
            X1 = self.trans1.transform(X1)
            X2 = self.trans2.transform(X2)
            X3 = self.trans3.transform(X3)
        X1 = torch.Tensor(X1)
        X2 = torch.Tensor(X2)
        X3 = torch.Tensor(X3)
        # Inference only: no need to build the autograd graph.
        with torch.no_grad():
            y1_hat = self.mod1(X1).detach().numpy()
            y2_hat = self.mod2(X2).detach().numpy()
            y3_hat = self.mod3(X3).detach().numpy()
        return y1_hat,y2_hat,y3_hat

    def set_mean(self,meth):
        """Select how predictions are merged: 'mean' or 'custom'."""
        assert meth in ('mean', 'custom'), "meth must be 'mean' or 'custom'"
        self.assemble_y = meth

    def predict(self,X1,X2,X3):
        """Predict with the three sub-models and merge their outputs."""
        y1_hat,y2_hat,y3_hat = self.predict_indep(X1,X2,X3)
        return self.assemble_ys(y1_hat,y2_hat,y3_hat)


### 4.1 Pipelines

Each cell here is meant to run a whole pipeline: loading a given number of samples, preprocessing, etc. We keep using the R2 score, the MSE and the MAE as our metrics.

In [5]:
def compute_score(y_actual, y_pred,verbose = False):
    """Compute the MSE and MAE between targets and predictions.

    Args:
        y_actual: ground-truth values.
        y_pred: predicted values.
        verbose: if True, also print both metrics.

    Returns:
        A dict with the keys 'mse' and 'mae'.
    """
    metrics = {
        'mse': mean_squared_error(y_actual, y_pred),
        'mae': mean_absolute_error(y_actual, y_pred),
    }
    if not verbose:
        return metrics
    print("Obtained MSE on test set %2.2f " % metrics['mse'])
    print("Obtained MAE on test set %2.2f " % metrics['mae'])
    return metrics

In [6]:
def KFold3(model3,data_X1,data_X2,data_X3,data_y,n_splits = 4,verbose = True):
    """Perform K-fold cross validation on an NN3 object.

    Before every fold the three sub-models are reset to the weights they had
    when this function was called. Without that reset each fold would start
    from weights already trained on earlier folds, whose training rows include
    the current test rows, and the scores would be optimistically biased.

    Args:
        model3: NN3 instance (exposes ``mod1``, ``mod2``, ``mod3``, ``fit``,
            ``predict`` and ``set_mean``).
        data_X1, data_X2, data_X3: feature arrays, row-aligned with ``data_y``.
        data_y: target array.
        n_splits: number of folds.
        verbose: if True, print the scores of each fold.

    Returns:
        ``(scores_mean, scores_custom)``: two dicts with the keys 'mse' and
        'mae', each holding one value per fold, for the 'mean' and 'custom'
        assembling methods respectively.

    Side effects:
        ``model3`` is left trained on the last fold with its assembling method
        set to 'custom'.
    """
    import copy

    kf = KFold(n_splits=n_splits, random_state=14, shuffle=True)

    # state_dict() returns references to the live tensors, hence the deep copy.
    sub_models = (model3.mod1, model3.mod2, model3.mod3)
    initial_states = [copy.deepcopy(sub.state_dict()) for sub in sub_models]

    scores = {'mean': {'mse':[],'mae':[]}, 'custom': {'mse':[],'mae':[]}}
    for kindx,(train_index, test_index) in enumerate(kf.split(data_y)):

        print('%i / %i fold' % (kindx+1,n_splits))

        # Start every fold from the same untrained weights.
        for sub, state in zip(sub_models, initial_states):
            sub.load_state_dict(state)

        X1_train, X1_test = data_X1[train_index],data_X1[test_index]
        X2_train, X2_test = data_X2[train_index],data_X2[test_index]
        X3_train, X3_test = data_X3[train_index],data_X3[test_index]
        y_train, y_test = data_y[train_index], data_y[test_index]
        model3.fit(X1_train,X2_train,X3_train,y_train)

        # Evaluate the same trained models with both assembling methods.
        fold_scores = {}
        for method in ('mean', 'custom'):
            model3.set_mean(method)
            y_hat = model3.predict(X1_test,X2_test,X3_test)
            fold_scores[method] = compute_score(y_test,y_hat)
            for metric in ('mse', 'mae'):
                scores[method][metric].append(fold_scores[method][metric])

        if verbose:
            print('Mean method:{}'.format(fold_scores['mean']))
            print('Custom method:{}'.format(fold_scores['custom']))
    return scores['mean'],scores['custom']


In [68]:
# Number of rows sampled from the full dataset for this experiment.
n_samples = 15000
# mmap_mode='r' memory-maps the file read-only instead of loading it eagerly.
tot_data_y = np.load('data/CSD-10k_H_chemical_shieldings.npy',mmap_mode='r')
# Unseeded random subset of row indices (differs on every run). The same
# indices are reused for all files below so y, X1, X2 and X3 stay row-aligned.
mask = np.random.permutation(tot_data_y.shape[0])[:n_samples]
data_y = tot_data_y[mask]
# The three feature files differ only in the rc_* part of their name
# (3.0 / 5.0 / 7.0) -- presumably the descriptor cutoff radius; TODO confirm.
tot_data_X = np.load('data/CSD-10k_H_fps_1k_MD_n_12_l_9_rc_3.0_gw_0.3_rsr_1.0_rss_2.5_rse_5.npy',mmap_mode='r')
data_X1 = tot_data_X[mask]
tot_data_X = np.load('data/CSD-10k_H_fps_1k_MD_n_12_l_9_rc_5.0_gw_0.3_rsr_1.0_rss_2.5_rse_5.npy',mmap_mode='r')
data_X2 = tot_data_X[mask]
tot_data_X = np.load('data/CSD-10k_H_fps_1k_MD_n_12_l_9_rc_7.0_gw_0.3_rsr_1.0_rss_2.5_rse_5.npy',mmap_mode='r')
data_X3 = tot_data_X[mask]

# Instantiate one network per feature set and wrap them in the NN3 ensemble.
# NOTE(review): 14400 is hard-coded as the input width; it presumably equals
# data_X1.shape[1] -- confirm against the feature files.
bibo1 = Net_3(14400)
bibo2 = Net_3(14400)
bibo3 = Net_3(14400)
bigbibo  = NN3(bibo1,bibo2,bibo3,nb_epochs=150,assemble_y='custom')

# 5-fold cross validation; returns per-fold scores for both assembling methods.
scores_mean,scores_custom = KFold3(bigbibo,data_X1,data_X2,data_X3,data_y,n_splits=5)

scores = {'mean_method':scores_mean,'custom_method':scores_custom}

# Persist the per-fold scores so they can be reloaded without retraining.
with open('kfoldresults.pickle','wb') as f:
    pickle.dump(scores,f)

# Average MSE across folds: plain mean first, then the custom weighted mean.
print(np.mean(scores_mean['mse']))
print(np.mean(scores_custom['mse']))

In [None]:
#single training no kfold
def single_test(bibgbibo,data_X1,data_X2,data_X3,data_y,train_size = 200):
    """Train once on a random subset of rows and score on all remaining rows.

    Args:
        bibgbibo: NN3-like model exposing ``fit``, ``predict`` and ``set_mean``.
            (The parameter name is kept as-is for backward compatibility.)
        data_X1, data_X2, data_X3: feature arrays, row-aligned with ``data_y``.
        data_y: target array.
        train_size: number of rows randomly drawn for training; the rest is
            used for testing.

    Returns:
        ``(score_mean, score_custom)``: the ``compute_score`` dicts obtained
        with the 'mean' and 'custom' assembling methods.

    Raises:
        ValueError: if ``train_size`` does not leave at least one training row
            and one test row.
    """
    # Use the model that was passed in. The body previously referenced the
    # global ``bigbibo`` and silently ignored this argument.
    model = bibgbibo

    n_rows = data_y.shape[0]
    if not 0 < train_size < n_rows:
        raise ValueError(
            'train_size must be in [1, %d], got %r' % (n_rows - 1, train_size))

    # Boolean mask with exactly train_size True entries at random positions.
    mask = np.full(n_rows, False)
    mask[:train_size] = True
    np.random.shuffle(mask)

    y_train = data_y[mask]
    y_test = data_y[~mask]
    X1_train = data_X1[mask]
    X1_test = data_X1[~mask]
    X2_train = data_X2[mask]
    X2_test = data_X2[~mask]
    X3_train = data_X3[mask]
    X3_test = data_X3[~mask]

    model.fit(X1_train,X2_train,X3_train,y_train)

    model.set_mean('mean')
    y_hat = model.predict(X1_test,X2_test,X3_test)
    score_mean = compute_score(y_test,y_hat)

    model.set_mean('custom')
    y_hat = model.predict(X1_test,X2_test,X3_test)
    score_custom = compute_score(y_test,y_hat)

    return score_mean,score_custom


In [218]:
# Single train/test run (no k-fold); the cell displays the returned
# (score_mean, score_custom) pair.
single_test(bigbibo,data_X1,data_X2,data_X3,data_y)

##############################Training model 1##############################


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


[epoch 1] loss: 12693.66
[epoch 2] loss: 12616.09
[epoch 3] loss: 12528.58
##############################Training model 2##############################
[epoch 1] loss: 12666.94
[epoch 2] loss: 12610.31
[epoch 3] loss: 12545.70
##############################Training model 3##############################
[epoch 1] loss: 12745.96
[epoch 2] loss: 12696.40
[epoch 3] loss: 12639.43
##############################TRAINING TERMINATED##############################


({'mse': 645.6513731255138, 'mae': 25.16708791091442},
 {'mse': 644.775093078498, 'mae': 25.149303449651683})