# Machine Learning project CS-433: NMR spectroscopy supervised learning



## Schedules:

* Week 10 (18-24 November): 
 * Tests of various linear models/simple NN on a 10% subset of data
* Week 11 (25-1 December):
 * Feature selection: coming up with a good set of features
* Week 12 (2-8 December):
 * Start of big scale analysis with Spark, implementation of the models which perform well at small scale
* Week 13 (9-15 December):
 * Wrapping up
* Week 14 (16-22 December): 
 * 19th December: Deadline

In [None]:
import os
import re
import pickle
import scipy.stats
import sklearn.metrics

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from scipy.stats import norm
from itertools import combinations

from IPython.core.debugger import set_trace


from sklearn.preprocessing import MinMaxScaler
from sklearn.utils import resample
from sklearn.metrics import r2_score
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA

## Log Book

We log here the different techniques/improvements we add to the program; see **log/models_log.txt** for the models already tried and their results.

## Pipeline

In [None]:
#pipeline graph coming soon

In [None]:
# Features and targets are opened with mmap_mode='r' (read-only memory maps)
# rather than being read fully into memory.
tot_data_X = np.load(
    'data/CSD-10k_H_fps_1k_MD_n_12_l_9_rc_3.0_gw_0.3_rsr_1.0_rss_2.5_rse_5.npy',
    mmap_mode='r',
)
tot_data_y = np.load('data/CSD-10k_H_chemical_shieldings.npy', mmap_mode='r')

# Number of rows (samples) and columns (features) of the feature array.
DATA_LEN, DATA_COLS = tot_data_X.shape[:2]

In [None]:
def load_data(n_samples,tot_data_x = tot_data_X,tot_data_y = tot_data_y):
    """
    Draw a random subset of samples (without replacement) from a dataset.

    n_samples: number of samples to draw; if it exceeds the number of rows
        available, every row is returned (in shuffled order).
    tot_data_x: array of features, one row per sample.
    tot_data_y: array of targets, aligned with the rows of tot_data_x.
    return data_X, data_y: the selected rows of tot_data_x and tot_data_y.
    """
    # The draw is not seeded, so each call returns a different subset.
    # Size the permutation from the array actually passed in, so that the
    # tot_data_x / tot_data_y arguments are honoured instead of the globals.
    mask_data = np.random.permutation(tot_data_x.shape[0])[:n_samples]

    data_X = tot_data_x[mask_data]
    data_y = tot_data_y[mask_data]
    return data_X, data_y

In [None]:
def load_data_train_test(n_samples,tot_data_x = tot_data_X,tot_data_y = tot_data_y):
    """
    Draw n_samples samples with load_data and split them 80% / 20%.

    return X_train, X_test, y_train, y_test
    """
    features, targets = load_data(n_samples, tot_data_x, tot_data_y)
    # train_test_split yields [X_train, X_test, y_train, y_test], in that order.
    split = train_test_split(features, targets, test_size=0.2)
    return tuple(split)

## Data Visualization

In [None]:
# Random 3000-sample subset; features and targets are each wrapped in a DataFrame.
data_X, data_y = load_data(3000)
data_X_df, data_y_df = (pd.DataFrame(values) for values in (data_X, data_y))

In [None]:
# Histograms (50 bins) of 9 randomly chosen feature columns on a 3x3 grid.
# `mask` holds the chosen column indexes.
mask = np.random.permutation(DATA_COLS)[:9]
fig, axes = plt.subplots(nrows=3, ncols=3)
fig.set_size_inches(11, 11)
# axes.flat walks the grid row by row, matching unravel_index on a (3, 3) shape.
for ax, col in zip(axes.flat, mask):
    ax.ticklabel_format(style='sci', scilimits=(-3, 4), axis='both')
    data_X_df.iloc[:, col].hist(ax=ax, bins=50)

In [None]:
# Box plots of the feature columns listed in `mask`, on a 3x3 grid.
fig, axes = plt.subplots(nrows=3, ncols=3)
fig.set_size_inches(11, 11)
# axes.flat walks the grid row by row, matching unravel_index on a (3, 3) shape.
for ax, col in zip(axes.flat, mask):
    ax.ticklabel_format(style='sci', scilimits=(-3, 4), axis='both')
    data_X_df.iloc[:, col].plot.box(ax=ax)

We see that the different features are on quite different scales, so we may want to rescale them beforehand. Since they do not appear to follow a Gaussian distribution, we apply min/max scaling; but to do so we first need to get rid of the outliers, using one of the following methods:
* Z-score: not suitable, as our data might not be Gaussian
* DBScan:
* Isolation Forest:

## Outlier detection

In [None]:
def filter_outliers(meth,X_train,y_train):
    """
    Drops from the dataset every sample that at least one feature flags as an outlier.

    meth: callable applied to each feature (each row of X_train.T); it must
        return the indexes of the samples it considers as outliers.
    X_train: feature array, one row per sample.
    y_train: targets, aligned with the rows of X_train.
    return X_train, y_train: new arrays without the flagged rows.
    """
    # One index array per feature. Their lengths generally differ, so they are
    # kept in a list: wrapping them in np.array would build a ragged array,
    # which NumPy rejects.
    masks = [np.asarray(meth(feat), dtype=np.intp).ravel() for feat in X_train.T]
    # np.concatenate refuses an empty list (dataset without features).
    flagged = np.concatenate(masks) if masks else np.empty(0, dtype=np.intp)
    # A sample flagged by several features appears several times in `flagged`;
    # np.delete removes the row once.
    X_train = np.delete(X_train,flagged,axis = 0)
    y_train = np.delete(y_train,flagged,axis = 0)
    return X_train,y_train

### DBSCAN

Problem: computationally too demanding.

In [None]:
#clustering = DBSCAN(eps=0.3, min_samples=2).fit(X_train)
#with np.printoptions(threshold=np.inf):
#    print(clustering.labels_)

### Interquartile range method (IQR)

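This method considers as outliers all data points that lie more than 1.5 times the interquartile range below the first quartile or above the third quartile. Note that the implementation below uses the 5th and 95th percentiles in place of the quartiles.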

In [None]:
def IQR(ys, lower_pct=5, upper_pct=95, k=1.5):
    """
    returns the array of indexes of the samples considered as outliers according to IQR

    A sample is an outlier when it lies more than k times the inter-percentile
    range below the lower percentile or above the upper percentile.

    ys: 1-D sequence of values of a single feature.
    lower_pct, upper_pct: percentiles delimiting the range. The defaults
        (5, 95) are wider than the textbook quartiles; pass 25 and 75 for the
        classic interquartile range.
    k: multiple of the range tolerated beyond the two percentiles.
    """
    ys = np.asarray(ys)
    if ys.size == 0:
        # np.percentile cannot handle an empty array; nothing to flag.
        return np.empty(0, dtype=np.intp)
    low, high = np.percentile(ys, [lower_pct, upper_pct])
    spread = high - low
    lower_bound = low - (spread * k)
    upper_bound = high + (spread * k)
    return np.where((ys > upper_bound) | (ys < lower_bound))[0]

## Scaling

### Min/max Scaling

In [None]:
def apply_scaler(scaler,X_train,X_test):
    """
    Fit a scaler on the training set and apply it to both sets.

    scaler: object exposing fit_transform and transform (e.g. a MinMaxScaler).
    X_train: training features, used to fit the scaler.
    X_test: test features, transformed with the parameters learnt on X_train.
    return X_train, X_test: the scaled arrays.
    """
    # Use the scaler passed as argument, not a notebook-level global.
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    return X_train,X_test

## Dimensionality reduction

### PCA

In [None]:
def plot_PCA(n_comp,X_train):
    """
    Fits a PCA with n_comp components on X_train and displays its scree plot
    ('elbow'): the cumulative explained variance ratio of the components."""
    fitted = PCA(n_components = n_comp).fit(X_train)
    cumulative_variance = np.cumsum(fitted.explained_variance_ratio_)

    plt.figure(1)
    plt.plot(cumulative_variance)
    plt.xlabel('Number of components')
    plt.ylabel('Cumulative explained variance')
    plt.show()
    
# Scree plot with 70 components on a fresh 3000-sample draw; only the training
# features are kept, the three other outputs of the split are discarded.
X_train,_,_,_ = load_data_train_test(3000)
plot_PCA(70,X_train)

In [None]:
def do_PCA(X_train,X_test,n):
    """
    Projects both sets on the n principal components learnt on X_train.

    return X_train, X_test: the projected arrays."""
    reducer = PCA(n_components = n)
    # The components are fitted on the training set only, then reused for the test set.
    train_projected = reducer.fit_transform(X_train)
    test_projected = reducer.transform(X_test)
    return train_projected, test_projected

## Models

### Linear Models

In [None]:
def test_lin_model(reg, X_train,y_train,X_test,y_test):
    """
    Fit and evaluate a model reg which is expected to be already instantiated.

    reg is (re)fitted on the train set; its score (via reg.score) on the train
    and test sets and its MSE and MAE on the test set are printed.

    return mse, mae: mean squared error and mean absolute error on the test set.
    Note that the train/test scores are only printed, not returned.
    """
    # Fit the model passed as argument, not a notebook-level global.
    reg.fit(X_train,y_train)
    train_R2 = reg.score(X_train,y_train)
    test_R2 = reg.score(X_test,y_test)
    y_hat = reg.predict(X_test)
    mse = mean_squared_error(y_test,y_hat)
    mae = mean_absolute_error(y_test,y_hat)
    print("Obtained score on train set %2.2f " % train_R2)
    print("Obtained score on test set %2.2f " % test_R2)
    print("Obtained MSE on test set %2.2f " % mse)
    print("Obtained MAE on test set %2.2f " % mae)
    return mse, mae

### Neural Nets

## Main

Each cell here is meant to do a whole pipeline, from loading a certain number of samples, preprocessing etc. We keep using the R2 score as our metric

In [None]:
# Draw one 10000-sample train/test split and keep it under *_save names;
# the pipeline cells below work on copies of these arrays.
X_train_save,X_test_save,y_train_save,y_test_save = load_data_train_test(10000)

In [None]:
#Linear Regression
# Work on copies so that the saved split stays untouched.
X_train = np.copy(X_train_save)
X_test = np.copy(X_test_save)
# Copy from the saved targets: copying y_train itself would reuse whatever an
# earlier cell left in it (or fail if it was never defined).
y_train = np.copy(y_train_save)
y_test = np.copy(y_test_save)
# Outliers are removed from the training set only.
X_train,y_train = filter_outliers(IQR,X_train,y_train)
X_train,X_test = do_PCA(X_train,X_test,40)
minmx_scaler = MinMaxScaler()
X_train, X_test = apply_scaler(minmx_scaler,X_train,X_test)
lin = LinearRegression(fit_intercept = True).fit(X_train,y_train)
# NOTE: test_lin_model returns (MSE, MAE) on the test set, so despite their
# names these two variables hold errors, not scores.
train_score,test_score = test_lin_model(lin,X_train,y_train,X_test,y_test)

In [None]:
#Ridge regression
# Work on copies so that the saved split stays untouched.
X_train = np.copy(X_train_save)
X_test = np.copy(X_test_save)
y_train = np.copy(y_train_save)
y_test = np.copy(y_test_save)
# do_PCA returns the projected arrays instead of modifying its arguments, so
# the result must be assigned for the projection to take effect.
X_train,X_test = do_PCA(X_train,X_test,40)
minmx_scaler = MinMaxScaler()
X_train, X_test = apply_scaler(minmx_scaler,X_train,X_test)
rid = Ridge().fit(X_train,y_train)
# NOTE: test_lin_model returns (MSE, MAE) on the test set, so despite their
# names these two variables hold errors, not scores.
train_score,test_score = test_lin_model(rid,X_train,y_train,X_test,y_test)