In [None]:
import math
import scipy
import numpy as np
import pandas as pd

# Import relevant modSAR classes 
import modSAR
from modSAR.network_algorithms import ModSAR
from modSAR.dataset import QSARDataset, QSARDatasetIO

# plotnine is the python version of ggplot2
from plotnine import *

import warnings
warnings.filterwarnings("ignore")

from rdkit import Chem
from rdkit.Chem import AllChem, Draw

from rdkit import RDLogger

import rdkit.Geometry
from rdkit.Chem import rdFMCS, PandasTools
from rdkit.Chem.Draw import IPythonConsole 
from rdkit.Chem import PandasTools

from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.model_selection import RandomizedSearchCV

# Activity cutoff: 2.5 uM (2.5e-06) expressed on the -log10 scale
CUTOFF_ACTIVITY = -np.log10(2.5e-06)

# SMARTS query describing a fused six-/five-membered aromatic ring system built
# from carbon (#6) and nitrogen (#7) atoms.
# NOTE(review): presumably the shared core scaffold of the series -- confirm.
s4_template = Chem.MolFromSmarts(
    '[#6]1:[#7]:[#6]:[#6]:[#7]2:[#6]:1:[#7]:[#7]:[#6]:2'
)
# Compute 2D coordinates for the template molecule (stored on the molecule itself)
AllChem.Compute2DCoords(s4_template)

# Turn off RDKit's 'rdApp.info' log channel
RDLogger.DisableLog('rdApp.info')

from rdkit.Chem import PandasTools

%matplotlib inline

# Load Data

In [None]:
# Load the 'OSM4' dataset from the Morgan-2 workbook; similarity calculation is skipped.
# NOTE(review): this path is 'data/...' whereas the Morgan-4 cell reads from '../data/...';
# confirm that both resolve correctly from the notebook's working directory.
dataset_morgan2 = QSARDatasetIO.load(
    dataset_name='OSM4',
    activity_sheetname='activity',
    smiles_column='Canonical_Smiles',
    id_column='OSM_ID',
    filepath='data/osm_qsar_dataset_morgan2.xlsx',
    calculate_similarity=False,
)

# Bare expression so the notebook displays the loaded dataset
dataset_morgan2

In [None]:
# Load the 'OSM4' dataset from the Morgan-4 workbook; similarity calculation is skipped.
# NOTE(review): this path is '../data/...' whereas the Morgan-2 cell reads from 'data/...';
# confirm that both resolve correctly from the notebook's working directory.
dataset_morgan4 = QSARDatasetIO.load(
    dataset_name='OSM4',
    activity_sheetname='activity',
    smiles_column='Canonical_Smiles',
    id_column='OSM_ID',
    filepath='../data/osm_qsar_dataset_morgan4.xlsx',
    calculate_similarity=False,
)

# Bare expression so the notebook displays the loaded dataset
dataset_morgan4

# Train ModSAR algorithm

The ModSAR algorithm builds regression models in two stages:

1. Training data is represented as a **network** and divided into clusters by optimising the [modularity metric](https://python-louvain.readthedocs.io/en/latest/).
2. Each of these clusters (also called modules) is then modelled by the [OplraReg](https://onlinelibrary.wiley.com/doi/10.1002/minf.201800028) algorithm, a [segmented (or piecewise) regression model](https://www.theanalysisfactor.com/segmented-regression-for-non-constant-relationships/) with a regularisation penalty to select the most relevant features and reduce overfitting.


ModSAR inherits a couple of parameters from OplraReg, but in practice we only need to tune $\lambda$, which defines how strong the effect of regularisation will be in the final piecewise linear equations. A higher $\lambda$ leads to fewer features in the piecewise linear models and makes training run quicker, but if $\lambda$ is set too high the model will have a large bias and will usually "underfit" the data (see the [bias-variance tradeoff](https://towardsdatascience.com/understanding-the-bias-variance-tradeoff-165e6942b229)).

## Training with different $\lambda$

In this section, we will perform a cross-validation while searching for an optimum $\lambda$ parameter of modSAR.


Based on previous knowledge, we know this parameter to be optimally placed around $\lambda \approx 0.005$ - and no lower than that - so I will be sampling this parameter from a HalfNormal distribution, i.e. $\lambda \sim \text{HalfNormal}(\mu=0.005, \sigma=0.05)$.

Randomized Search and Cross-Validation is performed with scikit-learn's [RandomizedSearchCV](https://scikit-learn.org/stable/modules/grid_search.html#randomized-parameter-search):

In [None]:
from copy import deepcopy  # no longer used in this cell; import kept in case later cells rely on it

import scipy.stats  # load the `stats` submodule explicitly before it is accessed below
from sklearn.model_selection import GridSearchCV

# Baseline estimator, kept under its original name for any later cell that refers to it.
modsar_alg = ModSAR(lam=0.02, metadata=dataset_morgan4.metadata, solver_name='glpk')

# Metrics recorded for every candidate; the model is refit on "neg_mean_squared_error".
scoring = ['neg_root_mean_squared_error', "neg_mean_absolute_error", "neg_mean_squared_error", "r2"]

# NOTE(review): the accompanying text describes a randomized search that samples `lam`
# from this half-normal distribution, but this cell runs an exhaustive GridSearchCV and
# `param_distributions` is not passed to either search. Confirm which one is intended.
param_distributions = {"lam": scipy.stats.halfnorm(0.005, 0.05)}

# Values of `lam` actually searched: 0.01, 0.02, ..., 0.19
param_grid = {"lam": [i/100 for i in range(1, 20)]}


def build_modsar_grid_search(metadata):
    """Create an unfitted 5-fold grid search over `lam` for a ModSAR estimator.

    Parameters
    ----------
    metadata
        The `metadata` attribute of the dataset the search will be fitted on.
        It is forwarded unchanged to the `ModSAR` constructor.

    Returns
    -------
    GridSearchCV
        Search configured with the module-level `param_grid` and `scoring`,
        refitting on "neg_mean_squared_error".
    """
    # A fresh estimator per search, so each model carries the metadata of its own dataset
    estimator = ModSAR(lam=0.02, metadata=metadata, solver_name='glpk')
    return GridSearchCV(estimator,
                        return_train_score=True,
                        param_grid=param_grid,
                        scoring=scoring,
                        refit="neg_mean_squared_error",
                        cv=5,
                        n_jobs=5,
                        verbose=3)


# Previously both searches wrapped a copy of the estimator built with the Morgan-4
# metadata; each search now receives the metadata of the dataset it is fitted on.
model_morgan2 = build_modsar_grid_search(dataset_morgan2.metadata)
model_morgan4 = build_modsar_grid_search(dataset_morgan4.metadata)

In [None]:
# Run the cross-validated search over `lam` using the Morgan-2 dataset's X and y
model_morgan2.fit(dataset_morgan2.X, dataset_morgan2.y)

In [None]:
# Save model to disk
import os

import joblib

# NOTE(review): the file name ends in "_r2" although the search is configured to refit
# on "neg_mean_squared_error"; confirm the naming is intentional.
model_morgan2_path = "data/results/model_modsar_morgan2_r2.joblib"

# Create the output directory first: writing fails if "data/results" does not exist yet
os.makedirs(os.path.dirname(model_morgan2_path), exist_ok=True)
joblib.dump(model_morgan2, model_morgan2_path)
# results_df.to_csv("../data/model_modsar_morgan2_results.csv")