# maxsmi
## Analysis of results

This notebook serves to analyse the results of the simulations run on the Curta cluster of the Freie Universität Berlin.

### Maxsmi Models

### Goal

Determine the Maxsmi models for all three tasks:

- ESOL,
- FreeSolv,
- Lipophilicity.

In [1]:
import os
from pathlib import Path
import pickle
import numpy as np
import matplotlib.pyplot as plt

# Last entry of IPython's directory history `_dh`, i.e. the kernel's current
# directory (presumably the folder containing this notebook — TODO confirm)
HERE = Path(_dh[-1])

# Parent folder of HERE; later cells read results and trained models relative to it
path_to_output = HERE.parents[0]

In [2]:
def load_data(path, task):
    """
    Loads the result data from the Maxsmi models.

    Parameters
    ----------
    path : str or pathlib.Path
        The path to the folder that contains the `output` folder.
    task : str
        The data with associated task. One of "FreeSolv", "ESOL" or
        "Lipophilicity" (case-sensitive, since it is part of the folder name).

    Returns
    -------
    data : object
        The unpickled content of `results_metrics.pkl` for the Maxsmi model of
        the task (a pandas data frame with performance metrics on train and
        test sets, according to the original documentation).

    Raises
    ------
    ValueError
        If `task` is not one of the supported tasks.
    FileNotFoundError
        If the result file of the Maxsmi model does not exist under `path`.
    """
    # Maxsmi model per task: (ml_model, augmentation_strategy, augmentation_number)
    maxsmi_models = {
        "FreeSolv": ("CONV1D", "augmentation_with_duplication", 70),
        "ESOL": ("CONV1D", "augmentation_with_reduced_duplication", 70),
        "Lipophilicity": ("CONV1D", "augmentation_without_duplication", 80),
    }

    # Fail early with a clear message instead of an UnboundLocalError further down
    if task not in maxsmi_models:
        raise ValueError(
            f"Unknown task {task!r}; expected one of {sorted(maxsmi_models)}."
        )
    ml_model, augmentation_strategy, augmentation_number = maxsmi_models[task]

    # The folder name repeats "<strategy>_<number>" twice (once for the train
    # set and once for the test set, presumably — both are identical here)
    augmentation = f"{augmentation_strategy}_{augmentation_number}"
    results_file = (
        Path(path)
        / "output"
        / f"{task}_smiles_{augmentation}_{augmentation}_{ml_model}"
        / "results_metrics.pkl"
    )

    # NOTE: pickle.load can execute arbitrary code; only use it on result files
    # produced by trusted simulation runs.
    with open(results_file, "rb") as f:
        data = pickle.load(f)

    print(ml_model, augmentation_strategy, augmentation_number)
    return data

In [3]:
# Report, per task, the entry `test[0][1]` of the Maxsmi model results
# (shown under the "RMSE values" header printed first)
print("RMSE values\n-----------\n")
for task in ("FreeSolv", "ESOL", "Lipophilicity"):
    print(task)
    maxsmi_model = load_data(path_to_output, task)
    print("{:.3f}\n".format(maxsmi_model.test[0][1]))

RMSE values
-----------

FreeSolv
CONV1D augmentation_with_duplication 70
1.032

ESOL
CONV1D augmentation_with_reduced_duplication 70
0.569

Lipophilicity
CONV1D augmentation_without_duplication 80
0.593



In [4]:
# Report, per task, the entry `test[0][2]` of the Maxsmi model results
# (shown under the "R^2 values" header printed first)
print("R^2 values\n----------\n")
for task in ("FreeSolv", "ESOL", "Lipophilicity"):
    print(task)
    maxsmi_model = load_data(path_to_output, task)
    print("{:.3f}\n".format(maxsmi_model.test[0][2]))

R^2 values
----------

FreeSolv
CONV1D augmentation_with_duplication 70
0.935

ESOL
CONV1D augmentation_with_reduced_duplication 70
0.926

Lipophilicity
CONV1D augmentation_without_duplication 80
0.758



These values indeed correspond to the minimum values shown in the `results_tables` notebooks.

In [5]:
"""
From smiles to predictions

"""
import argparse
import logging
import logging.handlers
import pandas
import warnings
import os
from datetime import datetime
import numpy
import rdkit
from rdkit.Chem import Draw
import torch

In [6]:
from maxsmi.utils.utils_data import data_retrieval, smiles_in_training, data_checker
from maxsmi.utils.utils_smiles import (
    validity_check,
    smiles_to_canonical,
    smiles_to_folder_name,
    smiles_from_folder_name,
    is_connected,
    ALL_SMILES_DICT,
)
from maxsmi.utils.utils_encoding import char_replacement
from maxsmi.utils.utils_prediction import (
    retrieve_longest_smiles_from_optimal_model,
    unlabeled_smiles_max_length,
    character_check,
    mixture_check,
)

from maxsmi.pytorch_utils.pytorch_models import model_type
from maxsmi.pytorch_utils.pytorch_data import AugmentSmilesData
from maxsmi.pytorch_utils.pytorch_evaluation import out_of_sample_prediction
from maxsmi.utils.utils_optimal_model import retrieve_optimal_model

In [7]:
# Select the torch device: GPU when CUDA is available, CPU otherwise
is_cuda = torch.cuda.is_available()
device = torch.device("cuda" if is_cuda else "cpu")

if is_cuda:
    # Also log the name of the selected GPU
    device_name = torch.cuda.get_device_name(device)
    logging.info(f"CUDA available: {is_cuda} with {device_name}")
else:
    logging.info(f"CUDA available: {is_cuda}")

# Timestamp taken once the device has been selected
time_execution_start = datetime.now()

In [8]:
def retrieve_nb_model_parameters(task):
    """
    Counts the parameters of the stored prediction model of a task.

    The model architecture is rebuilt from the optimal model settings of the
    task, its weights are loaded from
    `<path_to_output>/prediction_models/<task>/model_dict.pth`, and a short
    summary (data shape, longest SMILES, model layers) is printed on the way.
    Relies on the notebook globals `device` and `path_to_output`.

    Parameters
    ----------
    task : str
        The data with associated task, e.g. "ESOL", "FreeSolv"

    Returns
    -------
    nb_parameters : int
        The total number of elements over all parameters of the model.
    """
    training_data = data_retrieval(task)
    print(f"Shape of training data set before processing: {training_data.shape} ")

    # Length of the longest SMILES; passed on to `model_type` below
    max_length_smi = retrieve_longest_smiles_from_optimal_model(task)
    print(f"Longest smiles in training data set: {max_length_smi} ")

    # Only the model name is needed here; the augmentation settings are unused
    model_name, _, _ = retrieve_optimal_model(task)
    _, network = model_type(model_name, device, ALL_SMILES_DICT, max_length_smi)
    print(f"Summary of ml model used for the prediction: {network} ")

    # Load the stored weights onto the selected device
    weights_file = f"{path_to_output}/prediction_models/{task}/model_dict.pth"
    network.load_state_dict(torch.load(weights_file, map_location=device))

    nb_parameters = 0
    for param in network.parameters():
        nb_parameters += param.numel()

    return nb_parameters

In [9]:
# Total number of parameters of the ESOL prediction model
retrieve_nb_model_parameters("ESOL")

Shape of training data set before processing: (1128, 2) 
Longest smiles in training data set: 109 
Summary of ml model used for the prediction: Convolutional1DNetwork(
  (convolution): Conv1d(48, 300, kernel_size=(10,), stride=(1,))
  (fully_connected_1): Linear(in_features=30000, out_features=100, bias=True)
  (fully_connected_out): Linear(in_features=100, out_features=1, bias=True)
) 


3144501

In [10]:
# Total number of parameters of the FreeSolv prediction model
retrieve_nb_model_parameters("FreeSolv")

Shape of training data set before processing: (642, 2) 
Longest smiles in training data set: 76 
Summary of ml model used for the prediction: Convolutional1DNetwork(
  (convolution): Conv1d(48, 300, kernel_size=(10,), stride=(1,))
  (fully_connected_1): Linear(in_features=20100, out_features=100, bias=True)
  (fully_connected_out): Linear(in_features=100, out_features=1, bias=True)
) 


2154501

In [11]:
# Total number of parameters of the lipophilicity prediction model
# NOTE(review): the task is spelled "lipophilicity" here but "Lipophilicity" in
# `load_data` — confirm which spelling each helper expects
retrieve_nb_model_parameters("lipophilicity")

Shape of training data set before processing: (4200, 2) 
Longest smiles in training data set: 268 
Summary of ml model used for the prediction: Convolutional1DNetwork(
  (convolution): Conv1d(48, 300, kernel_size=(10,), stride=(1,))
  (fully_connected_1): Linear(in_features=77700, out_features=100, bias=True)
  (fully_connected_out): Linear(in_features=100, out_features=1, bias=True)
) 


7914501