In [1]:
import pandas as pd
import numpy as np

from catboost import CatBoostClassifier, CatBoostRegressor

from rdkit import DataStructs, Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs.cDataStructs import ExplicitBitVect

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, mean_squared_error

# Указание входных данных

In [2]:
!ls Pharma_datasets/

Ames_Mutagenicity.csv	    LD50_Intravenous.csv      LDLo_Subcutaneous.csv
bbbp.csv		    LD50_oral.csv	      sider.csv
bioconcentration.csv	    LD50_Subcutaneous.csv     Skin_LD50.csv
Developmental_toxicity.csv  LDLo_intraperitoneal.csv  Skin_LDLo.csv
IGC50.csv		    LDLo_Intravenous.csv      TD50_mouse.csv
LD50_intraperitoneal.csv    LDLo_oral.csv


In [3]:
input_path = 'data/example_input.csv'  # здесь csv с входными smiles, для которых будут предсказаны свойства

datasets_dir = 'Pharma_datasets'
dataset_names = [
    'Ames_Mutagenicity.csv',
    'LD50_intraperitoneal.csv',
    'LDLo_oral.csv',
    'bbbp.csv',
    'LD50_Intravenous.csv',
    'LDLo_Subcutaneous.csv',
    'bioconcentration.csv',
    'LD50_oral.csv',
    'sider.csv',
    'LD50_Subcutaneous.csv',
    'Skin_LD50.csv',
    'Developmental_toxicity.csv',
    'LDLo_intraperitoneal.csv',
    'Skin_LDLo.csv',
    'IGC50.csv',
    'LDLo_Intravenous.csv',
    'TD50_mouse.csv'
]

## Создание датасетов

Размерность датасетов:

In [4]:
input_smiles = pd.read_csv(input_path, nrows=100)

datasets_classification = {}
datasets_regression = {}

for dataset_name in dataset_names:
    delimiter = ','
    if dataset_name == 'Ames_Mutagenicity.csv':
        delimiter = '\t'
    dataset = pd.read_csv(datasets_dir + '/' + dataset_name, delimiter=delimiter)
    
    for col in dataset.columns:
        dataset = dataset[~(dataset[col].isna())]
    
    if dataset.dtypes[1] in ('float64', 'object'):
        datasets_regression[dataset_name] = dataset
    else:
        datasets_classification[dataset_name] = dataset
    print(f'{dataset_name} is loaded with shape {dataset.shape}!')

Ames_Mutagenicity.csv is loaded with shape (6512, 2)!
LD50_intraperitoneal.csv is loaded with shape (36295, 2)!
LDLo_oral.csv is loaded with shape (266, 2)!
bbbp.csv is loaded with shape (2050, 2)!
LD50_Intravenous.csv is loaded with shape (266, 2)!
LDLo_Subcutaneous.csv is loaded with shape (266, 2)!
bioconcentration.csv is loaded with shape (1057, 2)!
LD50_oral.csv is loaded with shape (266, 2)!
sider.csv is loaded with shape (1427, 8)!
LD50_Subcutaneous.csv is loaded with shape (266, 2)!
Skin_LD50.csv is loaded with shape (266, 2)!
Developmental_toxicity.csv is loaded with shape (172, 2)!
LDLo_intraperitoneal.csv is loaded with shape (266, 2)!
Skin_LDLo.csv is loaded with shape (266, 2)!
IGC50.csv is loaded with shape (1482, 2)!
LDLo_Intravenous.csv is loaded with shape (266, 2)!
TD50_mouse.csv is loaded with shape (216, 2)!


In [5]:
bbbp_df = datasets_classification['bbbp.csv']

# Вычисление признаков

Вычисление fingerprints для каждого из smiles

In [6]:
fingerprint_size = 1024

In [7]:
def smiles2fingerprint(smiles):
    try:
        molecule = Chem.MolFromSmiles(smiles)
        # return Chem.RDKFingerprint(molecule)
        return AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=1024)
    except:
        return None


def fingerprint2array(fingerprint):
    if isinstance(fingerprint, ExplicitBitVect):
        arr = np.array(fingerprint).reshape(1, -1)
    else:
        # arr = -np.ones((1, fingerprint_size))
        arr = None
    return arr


def get_data(smiles, y=None):
    fingerprints = smiles.smiles.apply(smiles2fingerprint)
    if y is not None:
        y = y[~(fingerprints.isnull())]
    fingerprints = fingerprints[~(fingerprints.isnull())]
    
    fingerprints_arr = fingerprints.apply(fingerprint2array)
    if y is not None:
        y = y[~(fingerprints_arr.isnull())]
    fingerprints_arr = fingerprints_arr[~(fingerprints_arr.isnull())]
    
    cols = []
    for i in range(fingerprint_size):
        col = fingerprints_arr.apply(lambda row: int(row[0, i]))
        cols.append(col)
        
    x = pd.concat(cols, axis=1)
    x.columns = [f'fp{i}' for i in range(fingerprint_size)]
    
    if y is not None:
        return x, y
    else:
        return x

In [8]:
bbbp_df['fingerprints'] = bbbp_df['smiles'].apply(smiles2fingerprint)

[20:55:26] Explicit valence for atom # 1 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 6 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 6 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 11 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 12 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:55:26] Explicit valence for atom # 5 N, 4, is greater than permitted


Удалим все None, так как гарантируется только канонический вид молекул в smiles в rdkit

In [9]:
bbbp_df = bbbp_df[~(bbbp_df['fingerprints'].isnull())]

In [10]:
bbbp_df['fingerprints_arr'] = bbbp_df['fingerprints'].apply(fingerprint2array)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bbbp_df['fingerprints_arr'] = bbbp_df['fingerprints'].apply(fingerprint2array)


In [11]:
bbbp_df = bbbp_df[~(bbbp_df['fingerprints_arr'].isnull())]

In [12]:
for i in range(fingerprint_size):
    bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))

  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[0, i]))
  bbbp_df[f'fp{i}'] = bbbp_df['fingerprints_arr'].apply(lambda row: int(row[

## Подготовка датасета входных smiles для моделей

In [13]:
X_test_smiles = get_data(input_smiles)

# Обучение моделей

## KNN для поиска свойств схожих молекул по fingerprint

In [16]:
X = bbbp_df['fingerprints']
y = bbbp_df['values']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [17]:
def similar_fingerprints(a, b):
    similarities = np.zeros((len(a), len(b)))
    for i in range(len(a)):
        for j in range(len(b)):
            if a.iloc[i] is not None and b.iloc[j] is not None:
                # similarities[i, j] = DataStructs.FingerprintSimilarity(a.iloc[i], b.iloc[j])
                similarities[i, j] = DataStructs.DiceSimilarity(a.iloc[i], b.iloc[j])
    return similarities

In [18]:
sims = similar_fingerprints(X_val, X_train)

In [19]:
sims_thresholded = (sims * (sims > 0.7))

In [20]:
threshold = 0.7

In [21]:
X_val.shape

(408,)

In [22]:
n, m = sims.shape
y_pred = np.empty(y_val.shape)

for i in range(n):
    prob = 0
    for j in range(m):
        if sims[i, j] > threshold:
            row = sims[i] * (sims[i] > threshold)
            row /= sum(row)
            prob += row[j] * y_train.values[j]
    y_pred[i] = prob

In [23]:
roc_auc_score(y_val >= 0.5, y_pred)

0.6731770833333333

## Catboost

In [24]:
X = bbbp_df[[col for col in bbbp_df.columns if 'fp' in col]]
y = bbbp_df['values']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

### Обучение

In [25]:
clf = CatBoostClassifier(
    iterations=50,
    random_seed=42,
    # learning_rate=0.5,
    custom_loss=['AUC']
)

clf.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    verbose=False,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f159e22e3b0>

In [26]:
y_pred = clf.predict(X_val)

In [27]:
roc_auc_score(y_val, y_pred)

0.7087339743589745

### Инференс

Теперь предскажем значения для входных smiles

In [36]:
y_test = clf.predict(X_test_smiles)

In [37]:
pd.concat([input_smiles, 
           pd.DataFrame(y_test, 
                        columns=['values'])], axis=1).to_csv('data/output_catboost_bbbp.csv', 
                                                             index=False)

## Compound-protein Interaction Prediction with End-to-end Learning of Neural Networks for Graphs and Sequences

In [38]:
!git clone git@github.com:masashitsubaki/molecularGNN_smiles.git

fatal: destination path 'molecularGNN_smiles' already exists and is not an empty directory.


In [39]:
!mkdir molecularGNN_smiles/dataset/classification/dp/

mkdir: cannot create directory ‘molecularGNN_smiles/dataset/classification/dp/’: File exists


In [44]:
X = bbbp_df['smiles']
y = bbbp_df['values']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

In [45]:
pd.concat([X_train, y_train], axis=1).to_csv('molecularGNN_smiles/dataset/classification/dp/data_train.txt', 
                                             index=False, 
                                             header=None,
                                             sep=' ')

In [46]:
pd.concat([X_val, y_val], axis=1).to_csv('molecularGNN_smiles/dataset/classification/dp/data_test.txt', 
                                             index=False, 
                                             header=None,
                                             sep=' ')

Данная модель GNN обучается в соответствии с кодом в репозитории, производится лишь добавление датасетов в папку проекта (выше). Ссылка на репозиторий [тут](https://github.com/masashitsubaki/molecularGNN_smiles).

## DeepChem -- ChemBert transfer learning

In [47]:
!git clone git@github.com:seyonechithrananda/bert-loves-chemistry.git

fatal: destination path 'bert-loves-chemistry' already exists and is not an empty directory.


In [48]:
!mv bert-loves-chemistry/chemberta chemberta

mv: cannot stat 'bert-loves-chemistry/chemberta': No such file or directory


In [49]:
!ls

1YPr9QIOU_JKYBbeZcTccJx56_Qq_kSIq  molecularGNN_smiles		 vocab.txt.1
bert-loves-chemistry		   output_Ames_Mutagenicity.csv  vocab.txt.2
cache_dir			   output_key			 vocab.txt.3
catboost_info			   Pharma_datasets		 vocab.txt.4
chemberta			   README.md			 vocab.txt.5
data				   requirements.txt		 vocab.txt.6
main.ipynb			   runs
main.py				   vocab.txt


In [50]:
import torch

In [51]:
import os

import numpy as np
import pandas as pd

from typing import List

# import molnet loaders from deepchem
from deepchem.molnet import load_bbbp, load_clearance, load_clintox, load_delaney, load_hiv, load_qm7, load_tox21
from rdkit import Chem


# import MolNet dataloder from bert-loves-chemistry fork
from chemberta.utils.molnet_dataloader import load_molnet_dataset, write_molnet_dataset_for_chemprop

2023-04-30 20:58:01.150005: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-04-30 20:58:01.218754: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
No GPU/TPU found, falling back to CPU. (Set TF_CPP_MIN_LOG_LEVEL=0 and rerun for more info.)


In [52]:
!wget https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/vocab.txt

--2023-04-30 20:58:04--  https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/vocab.txt
Resolving deepchemdata.s3-us-west-1.amazonaws.com (deepchemdata.s3-us-west-1.amazonaws.com)... 52.219.216.2
Connecting to deepchemdata.s3-us-west-1.amazonaws.com (deepchemdata.s3-us-west-1.amazonaws.com)|52.219.216.2|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3524 (3.4K) [text/plain]
Saving to: ‘vocab.txt.7’


2023-04-30 20:58:05 (220 MB/s) - ‘vocab.txt.7’ saved [3524/3524]



In [54]:
X = bbbp_df['smiles']
y = bbbp_df['values']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0, stratify=y)

train_df = pd.concat([X_train, y_train], axis=1)
test_df = pd.concat([X_val, y_val], axis=1)

In [55]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs

model = ClassificationModel('roberta', 'seyonec/PubChem10M_SMILES_BPE_396_250', args={'evaluate_each_epoch': True, 'evaluate_during_training_verbose': True, 'no_save': True, 'num_train_epochs': 10, 'auto_weights': True}) # You can set class weights by using the optional weight argument

Some weights of the model checkpoint at seyonec/PubChem10M_SMILES_BPE_396_250 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'lm_head.decoder.bias', 'roberta.pooler.dense.bias', 'lm_head.decoder.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at seyonec/PubChem10M_SMILES_BPE

In [None]:
model.train_model(train_df,
                  eval_df=test_df,
                  output_dir='./BPE_PubChem_10M_ClinTox_run',)

# Обучение моделей для всех датасетов

Регрессия

In [57]:
for key, value in datasets_regression.items():
    print(key)

LD50_intraperitoneal.csv
LDLo_oral.csv
LD50_Intravenous.csv
LDLo_Subcutaneous.csv
bioconcentration.csv
LD50_oral.csv
LD50_Subcutaneous.csv
Skin_LD50.csv
LDLo_intraperitoneal.csv
Skin_LDLo.csv
IGC50.csv
LDLo_Intravenous.csv
TD50_mouse.csv


In [58]:
def regress(dataset):
    X = dataset.iloc[:, 0]
    y = dataset.iloc[:, 1:]
    X, y = get_data(pd.DataFrame(X), pd.DataFrame(y))
    
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
    
    reg = CatBoostRegressor(
        iterations=50,
        random_seed=0
    )

    reg.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        verbose=False,
        plot=False
    )
    
    y_pred = reg.predict(X_val)
    score = mean_squared_error(y_val, y_pred, squared=False)
    return reg, score

In [59]:
for key, value in datasets_regression.items():
    try:
        reg, score = regress(value)
        print(key, score)
        y_test_smiles = reg.predict(X_test_smiles)
        y_test_smiles.to_csv('output_' + key, index=False, header=None)
    except:
        continue

[20:58:26] Explicit valence for atom # 6 N, 4, is greater than permitted
[20:58:26] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 21 Br, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 20 Br, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 45 Cl, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 3 N, 4, is greater than permitted
[20:58:26] Explicit valence for atom # 14 Br, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 30 Br, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 5 Cl, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:58:26] Explicit valence for atom # 38 Cl, 2, is greater than permitted
[20:58:26] Explicit valence fo

LD50_intraperitoneal.csv 0.5466331112385991


[20:59:22] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:22] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:22] Explicit valenc

LDLo_oral.csv 0.9497499880931344


[20:59:24] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:24] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:24] Explicit valenc

LD50_Intravenous.csv 0.9497499880931344


[20:59:25] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:25] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:25] Explicit valenc

LDLo_Subcutaneous.csv 0.9497499880931344


[20:59:26] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:26] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:26] Explicit valenc

LD50_oral.csv 0.9497499880931344


[20:59:27] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:27] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:27] Explicit valenc

LD50_Subcutaneous.csv 0.9497499880931344


[20:59:28] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:28] Explicit valenc

Skin_LD50.csv 0.9497499880931344


[20:59:28] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:28] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:28] Explicit valenc

LDLo_intraperitoneal.csv 0.9497499880931344


[20:59:30] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:30] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:30] Explicit valenc

Skin_LDLo.csv 0.9497499880931344


[20:59:31] Explicit valence for atom # 10 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 17 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 0 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 24 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 18 Br, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 11 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 22 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 21 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 20 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 25 Br, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 27 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 28 Cl, 2, is greater than permitted
[20:59:31] Explicit valence for atom # 19 Cl, 2, is greater than permitted
[20:59:31] Explicit valenc

LDLo_Intravenous.csv 0.9497499880931344


Метрики (RMSE):

LD50_intraperitoneal.csv 0.5466331112385991

LDLo_oral.csv 0.9497499880931344

LD50_Intravenous.csv 0.9497499880931344

LDLo_Subcutaneous.csv 0.9497499880931344

LD50_Subcutaneous.csv 0.9497499880931344

LD50_oral.csv 0.9497499880931344

Skin_LD50.csv 0.9497499880931344

LDLo_intraperitoneal.csv 0.9497499880931344

Skin_LDLo.csv 0.9497499880931344

LDLo_Intravenous.csv 0.9497499880931344

Классификация

In [60]:
for key, value in datasets_classification.items():
    print(key)

Ames_Mutagenicity.csv
bbbp.csv
sider.csv
Developmental_toxicity.csv


In [61]:
def classify(dataset):
    X = dataset.iloc[:, 0]
    y = dataset.iloc[:, 1:]
    X, y = get_data(pd.DataFrame(X), pd.DataFrame(y))
    
    # y = dataset.iloc[:, 1:]
    stratify = y if y.shape[1] == 1 else None
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0,
                                                      stratify=stratify
                                                     )
    
    clf = CatBoostClassifier(
        iterations=50,
        random_seed=0,
        custom_loss=['AUC']
    )

    clf.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        verbose=False,
        plot=False
    )
    
    y_pred = clf.predict(X_val)
    score = roc_auc_score(y_val, y_pred)
    return clf, score

In [62]:
for key, value in datasets_classification.items():
    try:
        clf, score = classify(value)
        print(key, score)
        y_test_smiles = clf.predict(X_test_smiles)
        pd.DataFrame(y_test_smiles).to_csv('output_' + key, index=False, header=None)
    except:
        continue

[20:59:32] SMILES Parse Error: syntax error while parsing: NNC(=O)CNC(=O)\C=N\#N
[20:59:32] SMILES Parse Error: Failed parsing SMILES 'NNC(=O)CNC(=O)\C=N\#N' for input: 'NNC(=O)CNC(=O)\C=N\#N'
[20:59:32] SMILES Parse Error: syntax error while parsing: O=C1NC(=O)\C(=N/#N)\C=N1
[20:59:32] SMILES Parse Error: Failed parsing SMILES 'O=C1NC(=O)\C(=N/#N)\C=N1' for input: 'O=C1NC(=O)\C(=N/#N)\C=N1'
[20:59:32] SMILES Parse Error: syntax error while parsing: NC(=O)CNC(=O)\C=N\#N
[20:59:32] SMILES Parse Error: Failed parsing SMILES 'NC(=O)CNC(=O)\C=N\#N' for input: 'NC(=O)CNC(=O)\C=N\#N'
[20:59:32] SMILES Parse Error: syntax error while parsing: CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O
[20:59:32] SMILES Parse Error: Failed parsing SMILES 'CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O' for input: 'CCCCN(CC(O)C1=C\C(=N/#N)\C(=O)C=C1)N=O'
[20:59:32] SMILES Parse Error: syntax error while parsing: NC(COC(=O)\C=N/#N)C(=O)O
[20:59:32] SMILES Parse Error: Failed parsing SMILES 'NC(COC(=O)\C=N/#N)C(=O)O' for inp

Ames_Mutagenicity.csv 0.763156146179402


[20:59:43] Explicit valence for atom # 1 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 6 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 6 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 11 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 12 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 5 N, 4, is greater than permitted
[20:59:43] Explicit valence for atom # 5 N, 4, is greater than permitted


Метрики (ROC AUC):

Ames_Mutagenicity.csv 0.763156146179402

bbbp.csv 0.7227564102564104

In [64]:
from rdkit.Chem import Descriptors
import rdkit.Chem
import math


def normulize(smiles, val):
    val = val / 1000
    val = val / Descriptors.MolWt(rdkit.Chem.MolFromSmiles(smiles))
    val = - math.log10(val)
    return val

def predict_regression_values(model, filename):
    input = pd.read_csv(filename)
    predicted = model.predict(input.smiles)
    predicted['values'] = predicted.apply( lambda x: normulize(x['smiles'], x['value']), axis=1 )
    predicted['smiles', 'values'].to_csv(filename + 'output.csv')

def predict_categories_values(model, filename):
    input = pd.read_csv(filename)
    predicted = model.predict(input.smiles)
    predicted.to_csv(filename + 'output.csv')