In [1]:
!git clone https://github.com/eyalmazuz/DrugWithdrawn.git
!pip install wandb
!pip install datasets
!pip install accelerate>=0.20.1
!pip install transformers

Cloning into 'DrugWithdrawn'...
remote: Enumerating objects: 148, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 148 (delta 2), reused 0 (delta 0), pack-reused 141[K
Receiving objects: 100% (148/148), 162.51 MiB | 18.41 MiB/s, done.
Resolving deltas: 100% (63/63), done.
Updating files: 100% (117/117), done.
Collecting wandb
  Downloading wandb-0.16.0-py3-none-any.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m39.6 MB/s[0m eta [36m0:00:00[0m
Collecting GitPython!=3.1.29,>=1.0.0 (from wandb)
  Downloading GitPython-3.1.40-py3-none-any.whl (190 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.6/190.6 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
Collecting sentry-sdk>=1.0.0 (from wandb)
  Downloading sentry_sdk-1.38.0-py2.py3-none-any.whl (252 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m252.8/252.8 kB[0m [31m33.6 M

In [2]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Collecting responses<0.19 (from evaluate)
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0


In [3]:
from sklearn.metrics import average_precision_score, roc_auc_score
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, IntervalStrategy
import torch
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F
import wandb
from datasets import load_dataset
import evaluate
import pandas as pd
import numpy as np

In [4]:
metrics = ["roc_auc", "accuracy", "f1", "precision", "recall"]
loaded_metrics = {metric_name: evaluate.load(metric_name) for metric_name in metrics}

Downloading builder script:   0%|          | 0.00/9.54k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

In [5]:
split_type = 'db_agree_no_dups'
dataset_name = 'NCATS'

In [6]:
def get_dataset_path(split_type, dataset_name, split_name):
    return f'/content/DrugWithdrawn/split/{split_type}/{dataset_name}/{split_name}.csv'


dataset = load_dataset('csv', data_files={
    'train': get_dataset_path(split_type, dataset_name, 'train2'),
    'validation': get_dataset_path(split_type, dataset_name, 'val'),
    'test': get_dataset_path(split_type, dataset_name, 'test'),
})

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [7]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'smiles', 'length', 'inchikey', 'name', 'groups', 'withdrawn_class', 'source'],
        num_rows: 1883
    })
    validation: Dataset({
        features: ['Unnamed: 0', 'index', 'smiles', 'length', 'inchikey', 'name', 'groups', 'withdrawn_class', 'source'],
        num_rows: 471
    })
    test: Dataset({
        features: ['Unnamed: 0', 'index', 'smiles', 'length', 'inchikey', 'name', 'groups', 'withdrawn_class', 'source'],
        num_rows: 3601
    })
})

In [8]:
# Dataset preparation
dataset = dataset.rename_column('withdrawn_class', 'labels')\
    .remove_columns(['Unnamed: 0', 'index', 'length', 'inchikey', 'groups', 'source'])\
    .with_format('torch')

In [9]:
dataset

DatasetDict({
    train: Dataset({
        features: ['smiles', 'name', 'labels'],
        num_rows: 1883
    })
    validation: Dataset({
        features: ['smiles', 'name', 'labels'],
        num_rows: 471
    })
    test: Dataset({
        features: ['smiles', 'name', 'labels'],
        num_rows: 3601
    })
})

In [10]:
pd.DataFrame(dataset['train'])

Unnamed: 0,smiles,name,labels
0,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](C(=O)[O-])c3cc...,ticarcillin disodium,tensor(0)
1,CCN(CC)C(=O)C1CN2CCC3=CC(=C(C=C3C2CC1OC(=O)C)O...,benzquinamide,tensor(1)
2,C[C@@H]1O[C@@H](OC[C@H]2O[C@@H](OC3=C(OC4=CC(O...,rutin,tensor(0)
3,C1=CC=C(C=C1)C2=C(OC(=N2)N(CCO)CCO)C3=CC=CC=C3,ditazole,tensor(1)
4,CN1CC[C@]23c4c5ccc(O)c4O[C@H]2C(=O)CC[C@H]3[C@...,hydromorphone hydrochloride,tensor(0)
...,...,...,...
1878,O=C1NOCC1\N=C\C1=CC=C(\C=N\C2CONC2=O)C=C1,terizidone,tensor(0)
1879,CC(C)(C)NCC(O)C1=CC(Cl)=C(N)C(Cl)=C1,clenbuterol,tensor(0)
1880,[99Tc].CC(C)C1=CC=CC(C(C)C)=C1NC(=O)CN(CC(O)=O...,technetium tc-99m disofenin,tensor(0)
1881,NC1=NC(=O)N(C=C1)[C@H]1CC[C@@H](CO)O1,zalcitabine,tensor(0)


In [11]:
pretrained_path = "DeepChem/ChemBERTa-77M-MTR"
tokenizer = AutoTokenizer.from_pretrained(pretrained_path)
model = AutoModelForSequenceClassification.from_pretrained(pretrained_path, num_labels=2,
                                                           id2label={0: 'Not Withdrawn', 1:'Withdrawn'},
                                                           label2id={'Not Withdrawn': 0, 'Withdrawn': 1})

tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/17.7k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/6.96k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/8.26k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/420 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


pytorch_model.bin:   0%|          | 0.00/14.0M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight', 'classifier.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [12]:
def tokenize_function(examples):
    return tokenizer(examples["smiles"], padding="max_length", truncation=True, max_length=300)

dataset = dataset.map(tokenize_function, batched=True).remove_columns(['smiles'])

Map:   0%|          | 0/1883 [00:00<?, ? examples/s]

Map:   0%|          | 0/471 [00:00<?, ? examples/s]

Map:   0%|          | 0/3601 [00:00<?, ? examples/s]

In [13]:
# Compute metrics function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metrics_dict = {
        **loaded_metrics["f1"].compute(predictions=predictions, references=labels),
        **{'PR-AUC': average_precision_score(y_score=logits[:, 1], y_true=labels)},
        **loaded_metrics["accuracy"].compute(predictions=predictions, references=labels),
        **loaded_metrics["roc_auc"].compute(prediction_scores=logits[:, 1], references=labels),
        **loaded_metrics["precision"].compute(predictions=predictions, references=labels),
        **loaded_metrics["recall"].compute(predictions=predictions, references=labels),
    }
    return metrics_dict

In [15]:
training_args = TrainingArguments(
    output_dir=f"./results/{split_type}/{dataset_name}/{pretrained_path}",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy=IntervalStrategy.STEPS,
    save_strategy=IntervalStrategy.STEPS,
    report_to='wandb',
    run_name=f'{pretrained_path} {split_type} {dataset_name}',
    logging_steps=50,
    save_steps=50,
)

In [16]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset={'Validation': dataset["validation"], 'Test': dataset["test"]},
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

trainer.train()

<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


You're using a RobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Validation F1,Validation Pr-auc,Validation Accuracy,Validation Roc Auc,Validation Precision,Validation Recall,Test Loss,Test F1,Test Pr-auc,Test Accuracy,Test Roc Auc,Test Precision,Test Recall
50,0.6713,No log,0.033058,0.279593,0.751592,0.541086,1.0,0.016807,0.646407,0.014101,0.234373,0.767009,0.51941,0.26087,0.007246
100,0.6129,No log,0.0,0.286702,0.747346,0.543497,0.0,0.0,0.596434,0.00241,0.244201,0.770064,0.5374,0.5,0.001208
150,0.5816,No log,0.0,0.294408,0.747346,0.56059,0.0,0.0,0.568109,0.00241,0.251133,0.770064,0.546504,0.5,0.001208
200,0.5734,No log,0.0,0.307338,0.747346,0.586134,0.0,0.0,0.561887,0.00241,0.253129,0.770064,0.544522,0.5,0.001208
250,0.5314,No log,0.0,0.308517,0.747346,0.596495,0.0,0.0,0.558302,0.00241,0.254264,0.770064,0.544012,0.5,0.001208
300,0.544,No log,0.0,0.31442,0.747346,0.607453,0.0,0.0,0.559748,0.00241,0.255495,0.770064,0.543874,0.5,0.001208
350,0.5202,No log,0.0,0.318638,0.747346,0.617313,0.0,0.0,0.562083,0.0,0.255917,0.769786,0.542823,0.0,0.0
400,0.5259,No log,0.0,0.323839,0.747346,0.622493,0.0,0.0,0.563454,0.0,0.256502,0.769786,0.542866,0.0,0.0
450,0.5335,No log,0.0,0.330755,0.747346,0.628915,0.0,0.0,0.564626,0.0,0.257635,0.769786,0.543406,0.0,0.0
500,0.5455,No log,0.0,0.333986,0.747346,0.632425,0.0,0.0,0.565297,0.0,0.258186,0.769786,0.543406,0.0,0.0


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=585, training_loss=0.5559794271094167, metrics={'train_runtime': 160.5835, 'train_samples_per_second': 58.63, 'train_steps_per_second': 3.643, 'total_flos': 50516775172800.0, 'train_loss': 0.5559794271094167, 'epoch': 4.97})