<a href="https://colab.research.google.com/github/veren4/SMILES_featurization/blob/master/SMILES_featurization_try_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook is based on an official AllenNLP example: https://allennlp.org/tutorials \
[Second, more detailed tutorial](https://mlexplained.com/2019/01/30/an-in-depth-tutorial-to-allennlp-from-basics-to-elmo-and-bert/)

In [None]:
!pip install allennlp
!pip install SmilesPE

## Imports

In [1]:
from typing import Iterator, List, Dict         # type annotations

import numpy as np
import torch                                    # AllenNLP is built on PyTorch
import torch.optim as optim

from allennlp.common.file_utils import cached_path   # resolves a URL/path to a local (cached) file path
from allennlp.data import Instance              # training sample = Instance containing fields

In [2]:
from allennlp.data.fields import TextField, SequenceLabelField      # possible fields: http://docs.allennlp.org/v0.9.0/api/allennlp.data.fields.html

In [3]:
from allennlp.data.dataset_readers import DatasetReader     # reads a file and produces a stream of Instances

In [6]:
# Open question: should the data be loaded through AllenNLP's cached_path instead?

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and hand the application-default credentials
# to PyDrive so that `drive` can access files on Google Drive.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Google Drive id of the data file. `downloaded` is the PyDrive file object
# for that id; its text is read in the next cell via GetContentString().
file_id = '1lX3mV3DBYiMxp4xICvUt4fxRyeNSpwN7'
downloaded = drive.CreateFile({'id': file_id})

In [8]:
# Show the full text content of the Drive file (one SMILES string per line).
print(downloaded.GetContentString())

C(C(C(COP(=O)(O)O)O)O)C(=O)C(=O)O
CCC(C)C(=O)C(=O)O
CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)C(C(=O)NCCC(=O)NCCSC(=O)CC(CCC(=O)O)O)O
C1=CC(=CC(=C1)O)CCC(=O)O
COC1=CC2=C(C=CN=C2C=C1)C(C3CC4CCN3CC4(C=C)O)O
C(CC(=O)O)C(=O)CC(=O)O
CC(C)(COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)C(C(=O)NCCC(=O)NCCSC(=O)CC(=O)CCC(=O)O)O
C(C(C(C(=O)C(C(=O)O)O)O)O)O
CC(=O)CC(=O)O
C(C1C(C(C(C(O1)O)O)O)OC2C(C(=O)C(C(O2)CO)O)O)O
C(C(=O)C(=O)O)S
CC(=CC(=O)SCCNC(=O)CCNC(=O)C(C(C)(C)COP(=O)(O)OP(=O)(O)OCC1C(C(C(O1)N2C=NC3=C(N=CN=C32)N)O)OP(=O)(O)O)O)C
CC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCCC(=CCC1=C(C=CC(=C1)C(=O)O)O)C)C)C)C)C)C)C)C
C1=CC(=CC(=C1)O)C=O





In [9]:
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer     # Tokenindexer: rule for how to turn a token into indices
from allennlp.data.tokenizers import Token

# The Tokenizer I used before:
from SmilesPE.pretokenizer import atomwise_tokenizer

In [11]:
from allennlp.data.vocabulary import Vocabulary     # mapping from strings -> integers

from allennlp.models import Model                   # a PyTorch Module
                                                    # input: tensors
                                                    # output: dict of tensor output (including the training loss)

from allennlp.modules.text_field_embedders import TextFieldEmbedder, BasicTextFieldEmbedder
from allennlp.modules.token_embedders import Embedding
from allennlp.modules.seq2seq_encoders import Seq2SeqEncoder, PytorchSeq2SeqWrapper
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits

from allennlp.training.metrics import CategoricalAccuracy       # for tracking accuracy on the training and validation datasets
                                                                # accuracy = (TP+TN)/(TP+FP+TN+FN)

from allennlp.data.iterators import BucketIterator      # for creating (mini?)batches

from allennlp.training.trainer import Trainer

from allennlp.predictors import SentenceTaggerPredictor # make predictions on new input

ModuleNotFoundError: ignored

Iterator: Batches the data \
Trainer: Handles training and metric recording \
(Predictor: Generates predictions from raw strings)

## Setup

Note: the line numbering must be stripped from the PubChem sample file before it is used!

In [12]:
torch.manual_seed(1)   # Fix PyTorch's random seed so that results can be replicated across runs

<torch._C.Generator at 0x7f5023d59570>

DatasetReader: Extracts necessary information from data into a list of Instance objects \
1. Reading the data from disk
2. Extracting relevant information from the data
3. Converting the data into a list of Instances

In [13]:
class SMILES_tokens_DatasetReader(DatasetReader):
    """Dataset reader for tagged token sequences, one sequence per line.

    Each non-blank line of the input file is expected to consist of
    whitespace-separated ``token###tag`` pairs. Every such line becomes one
    ``Instance`` with a ``"sentence"`` TextField and a ``"labels"``
    SequenceLabelField.
    """

    def __init__(self, token_indexers: Dict[str, TokenIndexer] = None):
        """
        Parameters
        ----------
        token_indexers:
            Mapping from indexer name to ``TokenIndexer``. When omitted (or
            empty), a single ``SingleIdTokenIndexer`` under the key
            ``"tokens"`` is used, i.e. one unique ID per distinct token.
        """
        super().__init__(lazy=False)
        self.token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}      # unique ID for each distinct token

    def text_to_instance(self, tokens: List[Token], tags: List[str] = None) -> Instance:
        """Build an ``Instance`` from tokens and, optionally, their tags.

        The ``"labels"`` field is only added when ``tags`` is given and
        non-empty, so the same method serves training (with tags) and
        prediction (without tags).
        """
        sentence_field = TextField(tokens, self.token_indexers)
        fields = {"sentence": sentence_field}

        if tags:
            label_field = SequenceLabelField(labels=tags, sequence_field=sentence_field)
            fields["labels"] = label_field

        return Instance(fields)

    def _read(self, file_path: str) -> Iterator[Instance]:
        """Yield one ``Instance`` per non-blank line of ``file_path``.

        Raises
        ------
        ValueError
            If a non-blank line does not consist of ``token###tag`` pairs
            (raised by the tuple unpacking below).
        """
        with open(file_path) as f:
            for line in f:
                pairs = line.split()
                if not pairs:
                    # Blank (or whitespace-only) lines, e.g. trailing newlines
                    # at the end of the file, carry no sequence; without this
                    # guard the unpacking below raises ValueError.
                    continue
                sentence, tags = zip(*(pair.split("###") for pair in pairs))
                yield self.text_to_instance([Token(word) for word in sentence], list(tags))

Model: The model to be trained

In [None]:
class LstmTagger(Model):
    """Sequence tagger built from an embedder, a seq2seq encoder and a linear tag layer.

    ``Model`` is a subclass of ``torch.nn.Module``; ``forward`` takes tensors
    and returns a dict of tensors (including the loss when labels are given).
    """

    def __init__(self,
                 word_embeddings: TextFieldEmbedder,
                 encoder: Seq2SeqEncoder,
                 vocab: Vocabulary) -> None:
        """Store the sub-modules and create the projection to tag logits.

        Parameters
        ----------
        word_embeddings:
            Embedder applied to the ``sentence`` text field.
        encoder:
            Seq2seq encoder applied to the embedded tokens.
        vocab:
            Vocabulary; the size of its ``'labels'`` namespace determines the
            number of output logits per token.
        """
        super().__init__(vocab)
        self.word_embeddings = word_embeddings
        self.encoder = encoder

        # Project every encoder output vector onto one logit per label.
        encoder_dim = encoder.get_output_dim()
        num_tags = vocab.get_vocab_size('labels')
        self.hidden2tag = torch.nn.Linear(in_features=encoder_dim,
                                          out_features=num_tags)

        self.accuracy = CategoricalAccuracy()

    def forward(self,
                sentence: Dict[str, torch.Tensor],
                labels: torch.Tensor = None) -> Dict[str, torch.Tensor]:
        """Compute tag logits and, when ``labels`` is given, the loss.

        Returns a dict with ``"tag_logits"`` and, if ``labels`` is not None,
        additionally ``"loss"``. The accuracy metric is updated only when
        labels are supplied.
        """
        mask = get_text_field_mask(sentence)
        encoded = self.encoder(self.word_embeddings(sentence), mask)
        logits = self.hidden2tag(encoded)

        if labels is None:
            return {"tag_logits": logits}

        self.accuracy(logits, labels, mask)
        loss = sequence_cross_entropy_with_logits(logits, labels, mask)
        return {"tag_logits": logits, "loss": loss}

    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
        """Return the tracked accuracy as ``{"accuracy": value}``."""
        accuracy_value = self.accuracy.get_metric(reset)
        return {"accuracy": accuracy_value}


## Training

In [None]:
# Create the dataset reader defined above (the tutorial's name
# `PosDatasetReader` does not exist in this notebook).
reader = SMILES_tokens_DatasetReader()

# cached_path resolves each URL to a local file path that the reader can open.
train_dataset = reader.read(cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/training.txt'))
validation_dataset = reader.read(cached_path(
    'https://raw.githubusercontent.com/allenai/allennlp'
    '/master/tutorials/tagger/validation.txt'))

# Build the string -> integer mappings from all instances of both splits.
vocab = Vocabulary.from_instances(train_dataset + validation_dataset)

EMBEDDING_DIM = 6
HIDDEN_DIM = 6

# One embedding vector per entry of the 'tokens' vocabulary namespace; the key
# "tokens" matches the reader's default token indexer name.
token_embedding = Embedding(num_embeddings=vocab.get_vocab_size('tokens'),
                            embedding_dim=EMBEDDING_DIM)
word_embeddings = BasicTextFieldEmbedder({"tokens": token_embedding})

# Wrap the PyTorch LSTM so it can be used as an AllenNLP Seq2SeqEncoder.
lstm = PytorchSeq2SeqWrapper(torch.nn.LSTM(EMBEDDING_DIM, HIDDEN_DIM, batch_first=True))

model = LstmTagger(word_embeddings, lstm, vocab)

# Use the first GPU when available; -1 is passed to the Trainer for CPU.
if torch.cuda.is_available():
    cuda_device = 0
    model = model.cuda(cuda_device)
else:
    cuda_device = -1

optimizer = optim.SGD(model.parameters(), lr=0.1)

# Batches of two instances, sorted by the number of tokens in "sentence".
iterator = BucketIterator(batch_size=2, sorting_keys=[("sentence", "num_tokens")])

# The iterator needs the vocabulary to index the instances.
iterator.index_with(vocab)

trainer = Trainer(model=model,              # instantiate the trainer
                  optimizer=optimizer,
                  iterator=iterator,
                  train_dataset=train_dataset,
                  validation_dataset=validation_dataset,
                  patience=10,
                  num_epochs=1000,
                  cuda_device=cuda_device)

trainer.train()                             # run the trainer

# Predict tags for a raw sentence with the trained model.
predictor = SentenceTaggerPredictor(model, dataset_reader=reader)

tag_logits = predictor.predict("The dog ate the apple")['tag_logits']

# Highest-scoring label index per token, mapped back to the label strings.
tag_ids = np.argmax(tag_logits, axis=-1)

print([model.vocab.get_token_from_index(i, 'labels') for i in tag_ids])

In [None]:
# Save the trained weights and the vocabulary, then reload them into a second
# model. Without this step `model2` is undefined and the cell raises NameError.
with open("/tmp/model.th", 'wb') as f:
    torch.save(model.state_dict(), f)
vocab.save_to_files("/tmp/vocabulary")

vocab2 = Vocabulary.from_files("/tmp/vocabulary")
model2 = LstmTagger(word_embeddings, lstm, vocab2)
with open("/tmp/model.th", 'rb') as f:
    model2.load_state_dict(torch.load(f))
if cuda_device > -1:
    model2.cuda(cuda_device)

# The reloaded model must reproduce the predictions of the trained model.
predictor2 = SentenceTaggerPredictor(model2, dataset_reader=reader)
tag_logits2 = predictor2.predict("The dog ate the apple")['tag_logits']
np.testing.assert_array_almost_equal(tag_logits2, tag_logits)