<a href="https://colab.research.google.com/github/veren4/SMILES_featurization/blob/master/3_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook ist based on [this tutorial](https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html). The code is from the referenced Github repo.\
[Tensorboard tutorial](https://medium.com/@iamsdt/using-tensorboard-in-google-colab-with-pytorch-458f9bb95212)

Open problems with this model:
* Wenn ich in dem Satz, den ich hinten reinfüttere zum Predicten, ein Zeichen habe, das im Trainings-Datensatz nicht vorkam, kriege ich einen Fehler. => Generell muss ich unknown tokens einführen.
* Ich schaue das Vokabular des ganzen Datensatzes an. Wenn ich den aber am Anfang nicht einlese, geht das nicht => Vorher bestimmen und hier nur einlesen!

Might be useful: [pack sequence](https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_sequence.html#torch-nn-utils-rnn-pack-sequence), [pack padded sequence](https://pytorch.org/docs/stable/generated/torch.nn.utils.rnn.pack_padded_sequence.html#torch-nn-utils-rnn-pack-padded-sequence)

[How bidirectionality works in PyTorch](https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66)

#Setup

In [6]:
import torch
import pandas as pd
from collections import Counter
from torch import nn, optim
import argparse
import numpy as np
from torch.utils.data import DataLoader

!pip install -q SmilesPE
from SmilesPE.pretokenizer import atomwise_tokenizer
import pickle
import re

In [7]:
from google.colab import drive
drive.mount('/content/drive', force_remount=False)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
import platform
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print('Python: ', platform.python_version())
print('PyTorch: ', torch.__version__)
if(device.type == 'cuda'):
  print('Using GPU (cuda)')
else:
  print('Using CPU!')

Python:  3.6.9
PyTorch:  1.7.0+cu101
Using GPU (cuda)


In [9]:
%load_ext tensorboard
import tensorflow as tf
import datetime, os

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


In [10]:
#!cp drive/MyDrive/Rostlab_internship/8_KDNuggets_LSTM_Approach/LSTM_model.py .
#from LSTM_model import LSTMModel

#Dataset class

In [11]:
class Massive_Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        offsets,
        sequence_length,
        vocabulary,
        length_of_dataset=99,  # how many instances are in the dataset in total (how many rows without the header)
        data_path='/content/drive/My Drive/Rostlab_internship/8_KDNuggets_LSTM_Approach/data/CID-SMILES-first100',
    ):
        self.sequence_length = sequence_length
        self.offsets = offsets
        self.length_of_dataset = length_of_dataset
        self.data_path = data_path

        self.data_stream = open(self.data_path, 'r')
        self.current_offset = 0
        self.vocabulary = vocabulary
        self.word_to_index = {word: index for index, word in enumerate(self.vocabulary)}

    def __len__(self):
        return self.length_of_dataset

    def line_to_instance(self, line):
        tokenized_SMILES = atomwise_tokenizer(line)
        tokenized_SMILES.insert(0, 'SOL')
        tokenized_SMILES.append('EOL')
        tokenized_SMILES = [self.word_to_index[w] if (w in self.vocabulary) else self.word_to_index['UNK'] for w in tokenized_SMILES]
        return tokenized_SMILES

    def __getitem__(self, line):
        offset = self.offsets[line]        
        self.data_stream.seek(offset)
        line = self.data_stream.readline()
        line = line.rstrip("\n")
        match = re.match(pattern = "^\d+\\t(.+)$", string = line)
        if match:
          line = match.group(1)
        else:
          raise ValueError('I couldn\'t find a SMILES in a line of the dataset. :(')

        instance = self.line_to_instance(line)
        label_instance = instance[1:]
        label_instance.append(0)
        
        # Padding with 'PAD' --> mapping to vocab
        instance += [self.word_to_index['PAD']] * (self.sequence_length - len(instance))
        label_instance += [self.word_to_index['PAD']] * (self.sequence_length - len(label_instance))
        
        return (torch.cuda.LongTensor(instance), torch.cuda.LongTensor(label_instance))

#Model

[GRU example](https://towardsdatascience.com/understanding-bidirectional-rnn-in-pytorch-5bd25a5dd66)

In [12]:
class LSTMModel(nn.Module):
    def __init__(self, vocabulary_length, sequence_length, padding_index, lstm_size=128, embedding_dim=128, num_layers=3):
        super(LSTMModel, self).__init__()

        self.vocabulary_length = vocabulary_length
        self.lstm_size = lstm_size
        self.embedding_dim = embedding_dim
        self.num_layers = num_layers
        self.sequence_length = sequence_length
        self.padding_index = padding_index

        self.embedding = nn.Embedding(
            num_embeddings=self.vocabulary_length,
            embedding_dim=self.embedding_dim,
            padding_idx=self.padding_index
        )

        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
            bidirectional=False
        )

        self.fc = nn.Linear(self.lstm_size, self.vocabulary_length)

    def forward(self, x, prev_state):
        embedding = self.embedding(x)

        output, state = self.lstm(embedding, prev_state)  # input_size, hidden_size, num_layers
        logits = self.fc(output)

        return logits, state

    def init_state(self):
      return (torch.rand(self.num_layers, self.sequence_length, self.lstm_size, device='cuda'),
              torch.rand(self.num_layers, self.sequence_length, self.lstm_size, device='cuda'))

#Metadata

In [9]:
with open('/content/drive/My Drive/Rostlab_internship/8_KDNuggets_LSTM_Approach/data/vocabulary_list_CID_SMILES_complete.pkl', 'rb') as fid:
     vocabulary_list = pickle.load(fid)

* Number of lines
* regex: cut away line number + tabs/spaces
* byte offset 

wc -l CID-SMILES\
108826964 CID-SMILES

108 826 964   lines are in that file.

In [13]:
with open('/content/drive/My Drive/Rostlab_internship/8_KDNuggets_LSTM_Approach/data/offset_list_CID-first100.pkl', 'rb') as fid:
     offset_list = pickle.load(fid)

#with open('/content/drive/My Drive/Rostlab_internship/8_KDNuggets_LSTM_Approach/data/offset_list.pkl', 'rb') as fid:
#     offset_list = pickle.load(fid)

#with open('/content/drive/My Drive/Rostlab_internship/8_KDNuggets_LSTM_Approach/data/offset_list_worded_smiles_no_header.pkl', 'rb') as fid:
#     offset_list = pickle.load(fid)

In [14]:
dict_offset = { i : offset_list[i] for i in range(0, len(offset_list) ) }

#Model Parameters

In [15]:
special_tokens = ['UNK', 'SOL', 'EOL', 'PAD']
#vocabulary = special_tokens + vocabulary_list
# for the first 100 SMILES:
vocabulary = special_tokens + ['(', ')', 'O', 'S', '1', '#', '6', '=', 'C', 'N', '7', '[N+]', '2', '4', 'P', '[O-]', '3', '5', 'Cl']
max_sequence_length = 150
padding_index = 999
#bidirectional = False

#Training

In [21]:
dataset = Massive_Dataset(dict_offset, max_sequence_length, vocabulary)

LSTM_options = {
  'vocabulary_length': len(vocabulary),
  'sequence_length': max_sequence_length,
  'lstm_size': 128,
  'num_layers': 1,
  'embedding_dim': 1024,
  'padding_index': dataset.word_to_index['PAD']
}

model = LSTMModel(**LSTM_options)
model.cuda()  # I can call this, as my model class inherits it from nn.Module

print(f"Dataset contains {len(dataset)} samples.")

Dataset contains 99 samples.


  "num_layers={}".format(dropout, num_layers))


In [22]:
training_options = {
  'batch_size': 3,
  'learning_rate': 0.001,
  'max_epochs': 10
}

model.train()

dataloader = DataLoader(dataset, batch_size=training_options['batch_size'])
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=training_options['learning_rate'])

state_h, state_c = model.init_state()

[Pytorch tutorial](https://pytorch.org/tutorials/recipes/recipes/tensorboard_with_pytorch.html)\
[Towards Data Science](https://towardsdatascience.com/pytorch-performance-analysis-with-tensorboard-7c61f91071aa)\
[Medium: Tensorboard in Colab](https://medium.com/@iamsdt/using-tensorboard-in-google-colab-with-pytorch-458f9bb95212)

In [23]:
logs_base_dir = "runs"
#logs_base_dir = "runs/unidirectional"
os.makedirs(logs_base_dir, exist_ok=True)

from torch.utils.tensorboard import SummaryWriter
writer = SummaryWriter()

In [None]:
cell_states = {}
hidden_states = {}

for epoch in range(options['max_epochs']):
  for batch, (x, y) in enumerate(dataloader):
    optimizer.zero_grad()
    
    y_pred, (state_h, state_c) = model(x, (state_h, state_c))
    loss = criterion(y_pred.transpose(1, 2), y)
    writer.add_scalar("Loss", loss, epoch)

    state_h = state_h.detach()
    state_c = state_c.detach()

    loss.backward()
    optimizer.step()

    print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })

writer.flush() # Call flush() method to make sure that all pending events have been written to disk.

num_layers=1,   max_sequence_length=150,   lstm_size=128\
batch_size

#Sonstiges

In [None]:
x.shape

torch.Size([3, 150])

In [None]:
state_h.shape # chop off padding to get actual sequence length

torch.Size([1, 150, 128])

In [None]:
%tensorboard --logdir=runs

UsageError: Line magic function `%tensorboard` not found.


In [None]:
model.eval()

# embedding per token

LSTMModel(
  (embedding): Embedding(23, 1024, padding_idx=3)
  (lstm): LSTM(1024, 128, dropout=0.2)
  (fc): Linear(in_features=128, out_features=23, bias=True)
)

There is no need to save the tensors as e.g. ndarrays, as they will be used as input to another Pytorch model.

In [None]:
#hidden_states["epoch_7"]

In [None]:
!nvidia-smi       # monitoring and management capabilities of the GPU

Mon Feb  1 12:18:27 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   49C    P0    28W /  70W |   1117MiB / 15079MiB |      0%      Default |
|                               |                      |                 ERR! |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

#Export cell & hidden states

In [None]:
import pickle

with open('/content/drive/MyDrive/Rostlab internship/8_KDNuggets_LSTM_Approach/data/pickled_cell_states.pkl', 'wb') as fid:
  pickle.dump(cell_states, fid)

with open('/content/drive/MyDrive/Rostlab internship/8_KDNuggets_LSTM_Approach/data/pickled_hidden_states.pkl', 'wb') as fid2:
  pickle.dump(hidden_states, fid2)