<a href="https://colab.research.google.com/github/veren4/SMILES_featurization/blob/master/LSTM_KDNuggets_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook ist based on [this tutorial](https://www.kdnuggets.com/2020/07/pytorch-lstm-text-generation-tutorial.html). The code is from their Github repo.

Open problems with this model:
* Wenn ich in dem Satz, den ich hinten reinfüttere zum Predicten, ein Zeichen habe, das im Trainings-Datensatz nicht vorkam, kriege ich einen Fehler. => Generell muss ich unknown tokens einführen.
* Ich schaue das Vokabular des ganzen Datensatzes an. Wenn ich den aber am Anfang nicht einlese, geht das nicht => Vorher bestimmen und hier nur einlesen!
* Datensatz einlesen, ohne komplett in den Cache zu laden ([Massive Dataset class](https://github.com/pytorch/text/issues/130))
* Adapt lstm size and embedding dim?

#Setup

In [1]:
import torch
import pandas as pd
from collections import Counter
from torch import nn, optim
import argparse
import numpy as np
from torch.utils.data import DataLoader

!pip install -q SmilesPE
from SmilesPE.pretokenizer import atomwise_tokenizer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import platform
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

print('Python: ', platform.python_version())
print('PyTorch: ', torch.__version__)
if(device.type == 'cuda'):
  print('Using GPU (cuda)')
else:
  print('Using CPU!')

Python:  3.6.9
PyTorch:  1.7.0+cu101
Using GPU (cuda)


#Dataset

In [4]:
#import torch
#import pandas as pd
#from collections import Counter


class Dataset(torch.utils.data.Dataset):
    def __init__(
        self,
        args,
    ):
        self.args = args
        self.words = self.load_words()
        self.uniq_words = ['UNK', 'SOL', 'EOL', 'PAD', '1', 'N', ')', 'C', 'S', '=', '4', 'O', '(', '2', '3', 'P']

        # tokenization dictionaries (numerization)
        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        # numericalize all the tokens
        #self.words_indexes = [self.word_to_index[w] for w in self.words]

        # if the token is in word_to index, then the index, otherwise 'UNK' = 0
        # [f(x) if condition else g(x) for x in sequence]
        #self.words_indexes = [self.word_to_index[w] if (w in uniq_words) else self.word_to_index['UNK'] for w in self.words]
        self.words_indexes = [self.word_to_index[w] if (w in self.uniq_words) else 0 for w in self.words]

    def load_words(self):

        #train_df = pd.read_csv('data/worded_smiles.csv')
        infile = '/content/drive/My Drive/Rostlab internship/8_KDNuggets_LSTM_Approach/data/worded_smiles.csv'
        with open(infile, "r") as file1:
            train_df = pd.read_csv(file1)
        file1.close()

        # Tokenize
        train_df['tokenized_SMILES'] = ''
        for row in range(train_df.shape[0]):
          train_df.loc[row, 'tokenized_SMILES'] = atomwise_tokenizer(train_df.loc[row, 'SMILES'])
        
        # Padding + SOL + EOL
        for row in range(train_df.shape[0]):              # ATTENTION: Are the column indexes correct?!
          actual_length = len(train_df.loc[row, 'tokenized_SMILES'])
          length_before_delimiters = self.args.sequence_length - 2

          if actual_length > length_before_delimiters:
            train_df.loc[row, 'tokenized_SMILES'] = train_df.loc[row, 'tokenized_SMILES'][:length_before_delimiters]
            train_df.loc[row, 'tokenized_SMILES'].append('EOL')
          elif actual_length < length_before_delimiters:
            temp = ['UNK']*length_before_delimiters
            shortie = train_df.loc[row, 'tokenized_SMILES']
            shortie.append('EOL')
            temp[:actual_length] = shortie
            train_df.loc[row, 'tokenized_SMILES'] = temp
          train_df.loc[row, 'tokenized_SMILES'].insert(0, 'SOL')
          

        # return the whole dataset as 1 list of tokens
        total_token_list = []
        for row in range(train_df.shape[0]):
          total_token_list.extend(train_df.loc[row, 'tokenized_SMILES'])      # can be combined with above.
        return total_token_list

    def __len__(self):
        return len(self.words_indexes) - self.args.sequence_length

    def __getitem__(self, index):
        return (
            torch.tensor(self.words_indexes[index:index+self.args.sequence_length]),        # turn into cuda tensor?
            torch.tensor(self.words_indexes[index+1:index+self.args.sequence_length+1]),
        )

In [11]:
dataset = Dataset(args)

In [12]:
dataset.args.sequence_length

4

args:\
batch_size = 3 (for now; later: try 256)\
max_epochs = 10\
sequence_length = 200

I need to pad accordingly! SOL + 200 + EOL

In [None]:
#dataloader = DataLoader(dataset, batch_size=args.batch_size)
dataloader = DataLoader(dataset, batch_size=3)

In [None]:
type(dataloader)

In [None]:
for batch, (x, y) in enumerate(dataloader):

            #optimizer.zero_grad()

            #y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            #loss = criterion(y_pred.transpose(1, 2), y)

            #state_h = state_h.detach()
            #state_c = state_c.detach()

            #loss.backward()
            #optimizer.step()

            print('batch: ', batch, '(x, y):', (x, y))

In [None]:
class MassiveDataset(Dataset):
    def __init__(self, data_path, line_to_instance, dataset_metadata):
        """
        Arguments:
            data_path:          path to file with data.
            line_to_instance:   a method converting a line of a file
                                to a dataset instance
            dataset_metadata:   information required to imitate an in-memory 
                                dataset: length, offset_dict
        """
        #self.args = args
        self.words = self.load_words()
        self.uniq_words = self.get_uniq_words()

        self.index_to_word = {index: word for index, word in enumerate(self.uniq_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.uniq_words)}

        self.words_indexes = [self.word_to_index[w] for w in self.words]
        
        
        #data_path = '/content/drive/My Drive/Rostlab internship/8_KDNuggets_LSTM_Approach/data/worded_smiles.csv'
        self.data_path = data_path
        # should be reset in __iter__
        self.data_stream = open(data_path, 'r')
        self.current_offset = 0

        self.meta = dataset_metadata
        self.line_to_instance = line_to_instance   # convert the line of the file to a dataset instance

    def __len__(self):
        return len(self.meta['length'])
        # original code:
        # return len(self.words_indexes) - self.args.sequence_length

    def __getitem__(self, line):
        offset = self.meta['offset_dict'][line]     # absolute position in the file
        self.data_stream.seek(offset)       # sets the file's current position at the offset.
        line = self.data_stream.readline()
        instance = self.line_to_instance(line)
        # reset to previous location for iteration
        self.data_stream.seek(self.current_offset)
        
        
        
        return instance
        
        # original code:
        #        return (
        #    torch.tensor(self.words_indexes[index:index+self.args.sequence_length]),
        #    torch.tensor(self.words_indexes[index+1:index+self.args.sequence_length+1]),
        #)

    def __next__(self):
        line = self.data_stream.readline()
        self.current_offset = self.data_stream.tell()
        return self.line_to_instance(line)
        
    def load_words(self):
        # return all possible tokens -> a predetermined [datastructure]
        
        
        #text = train_df['Joke'].str.cat(sep=' ')
        #return text.split(' ')
        pass
        
    def get_uniq_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts, key=word_counts.get, reverse=True)
        
       
        
    # Sould I close self.data_stream somewhere?

In [None]:
massive_dataset = MassiveDataset(
    data_path='/content/drive/My Drive/Rostlab internship/8_KDNuggets_LSTM_Approach/data/worded_smiles.csv',
    line_to_instance=bla,
    dataset_metadata=bla
)

In [None]:
    massive_dataloader = DataLoader(
        massive_dataset,
        batch_size=args.batch_size,
    )

#Model

In [5]:
#import torch
#from torch import nn

class Model(nn.Module):
    def __init__(self, dataset):
        super(Model, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3 #1

        n_vocab = len(dataset.uniq_words)       # provide predermined number here!
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.lstm_size,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2#,  #0,
            #bidirectional=True
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)

        return logits, state

    def init_state(self, sequence_length):
        return (torch.zeros(self.num_layers, sequence_length, self.lstm_size),
                torch.zeros(self.num_layers, sequence_length, self.lstm_size))
                
'''
    def get_hidden_state(self, x):
        out, (h_n, c_n) = self.lstm(embed, prev_state)      # problem: usually, I provide the entire dataset as an input here. That is not possible. => I need to extract this information at the end of the training.
        return h_n, c_n
        '''

'\n    def get_hidden_state(self, x):\n        out, (h_n, c_n) = self.lstm(embed, prev_state)      # problem: usually, I provide the entire dataset as an input here. That is not possible. => I need to extract this information at the end of the training.\n        return h_n, c_n\n        '

#Train

In [6]:
#import argparse
#import torch
#import numpy as np
#from torch import nn, optim
#from torch.utils.data import DataLoader

def train(dataset, model, args):
    model.train()       # set the model to training mode

    dataloader = DataLoader(
        dataset,
        batch_size=args.batch_size,
    )

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    for epoch in range(args.max_epochs):
        state_h, state_c = model.init_state(args.sequence_length)

        for batch, (x, y) in enumerate(dataloader):

            optimizer.zero_grad()

            y_pred, (state_h, state_c) = model(x, (state_h, state_c))
            loss = criterion(y_pred.transpose(1, 2), y)

            state_h = state_h.detach()
            state_c = state_c.detach()

            loss.backward()
            optimizer.step()

            print({ 'epoch': epoch, 'batch': batch, 'loss': loss.item() })
    
    #return state_h, state_c

def predict(dataset, model, text, next_words=100):   # use this function to output the states at all timesteps!
    words = text.split(' ')
    model.eval()

    state_h, state_c = model.init_state(len(words))

    for i in range(0, next_words):
        x = torch.tensor([[dataset.word_to_index[w] for w in words[i:]]])
        y_pred, (state_h, state_c) = model(x, (state_h, state_c))

        last_word_logits = y_pred[0][-1]
        p = torch.nn.functional.softmax(last_word_logits, dim=0).detach().numpy()
        word_index = np.random.choice(len(last_word_logits), p=p)
        words.append(dataset.index_to_word[word_index])
        
    return words

#Apply

In [13]:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--max-epochs', type=int, default=10)
parser.add_argument('--batch-size', type=int, default=256)
parser.add_argument('--sequence-length', type=int, default=150)
args = parser.parse_args("")

Change the dataset in dataset.py.\
Set max epochs, batch size, seqence length in train.py.

In [14]:
dataset = Dataset(args)
model = Model(dataset)

In [15]:
train(dataset, model, args)

{'epoch': 0, 'batch': 0, 'loss': 2.6882803440093994}
{'epoch': 0, 'batch': 1, 'loss': 2.647362470626831}
{'epoch': 0, 'batch': 2, 'loss': 2.466413736343384}
{'epoch': 0, 'batch': 3, 'loss': 2.492624044418335}
{'epoch': 0, 'batch': 4, 'loss': 1.986050009727478}
{'epoch': 0, 'batch': 5, 'loss': 1.7159440517425537}
{'epoch': 0, 'batch': 6, 'loss': 2.1603097915649414}
{'epoch': 0, 'batch': 7, 'loss': 1.3696824312210083}
{'epoch': 1, 'batch': 0, 'loss': 1.027949571609497}
{'epoch': 1, 'batch': 1, 'loss': 1.5320022106170654}
{'epoch': 1, 'batch': 2, 'loss': 0.9575214385986328}
{'epoch': 1, 'batch': 3, 'loss': 1.754170298576355}
{'epoch': 1, 'batch': 4, 'loss': 0.6899461150169373}
{'epoch': 1, 'batch': 5, 'loss': 0.7947619557380676}
{'epoch': 1, 'batch': 6, 'loss': 1.9124035835266113}
{'epoch': 1, 'batch': 7, 'loss': 1.09309983253479}
{'epoch': 2, 'batch': 0, 'loss': 0.8300540447235107}
{'epoch': 2, 'batch': 1, 'loss': 1.3994698524475098}
{'epoch': 2, 'batch': 2, 'loss': 0.8111574053764343}
{

In [16]:
print(predict(dataset, model, text='C C ( O 2 ) C'))

['C', 'C', '(', 'O', '2', ')', 'C', 'P', '4', '2', '(', 'SOL', '3', '=', ')', ')', '(', '=', 'O', 'N', '(', 'O', 'C', ')', '(', '=', 'SOL', 'N', 'O', 'O', 'C', '=', 'O', 'C', '=', 'O', 'PAD', 'S', '=', ')', ')', 'C', '(', 'O', ')', '2', 'C', '(', 'O', 'O', ')', 'O', '(', '=', '(', 'C', 'C', 'EOL', 'N', ')', 'N', 'O', 'C', '(', 'P', 'EOL', 'O', 'C', '1', 'O', 'O', ')', ')', '3', '1', 'C', 'O', 'O', 'O', ')', ')', 'C', ')', 'O', '=', '(', ')', 'O', ')', '(', ')', 'C', 'C', 'O', 'C', '(', 'C', 'C', 'C', '=', ')', ')', 'UNK', 'UNK', 'UNK', 'UNK', 'UNK']


In [None]:
state_h

NameError: ignored

In [None]:
state_h.d

In [None]:
state_c