<a href="https://colab.research.google.com/github/zakonreal/ds_homework/blob/main/HW_DL_4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import datetime
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
data_dir = 'drive/My Drive/'
train_lang = 'en'

In [4]:
import torch
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence

class DatasetSeq(Dataset):
    def __init__(self, data_dir, train_lang='en'):
	#open file
        with open(data_dir + train_lang + '.train', 'r') as f:
            train = f.read().split('\n\n')

        # delete extra tag markup
        train = [x for x in train if not '_ ' in x]
	    #init vocabs of tokens for encoding {<str> token: <int> id}
        self.target_vocab = {} # {p: 1, a: 2, r: 3, pu: 4}
        self.word_vocab = {} # {cat: 1, sat: 2, on: 3, mat: 4, '.': 5}
        self.char_vocab = {} # {c: 1, a: 2, t: 3, ' ': 4, s: 5}
	    
        # Cat sat on mat. -> [1, 2, 3, 4, 5]
        # p    a  r  p pu -> [1, 2, 3, 1, 4]
        # chars  -> [1, 2, 3, 4, 5, 2, 3, 4]

	    #init encoded sequences lists (processed data)
        self.encoded_sequences = []
        self.encoded_targets = []
        self.encoded_char_sequences = []
        # n=1 because first value is padding
        n_word = 1
        n_target = 1
        n_char = 1
        for line in train:
            sequence = []
            target = []
            chars = []
            for item in line.split('\n'):
                if item != '':
                    word, label = item.split(' ')

                    if self.word_vocab.get(word) is None:
                        self.word_vocab[word] = n_word
                        n_word += 1
                    if self.target_vocab.get(label) is None:
                        self.target_vocab[label] = n_target
                        n_target += 1
                    for char in word:
                        if self.char_vocab.get(char) is None:
                            self.char_vocab[char] = n_char
                            n_char += 1
                    sequence.append(self.word_vocab[word])
                    target.append(self.target_vocab[label])
                    chars.append([self.char_vocab[char] for char in word])
            self.encoded_sequences.append(sequence)
            self.encoded_targets.append(target)
            self.encoded_char_sequences.append(chars)

    def __len__(self):
        return len(self.encoded_sequences)

    def __getitem__(self, index):
        return {
            'data': self.encoded_sequences[index], # [1, 2, 3, 4, 6] len=5
            'char': self.encoded_char_sequences[index],# [[1,2,3], [4,5], [1,2], [2,6,5,4], []] len=5
            'target': self.encoded_targets[index], #  (1)
        }

In [5]:
dataset = DatasetSeq(data_dir)

In [6]:
#padding
# seq1 = [1, 2, 3, 4]
# seq2 = [9, 7, 6, 4, 3, 7, 5]
# pad seq1 equal seq2
# seq1 = [1, 2, 3, 4, 0, 0, 0]
# concat(seq1, seq2) [[1, 2, 3, 4, 0, 0, 0],
#                     [9, 7, 6, 4, 3, 7, 5]]

In [7]:
def collate_fn(batch):
    data = []
    target = []
    for item in batch:
        data.append(torch.as_tensor(item['data']))
        target.append(torch.as_tensor(item['target']))
    # pad different length sequences
    data = pad_sequence(data, batch_first=True, padding_value=0)
    target = pad_sequence(target, batch_first=True, padding_value=0)

    return {'data': data, 'target': target}

In [8]:
class RNNCellPredictor(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn_cell = nn.GRUCell(emb_dim, hidden_dim)
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.hidden_dim = hidden_dim

    def forward(self, x): # B x T
        b, t = x.size()
        emb = self.word_emb(x) # B x T x Ebm_dim
        rnn_out = []
        hidden = torch.zeros((b, self.hidden_dim), device=x.device)
        for i in range(t):
            hidden = self.gru_cell(emb[:, i, :], # emb[:, i, :]: B x Emb_dim
                                   hidden) # hidden: B x Hid_dim
            rnn_out.append(hidden.unsqueeze(1)) # B x 1 x Hid_dim
        rnn_out = torch.cat(rnn_out, dim=1) # B x T x Hid_dim

        return self.clf(self.do(rnn_out))


In [9]:
# #hyper params
# vocab_size = len(dataset.word_vocab) + 1
# n_classes = len(dataset.target_vocab) + 1
# n_chars = len(dataset.char_vocab) + 1
# #TODO try to use other model parameters
# emb_dim = 256
# hidden = 256
# n_epochs = 10
# cuda_device = 0
# batch_size = 100
# device = f'cuda:{cuda_device}' if cuda_device != -1 else 'cpu'

In [10]:
#hyper params
vocab_size = len(dataset.word_vocab) + 1
n_classes = len(dataset.target_vocab) + 1
n_chars = len(dataset.char_vocab) + 1
#TODO try to use other model parameters
emb_dim = 128 # поменял с 256 на 128
hidden = 256
char_hid = 64
char_emb = 32
n_epochs = 10
batch_size = 64
cuda_device = 0
batch_size = 100
device = f'cuda:{cuda_device}' if cuda_device != -1 else 'cpu'

In [11]:
class RNN_GRU(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # batch_first = False: T x B x Vec
        # batch_first = True: B x T x Vec
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True) 
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        emb = self.word_emb(x)
        hidden, _ = self.rnn(emb)

        return self.clf(self.do(hidden))
        

In [12]:
model1 = RNN_GRU(vocab_size, emb_dim, hidden, n_classes).to(device)
model1.train()
optim = torch.optim.Adam(model1.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [13]:
%%time
start1 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model1(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
   
    torch.save(model1.state_dict(), f'./rnn_chkpt_{epoch}.pth')
GRU_train_time = datetime.datetime.now() - start1
GRU_train_loss = loss.item()
print(GRU_train_time)
print(GRU_train_loss)

epoch: 0, step: 0, loss: 3.080223560333252
epoch: 0, step: 100, loss: 0.3486131727695465
epoch: 0, step: 200, loss: 0.27183297276496887
epoch: 1, step: 0, loss: 0.2454240620136261
epoch: 1, step: 100, loss: 0.1549500674009323
epoch: 1, step: 200, loss: 0.20760221779346466
epoch: 2, step: 0, loss: 0.12521789968013763
epoch: 2, step: 100, loss: 0.16640493273735046
epoch: 2, step: 200, loss: 0.12093928456306458
epoch: 3, step: 0, loss: 0.15461379289627075
epoch: 3, step: 100, loss: 0.12022522836923599
epoch: 3, step: 200, loss: 0.1174144297838211
epoch: 4, step: 0, loss: 0.12748974561691284
epoch: 4, step: 100, loss: 0.10212807357311249
epoch: 4, step: 200, loss: 0.06491125375032425
epoch: 5, step: 0, loss: 0.09487593173980713
epoch: 5, step: 100, loss: 0.060610298067331314
epoch: 5, step: 200, loss: 0.07648179680109024
epoch: 6, step: 0, loss: 0.04808083176612854
epoch: 6, step: 100, loss: 0.06961005181074142
epoch: 6, step: 200, loss: 0.0790475383400917
epoch: 7, step: 0, loss: 0.050908

In [14]:
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start11 = datetime.datetime.now()
with torch.no_grad():
    model1.eval()
    predict = model1(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    GRU_inference_time = datetime.datetime.now() - start11

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l-1] for l in labels])
print(GRU_inference_time)

['PRON', 'VERB', 'ADV', 'SCONJ', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON']
0:00:00.013239


In [15]:
class RNN_LSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # batch_first = False: T x B x Vec
        # batch_first = True: B x T x Vec
        self.rnn = nn.LSTM(emb_dim, hidden_dim, batch_first=True) 
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        emb = self.word_emb(x)
        hidden, _ = self.rnn(emb)

        return self.clf(self.do(hidden))

In [16]:
model2 = RNN_LSTM(vocab_size, emb_dim, hidden, n_classes).to(device)
model2.train()
optim = torch.optim.Adam(model2.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [17]:
%%time
start2 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model2(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
   
    torch.save(model2.state_dict(), f'./rnn_chkpt_{epoch}.pth')
LSTM_train_time = datetime.datetime.now() - start2
LSTM_train_loss = loss.item()
print(LSTM_train_time)
print(LSTM_train_loss)

epoch: 0, step: 0, loss: 2.953162670135498
epoch: 0, step: 100, loss: 0.2958807349205017
epoch: 0, step: 200, loss: 0.2459028959274292
epoch: 1, step: 0, loss: 0.2459316849708557
epoch: 1, step: 100, loss: 0.2505510747432709
epoch: 1, step: 200, loss: 0.11047572642564774
epoch: 2, step: 0, loss: 0.1543336659669876
epoch: 2, step: 100, loss: 0.1551549732685089
epoch: 2, step: 200, loss: 0.13508357107639313
epoch: 3, step: 0, loss: 0.12240509688854218
epoch: 3, step: 100, loss: 0.08197642117738724
epoch: 3, step: 200, loss: 0.08835366368293762
epoch: 4, step: 0, loss: 0.10079345852136612
epoch: 4, step: 100, loss: 0.1129704937338829
epoch: 4, step: 200, loss: 0.09497163444757462
epoch: 5, step: 0, loss: 0.06897866725921631
epoch: 5, step: 100, loss: 0.09662687033414841
epoch: 5, step: 200, loss: 0.0806700736284256
epoch: 6, step: 0, loss: 0.04874956235289574
epoch: 6, step: 100, loss: 0.07281622290611267
epoch: 6, step: 200, loss: 0.08576046675443649
epoch: 7, step: 0, loss: 0.0267908107

In [18]:
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start22 = datetime.datetime.now()
with torch.no_grad():
    model2.eval()
    predict = model2(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    LSTM_inference_time = datetime.datetime.now() - start22

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l-1] for l in labels])
print(LSTM_inference_time)

['PRON', 'VERB', 'ADV', 'ADP', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON']
0:00:00.002074


In [19]:
class RNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # batch_first = False: T x B x Vec
        # batch_first = True: B x T x Vec
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True) 
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        emb = self.word_emb(x)
        hidden, _ = self.rnn(emb)

        return self.clf(self.do(hidden))

In [20]:
model3 = RNN_LSTM(vocab_size, emb_dim, hidden, n_classes).to(device)
model3.train()
optim = torch.optim.Adam(model3.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [21]:
%%time
start3 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model3(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
   
    torch.save(model3.state_dict(), f'./rnn_chkpt_{epoch}.pth')
RNN_train_time = datetime.datetime.now() - start3
RNN_train_loss = loss.item()
print(RNN_train_time)
print(RNN_train_loss)

epoch: 0, step: 0, loss: 3.0121350288391113
epoch: 0, step: 100, loss: 0.3853045403957367
epoch: 0, step: 200, loss: 0.27589738368988037
epoch: 1, step: 0, loss: 0.21977126598358154
epoch: 1, step: 100, loss: 0.1568412035703659
epoch: 1, step: 200, loss: 0.19692200422286987
epoch: 2, step: 0, loss: 0.18353889882564545
epoch: 2, step: 100, loss: 0.17285144329071045
epoch: 2, step: 200, loss: 0.1160384863615036
epoch: 3, step: 0, loss: 0.1256759613752365
epoch: 3, step: 100, loss: 0.0927983894944191
epoch: 3, step: 200, loss: 0.11238019168376923
epoch: 4, step: 0, loss: 0.07978789508342743
epoch: 4, step: 100, loss: 0.09297657757997513
epoch: 4, step: 200, loss: 0.09826873987913132
epoch: 5, step: 0, loss: 0.06542126834392548
epoch: 5, step: 100, loss: 0.06067018583416939
epoch: 5, step: 200, loss: 0.0764235332608223
epoch: 6, step: 0, loss: 0.0869976133108139
epoch: 6, step: 100, loss: 0.047332506626844406
epoch: 6, step: 200, loss: 0.033394765108823776
epoch: 7, step: 0, loss: 0.088686

In [22]:
#example
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start33 = datetime.datetime.now()
with torch.no_grad():
    model3.eval()
    predict = model3(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    RNN_inference_time = datetime.datetime.now() - start33

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l-1] for l in labels])
print(RNN_inference_time)

['PRON', 'VERB', 'ADV', 'ADP', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON']
0:00:00.001282


In [23]:
def collate_fn1(input_data):
    data = []
    chars = []
    targets = []
    max_len = 0
    for item in input_data:
        if len(item['data']) > max_len:
            max_len = len(item['data'])
        data.append(torch.as_tensor(item['data']))
        chars.append(item['char'])
        targets.append(torch.as_tensor(item['target']))
    chars_seq = [[torch.as_tensor([0]) for _ in range(len(input_data))] for _ in range(max_len)]
    for j in range(len(input_data)):
        for i in range(max_len):
            if len(chars[j]) > i:
                chars_seq[i][j] = torch.as_tensor(chars[j][i])
    for j in range(max_len):
        chars_seq[j] = pad_sequence(chars_seq[j], batch_first=True, padding_value=0)
    data = pad_sequence(data, batch_first=True, padding_value=0)
    targets = pad_sequence(targets, batch_first=True, padding_value=0)
    return {'data': data, 'chars': chars_seq, 'target': targets}

In [24]:
class CharRNN_GRU(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        emb = self.char_emb(x) # B x T x Emb_dim
        _, out = self.rnn(emb)
        # _: B x T x Hidden 
        # out: 1 x B x Hidden

        return out.transpose(0, 1) # B x 1 x Hidden

In [25]:
class RNN_GRU_CH(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes,
                 char_vocab, char_emb, char_hidden):
        super().__init__()
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # batch_first = False: T x B x Vec
        # batch_first = True: B x T x Vec
        self.rnn = nn.GRU(emb_dim + char_hidden, hidden_dim, batch_first=True) 
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.hidden_dim = hidden_dim
        self.char_rnn = CharRNN_GRU(char_vocab, char_emb, char_hidden)

    def forward(self, x, chars):
        emb = self.word_emb(x)
        char_features = [self.char_rnn(c.to(x.device)) for c in chars]
        char_features = torch.cat(char_features, dim=1) # конкатенация по времени B x T x Char_hid
        emb = torch.cat((emb, char_features), dim=-1) # конкатенация векторов
        hidden, _ = self.rnn(emb)

        return self.clf(self.do(hidden))

In [26]:
model4 = RNN_GRU_CH(vocab_size, emb_dim, hidden, n_classes, n_chars, char_emb, char_hid).to(device)
model4.train()
optim = torch.optim.Adam(model4.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [27]:
start4 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn1,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model4(batch['data'].to(device), batch['chars'])
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
   
    torch.save(model4.state_dict(), f'./rnn_chkpt_{epoch}.pth')
GRUch_train_time = datetime.datetime.now() - start4
GRUch_train_loss = loss.item()
print(GRUch_train_time)
print(GRUch_train_loss)

epoch: 0, step: 0, loss: 3.0374703407287598
epoch: 0, step: 100, loss: 0.36379021406173706
epoch: 0, step: 200, loss: 0.14729586243629456
epoch: 1, step: 0, loss: 0.23315732181072235
epoch: 1, step: 100, loss: 0.10437831282615662
epoch: 1, step: 200, loss: 0.12685254216194153
epoch: 2, step: 0, loss: 0.09111455827951431
epoch: 2, step: 100, loss: 0.09418246150016785
epoch: 2, step: 200, loss: 0.07498889416456223
epoch: 3, step: 0, loss: 0.11670227348804474
epoch: 3, step: 100, loss: 0.09714780747890472
epoch: 3, step: 200, loss: 0.072321318089962
epoch: 4, step: 0, loss: 0.07048870623111725
epoch: 4, step: 100, loss: 0.07767422497272491
epoch: 4, step: 200, loss: 0.061972834169864655
epoch: 5, step: 0, loss: 0.06485084444284439
epoch: 5, step: 100, loss: 0.05626639351248741
epoch: 5, step: 200, loss: 0.060968827456235886
epoch: 6, step: 0, loss: 0.04947524517774582
epoch: 6, step: 100, loss: 0.049071624875068665
epoch: 6, step: 200, loss: 0.04698457568883896
epoch: 7, step: 0, loss: 0.

In [28]:
#example
#TODO modify inference for model with char input
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]
chars = [torch.tensor([dataset.char_vocab[c] for c in w]).unsqueeze(0).to(device) for w in words]

start44 = datetime.datetime.now()
with torch.no_grad():
    model4.eval()
    predict = model4(torch.tensor(tokens).unsqueeze(0).to(device), chars) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    GRUch_inference_time = datetime.datetime.now() - start44

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l-1] for l in labels])
print(GRUch_inference_time)

['PRON', 'VERB', 'ADV', 'ADP', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON']
0:00:00.006047


In [29]:
class CharRNN_LSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.LSTM(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        emb = self.char_emb(x) # B x T x Emb_dim
        _, (out, _) = self.rnn(emb)
        # _: B x T x Hidden 
        # out: 1 x B x Hidden

        return out.transpose(0, 1) # B x 1 x Hidden

In [30]:
class RNN_LSTM_CH(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes,
                 char_vocab, char_emb, char_hidden):
        super().__init__()
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # batch_first = False: T x B x Vec
        # batch_first = True: B x T x Vec
        self.rnn = nn.LSTM(emb_dim + char_hidden, hidden_dim, batch_first=True) 
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.hidden_dim = hidden_dim
        self.char_rnn = CharRNN_LSTM(char_vocab, char_emb, char_hidden)

    def forward(self, x, chars):
        emb = self.word_emb(x)
        char_features = [self.char_rnn(c.to(x.device)) for c in chars]
        char_features = torch.cat(char_features, dim=1) # конкатенация по времени B x T x Char_hid
        emb = torch.cat((emb, char_features), dim=-1) # конкатенация векторов
        hidden, _ = self.rnn(emb)

        return self.clf(self.do(hidden))

In [31]:
model5 = RNN_LSTM_CH(vocab_size, emb_dim, hidden, n_classes, n_chars, char_emb, char_hid).to(device)
model5.train()
optim = torch.optim.Adam(model5.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [32]:
start5 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn1,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model5(batch['data'].to(device), batch['chars'])
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
   
    torch.save(model5.state_dict(), f'./rnn_chkpt_{epoch}.pth')
LSTMch_train_time = datetime.datetime.now() - start5
LSTMch_train_loss = loss.item()
print(LSTMch_train_time)
print(LSTMch_train_loss)

epoch: 0, step: 0, loss: 2.922806739807129
epoch: 0, step: 100, loss: 0.3720332086086273
epoch: 0, step: 200, loss: 0.1747216135263443
epoch: 1, step: 0, loss: 0.22429560124874115
epoch: 1, step: 100, loss: 0.18093912303447723
epoch: 1, step: 200, loss: 0.14818258583545685
epoch: 2, step: 0, loss: 0.16499438881874084
epoch: 2, step: 100, loss: 0.08606958389282227
epoch: 2, step: 200, loss: 0.0834396705031395
epoch: 3, step: 0, loss: 0.09470351040363312
epoch: 3, step: 100, loss: 0.06257470697164536
epoch: 3, step: 200, loss: 0.09022928029298782
epoch: 4, step: 0, loss: 0.07597386837005615
epoch: 4, step: 100, loss: 0.06765821576118469
epoch: 4, step: 200, loss: 0.07093150913715363
epoch: 5, step: 0, loss: 0.07372043281793594
epoch: 5, step: 100, loss: 0.05874081701040268
epoch: 5, step: 200, loss: 0.08245965838432312
epoch: 6, step: 0, loss: 0.05250940099358559
epoch: 6, step: 100, loss: 0.06494863331317902
epoch: 6, step: 200, loss: 0.06947361677885056
epoch: 7, step: 0, loss: 0.05452

In [33]:
#example
#TODO modify inference for model with char input
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]
chars = [torch.tensor([dataset.char_vocab[c] for c in w]).unsqueeze(0).to(device) for w in words]

start55 = datetime.datetime.now()
with torch.no_grad():
    model5.eval()
    predict = model5(torch.tensor(tokens).unsqueeze(0).to(device), chars) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    LSTMch_inference_time = datetime.datetime.now() - start55

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l-1] for l in labels])
print(LSTMch_inference_time)

['PRON', 'VERB', 'ADV', 'SCONJ', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON']
0:00:00.007471


In [34]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim):
        super().__init__()
        self.char_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True)

    def forward(self, x):
        emb = self.char_emb(x) # B x T x Emb_dim
        _, (out, _) = self.rnn(emb)
        # _: B x T x Hidden 
        # out: 1 x B x Hidden

        return out.transpose(0, 1) # B x 1 x Hidden

In [35]:
class RNN_CH(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes,
                 char_vocab, char_emb, char_hidden):
        super().__init__()
        #TODO try to use other RNN archicetures, f.e. RNN and LSTM
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # batch_first = False: T x B x Vec
        # batch_first = True: B x T x Vec
        self.rnn = nn.RNN(emb_dim + char_hidden, hidden_dim, batch_first=True) 
        self.clf = nn.Linear(hidden_dim, n_classes)
        self.do = nn.Dropout(0.1)
        self.hidden_dim = hidden_dim
        self.char_rnn = CharRNN(char_vocab, char_emb, char_hidden)

    def forward(self, x, chars):
        emb = self.word_emb(x)
        char_features = [self.char_rnn(c.to(x.device)) for c in chars]
        char_features = torch.cat(char_features, dim=1) # конкатенация по времени B x T x Char_hid
        emb = torch.cat((emb, char_features), dim=-1) # конкатенация векторов
        hidden, _ = self.rnn(emb)

        return self.clf(self.do(hidden))

In [36]:
model6 = RNN_LSTM_CH(vocab_size, emb_dim, hidden, n_classes, n_chars, char_emb, char_hid).to(device)
model6.train()
optim = torch.optim.Adam(model6.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [37]:
start6 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn1,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model6(batch['data'].to(device), batch['chars'])
        loss = loss_func(predict.view(-1, n_classes),
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
   
    torch.save(model6.state_dict(), f'./rnn_chkpt_{epoch}.pth')
RNNch_train_time = datetime.datetime.now() - start6
RNNch_train_loss = loss.item()
print(RNNch_train_time)
print(RNNch_train_loss)

epoch: 0, step: 0, loss: 2.6681737899780273
epoch: 0, step: 100, loss: 0.2783052623271942
epoch: 0, step: 200, loss: 0.15804603695869446
epoch: 1, step: 0, loss: 0.14774030447006226
epoch: 1, step: 100, loss: 0.19393180310726166
epoch: 1, step: 200, loss: 0.14544349908828735
epoch: 2, step: 0, loss: 0.07463293522596359
epoch: 2, step: 100, loss: 0.12899021804332733
epoch: 2, step: 200, loss: 0.08633100241422653
epoch: 3, step: 0, loss: 0.1144949421286583
epoch: 3, step: 100, loss: 0.09094712883234024
epoch: 3, step: 200, loss: 0.06707315891981125
epoch: 4, step: 0, loss: 0.08500108122825623
epoch: 4, step: 100, loss: 0.06733015179634094
epoch: 4, step: 200, loss: 0.0754786804318428
epoch: 5, step: 0, loss: 0.04523622617125511
epoch: 5, step: 100, loss: 0.04777074605226517
epoch: 5, step: 200, loss: 0.034155361354351044
epoch: 6, step: 0, loss: 0.04652193933725357
epoch: 6, step: 100, loss: 0.04352507367730141
epoch: 6, step: 200, loss: 0.03464328870177269
epoch: 7, step: 0, loss: 0.068

In [38]:
#example
#TODO modify inference for model with char input
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]
chars = [torch.tensor([dataset.char_vocab[c] for c in w]).unsqueeze(0).to(device) for w in words]

start66 = datetime.datetime.now()
with torch.no_grad():
    model5.eval()
    predict = model5(torch.tensor(tokens).unsqueeze(0).to(device), chars) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    RNNch_inference_time = datetime.datetime.now() - start66

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l-1] for l in labels])
print(RNNch_inference_time)

['PRON', 'VERB', 'ADV', 'SCONJ', 'DET', 'ADJ', 'NOUN', 'CCONJ', 'VERB', 'PRON']
0:00:00.005693


In [58]:
class BidirGRU(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.GRU(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.clf = nn.Linear(hidden_dim * 2, n_classes)
        self.do = nn.Dropout(0.1)

    def forward(self, x):
        emb = self.word_emb(x) # B x T x Emb_dim
        hidden, _ = self.rnn(emb)   # B x T x Hid, B x 1 x Hid
        pred = self.clf(self.do(hidden)) # B x T x N_classes

        return pred

In [59]:
model7 = BidirGRU(vocab_size, emb_dim, hidden, n_classes).to(device)
model7.train()
optim = torch.optim.Adam(model7.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [60]:
start7 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model7(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),           
                         batch['target'].to(device).view(-1),   
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
BidirGRU_train_time = datetime.datetime.now() - start7
BidirGRU_train_loss = loss.item()
print(BidirGRU_train_time)
print(BidirGRU_train_loss)

epoch: 0, step: 0, loss: 2.906492233276367
epoch: 0, step: 100, loss: 0.22063975036144257
epoch: 0, step: 200, loss: 0.2156628668308258
epoch: 1, step: 0, loss: 0.15258242189884186
epoch: 1, step: 100, loss: 0.11334028840065002
epoch: 1, step: 200, loss: 0.09139696508646011
epoch: 2, step: 0, loss: 0.1473435014486313
epoch: 2, step: 100, loss: 0.12349286675453186
epoch: 2, step: 200, loss: 0.10069135576486588
epoch: 3, step: 0, loss: 0.0828949511051178
epoch: 3, step: 100, loss: 0.06977857649326324
epoch: 3, step: 200, loss: 0.0662541538476944
epoch: 4, step: 0, loss: 0.06941991299390793
epoch: 4, step: 100, loss: 0.06935705244541168
epoch: 4, step: 200, loss: 0.06514900922775269
epoch: 5, step: 0, loss: 0.04567907750606537
epoch: 5, step: 100, loss: 0.06725462526082993
epoch: 5, step: 200, loss: 0.0535103976726532
epoch: 6, step: 0, loss: 0.038662999868392944
epoch: 6, step: 100, loss: 0.044430047273635864
epoch: 6, step: 200, loss: 0.03158709034323692
epoch: 7, step: 0, loss: 0.03081

In [61]:
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start77 = datetime.datetime.now()
with torch.no_grad():
    model7.eval()
    predict = model7(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    BidirGRU_inference_time = datetime.datetime.now() - start77

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])
print(BidirGRU_inference_time)

['PART', 'DET', 'CCONJ', 'AUX', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']
0:00:00.001572


In [62]:
class BidirLSTM(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        # делается эмбеддинг последовательности и целиком передается в RNN
        self.rnn = nn.LSTM(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.clf = nn.Linear(hidden_dim * 2, n_classes)
        self.do = nn.Dropout(0.1)

    def forward(self, x):
        emb = self.word_emb(x) # B x T x Emb_dim
        hidden, _ = self.rnn(emb)   # B x T x Hid, B x 1 x Hid
        pred = self.clf(self.do(hidden)) # B x T x N_classes

        return pred

In [63]:
model8 = BidirLSTM(vocab_size, emb_dim, hidden, n_classes).to(device)
model8.train()
optim = torch.optim.Adam(model8.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [64]:
start8 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model8(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),           
                         batch['target'].to(device).view(-1), 
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
BidirLSTM_train_time = datetime.datetime.now() - start8
BidirLSTM_train_loss = loss.item()
print(BidirLSTM_train_time)
print(BidirLSTM_train_loss)

epoch: 0, step: 0, loss: 2.8287649154663086
epoch: 0, step: 100, loss: 0.4153827130794525
epoch: 0, step: 200, loss: 0.19193825125694275
epoch: 1, step: 0, loss: 0.10457520186901093
epoch: 1, step: 100, loss: 0.11908552050590515
epoch: 1, step: 200, loss: 0.1354791820049286
epoch: 2, step: 0, loss: 0.0984041765332222
epoch: 2, step: 100, loss: 0.15424974262714386
epoch: 2, step: 200, loss: 0.12271416932344437
epoch: 3, step: 0, loss: 0.08871078491210938
epoch: 3, step: 100, loss: 0.07945622503757477
epoch: 3, step: 200, loss: 0.09227223694324493
epoch: 4, step: 0, loss: 0.054314859211444855
epoch: 4, step: 100, loss: 0.052780892699956894
epoch: 4, step: 200, loss: 0.05633893609046936
epoch: 5, step: 0, loss: 0.04962976649403572
epoch: 5, step: 100, loss: 0.061578765511512756
epoch: 5, step: 200, loss: 0.03836958110332489
epoch: 6, step: 0, loss: 0.037247184664011
epoch: 6, step: 100, loss: 0.04315862059593201
epoch: 6, step: 200, loss: 0.020625604316592216
epoch: 7, step: 0, loss: 0.03

In [66]:
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start88 = datetime.datetime.now()
with torch.no_grad():
    model8.eval()
    predict = model8(torch.tensor(tokens).unsqueeze(0).to(device))
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    BidirLSTM_inference_time = datetime.datetime.now() - start88

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])
print(BidirLSTM_inference_time)

['PART', 'DET', 'CCONJ', 'AUX', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']
0:00:00.002357


In [51]:
class BidirRNN(nn.Module):
    def __init__(self, vocab_size, emb_dim, hidden_dim, n_classes):
        super().__init__()
        self.word_emb = nn.Embedding(vocab_size, emb_dim)
        self.rnn = nn.RNN(emb_dim, hidden_dim, batch_first=True, bidirectional=True)
        self.clf = nn.Linear(hidden_dim * 2, n_classes)
        self.do = nn.Dropout(0.1)

    def forward(self, x):
        emb = self.word_emb(x) # B x T x Emb_dim
        hidden, _ = self.rnn(emb)   # B x T x Hid, B x 1 x Hid
        pred = self.clf(self.do(hidden)) # B x T x N_classes

        return pred

In [52]:
model9 = BidirRNN(vocab_size, emb_dim, hidden, n_classes).to(device)
model9.train()
optim = torch.optim.Adam(model9.parameters(), lr=0.001)
loss_func = nn.CrossEntropyLoss()

In [54]:
start9 = datetime.datetime.now()
for epoch in range(n_epochs):
    dataloader = DataLoader(dataset, 
                            batch_size, 
                            shuffle=True, 
                            collate_fn=collate_fn,
                            drop_last = True,
                            )
    for i, batch in enumerate(dataloader):
        optim.zero_grad()

        predict = model9(batch['data'].to(device))
        loss = loss_func(predict.view(-1, n_classes),          
                         batch['target'].to(device).view(-1),   
                         )
        loss.backward()
        optim.step()
        if i % 100 == 0:
            print(f'epoch: {epoch}, step: {i}, loss: {loss.item()}')
BidirRNN_train_time = datetime.datetime.now() - start9
BidirRNN_train_loss = loss.item()
print(BidirRNN_train_time)
print(BidirRNN_train_loss)

epoch: 0, step: 0, loss: 2.954439878463745
epoch: 0, step: 100, loss: 0.26842403411865234
epoch: 0, step: 200, loss: 0.17197299003601074
epoch: 1, step: 0, loss: 0.195296049118042
epoch: 1, step: 100, loss: 0.16741614043712616
epoch: 1, step: 200, loss: 0.15104952454566956
epoch: 2, step: 0, loss: 0.08822799474000931
epoch: 2, step: 100, loss: 0.14336693286895752
epoch: 2, step: 200, loss: 0.14464113116264343
epoch: 3, step: 0, loss: 0.10332568734884262
epoch: 3, step: 100, loss: 0.06360110640525818
epoch: 3, step: 200, loss: 0.10531620681285858
epoch: 4, step: 0, loss: 0.07737813144922256
epoch: 4, step: 100, loss: 0.06274492293596268
epoch: 4, step: 200, loss: 0.07158917188644409
epoch: 5, step: 0, loss: 0.05476449429988861
epoch: 5, step: 100, loss: 0.06136023625731468
epoch: 5, step: 200, loss: 0.05160776898264885
epoch: 6, step: 0, loss: 0.03711242973804474
epoch: 6, step: 100, loss: 0.06285872310400009
epoch: 6, step: 200, loss: 0.0571368969976902
epoch: 7, step: 0, loss: 0.05482

In [55]:
phrase = 'He ran quickly after the red bus and caught it'
words = phrase.split(' ')
tokens = [dataset.word_vocab[w] for w in words]

start99 = datetime.datetime.now()
with torch.no_grad():
    model9.eval()
    predict = model9(torch.tensor(tokens).unsqueeze(0).to(device)) # 1 x T x N_classes
    labels = torch.argmax(predict, dim=-1).squeeze().cpu().detach().tolist()
    BidirRNN_inference_time = datetime.datetime.now() - start99

target_labels = list(dataset.target_vocab.keys())
print([target_labels[l] for l in labels])
print(BidirRNN_inference_time)

['PART', 'DET', 'CCONJ', 'AUX', 'ADP', 'NOUN', 'VERB', 'X', 'DET', 'PART']
0:00:00.003517


In [67]:
result = pd.DataFrame(data=[[GRU_train_time, GRU_train_loss, GRU_inference_time],
                            [LSTM_train_time, LSTM_train_loss, LSTM_inference_time],
                            [RNN_train_time, RNN_train_loss, RNN_inference_time],
                            [GRUch_train_time, GRUch_train_loss, GRUch_inference_time],
                            [LSTMch_train_time, LSTMch_train_loss, LSTMch_inference_time],
                            [RNNch_train_time, RNNch_train_loss, RNNch_inference_time],
                            [BidirGRU_train_time, BidirGRU_train_loss, BidirGRU_inference_time],
                            [BidirLSTM_train_time, BidirLSTM_train_loss, BidirLSTM_inference_time],
                            [BidirRNN_train_time, BidirRNN_train_loss, BidirRNN_inference_time]],
                      index = ['GRU','LSTM','RNN', 'GRUch','LSTMch','RNNch','Bidirectional GRU','Bidirectional LSTM','Bidirectional RNN'],
                      columns = ['Время обучения', 'Loss на обучающей выборке', 'Время инференса'])
result

Unnamed: 0,Время обучения,Loss на обучающей выборке,Время инференса
GRU,0 days 00:00:23.703363,0.050798,0 days 00:00:00.013239
LSTM,0 days 00:00:24.675970,0.050206,0 days 00:00:00.002074
RNN,0 days 00:00:24.574385,0.046103,0 days 00:00:00.001282
GRUch,0 days 00:04:35.486453,0.024748,0 days 00:00:00.006047
LSTMch,0 days 00:04:42.086672,0.058071,0 days 00:00:00.007471
RNNch,0 days 00:04:37.866263,0.04888,0 days 00:00:00.005693
Bidirectional GRU,0 days 00:00:34.523410,0.023734,0 days 00:00:00.001572
Bidirectional LSTM,0 days 00:00:40.553661,0.035056,0 days 00:00:00.002357
Bidirectional RNN,0 days 00:00:24.457324,0.040262,0 days 00:00:00.003517
