<a href="https://colab.research.google.com/github/ummagumm-a/vorontsov_ml/blob/master/Task2LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from tqdm.notebook import tqdm
import torch.nn.functional as F
import torch
import torch.optim as optim
from sklearn.model_selection import train_test_split

In [None]:
#!wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz

--2022-02-05 12:32:46--  https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1961465886 (1.8G) [application/octet-stream]
Saving to: ‘nerus_lenta.conllu.gz’


2022-02-05 12:34:11 (22.2 MB/s) - ‘nerus_lenta.conllu.gz’ saved [1961465886/1961465886]



In [None]:
from nerus import load_nerus

# Open the Nerus corpus archive. The file must already be present in the
# working directory (the wget command that fetches it is commented out above).
docs = load_nerus('nerus_lenta.conllu.gz')

In [None]:
# Seed used for the train/test split and for torch's RNG so that a
# Restart & Run All reproduces the same split and the same initial weights.
SEED = 42
# Number of corpus documents to read.
lim = 2000

# Parallel lists: sents[i] holds the lower-cased tokens of sentence i,
# tags[i] holds the POS tag of each of those tokens.
sents = []
tags = []
# zip(range(lim), docs) stops after `lim` documents without a manual counter.
for _, doc in tqdm(zip(range(lim), docs), total=lim):
    for sent in doc.sents:
        tokens = list(sent.tokens)
        sents.append([token.text.lower() for token in tokens])
        tags.append([token.pos for token in tokens])

data = list(zip(sents, tags))
train_data, test_data = train_test_split(
    data, test_size=0.2, shuffle=True, random_state=SEED
)
torch.manual_seed(SEED)

# Model hyperparameters shared by the experiment cells below.
CHAR_LEVEL_EMBEDDING_DIM = 32
EMBEDDING_DIM = 32
HIDDEN_DIM = 32

  0%|          | 0/2000 [00:00<?, ?it/s]

In [None]:
def ret_map(lst):
    """Assign consecutive integer ids to the distinct elements of nested sequences.

    Args:
        lst: iterable of iterables (e.g. a list of token lists, or a list
            containing one string, whose characters are then the elements).

    Returns:
        dict mapping each distinct inner element to an index, numbered from 0
        in order of first appearance.
    """
    index_of = {}
    for group in tqdm(lst):
        for item in group:
            # len(index_of) is evaluated before insertion, so a new item
            # receives the next free index; known items are left untouched.
            index_of.setdefault(item, len(index_of))
    return index_of
# Forward vocabularies: item -> integer index.
word_to_ix = ret_map(sents)
tag_to_ix = ret_map(tags)
# The single string is iterated character by character inside ret_map.
char_to_ix = ret_map(['абвгдеёжзийклмнопрстуфхцчшщьыъэюя.,- '])

# Inverse vocabularies: integer index -> item.
ix_to_word = dict(zip(word_to_ix.values(), word_to_ix.keys()))
ix_to_tag = dict(zip(tag_to_ix.values(), tag_to_ix.keys()))
ix_to_char = dict(zip(char_to_ix.values(), char_to_ix.keys()))

  0%|          | 0/23500 [00:00<?, ?it/s]

  0%|          | 0/23500 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
class LSTMTagger(torch.nn.Module):
    """LSTM sequence tagger over word embeddings, optionally with character features.

    A sentence (1-D LongTensor of word indices) is embedded, run through an
    LSTM as a batch of size 1, batch-normalised, and projected to per-token
    log-probabilities over the tag set.

    When ``char_emb_dim > 0`` a character-level LSTM is created and its final
    output for each word is concatenated to that word's embedding, so the
    tagger LSTM really receives ``embedding_dim + char_emb_dim`` features.
    When ``char_emb_dim == 0`` the character path is disabled entirely.

    The character path relies on the module-level names ``ix_to_word``,
    ``char_to_ix`` and ``prepare_sequence``.

    Args:
        num_layers: number of stacked LSTM layers.
        dropout: dropout value forwarded to ``torch.nn.LSTM``.
        embedding_dim: size of the word embedding vectors.
        char_emb_dim: size of the per-word character feature vector (0 disables it).
        hidden_dim: LSTM hidden size.
        vocab_size: number of rows in the word embedding table.
        char_vocab_size: accepted for interface compatibility; currently unused
            because characters are fed to the character LSTM as scalar indices.
        tagset_size: number of output tags.
    """

    def __init__(
        self,
        num_layers,
        dropout,
        embedding_dim,
        char_emb_dim,
        hidden_dim,
        vocab_size,
        char_vocab_size,
        tagset_size
    ):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.char_emb_dim = char_emb_dim
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim)
        # Build the character LSTM only when character features are requested,
        # so the tagger LSTM's input size always matches what forward() feeds it.
        if char_emb_dim > 0:
            self.char_lstm = torch.nn.LSTM(1, char_emb_dim)
        else:
            self.char_lstm = None
        self.lstm = torch.nn.LSTM(embedding_dim + char_emb_dim,
                                  hidden_dim,
                                  num_layers=num_layers,
                                  dropout=dropout)
        self.batch_norm = torch.nn.BatchNorm1d(hidden_dim)

        self.hidden2tag = torch.nn.Linear(hidden_dim, tagset_size)

    def _char_emb(self, word):
        """Return the character-LSTM feature vector for one word index.

        Args:
            word: integer word index; looked up in ``ix_to_word`` (a single
                space is used when the index is unknown).

        Returns:
            1-D tensor of length ``char_emb_dim``: the character LSTM output
            at the word's last character.

        Raises:
            RuntimeError: if the model was built with ``char_emb_dim == 0``.
        """
        if self.char_lstm is None:
            raise RuntimeError(
                'character features are disabled (char_emb_dim == 0)'
            )
        word = ix_to_word.get(word, ' ')
        # Each character index becomes a single float feature: (len(word), 1, 1).
        chars = prepare_sequence(word, char_to_ix).view(len(word), 1, -1).type(torch.FloatTensor)
        char_lstm_out, _ = self.char_lstm(chars)
        # Keep only the output at the final character.
        char_lstm_out = char_lstm_out.view(len(word), -1)[-1]

        return char_lstm_out

    def _char_emb_for_each_word(self, sentence):
        """Stack the character feature vectors of every word in ``sentence``.

        Returns:
            Tensor of shape (len(sentence), char_emb_dim).
        """
        per_word = [self._char_emb(word.item()).view(1, -1) for word in sentence]
        return torch.cat(per_word, dim=0)

    def forward(self, sentence):
        """Compute per-token tag log-probabilities.

        Args:
            sentence: 1-D LongTensor of word indices.

        Returns:
            Tensor of shape (len(sentence), tagset_size) with log-softmax scores.
        """
        embeds = self.word_embeddings(sentence)
        if self.char_lstm is not None:
            char_embeds = self._char_emb_for_each_word(sentence)
            embeds = torch.cat((embeds, char_embeds), dim=1)
        lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))
        lstm_out = lstm_out.view(len(sentence), -1)
        if self.training and lstm_out.size(0) < 2:
            # BatchNorm1d cannot compute batch statistics from a single row
            # and raises ValueError in training mode; for one-token sentences
            # normalise with the running statistics instead.
            lstm_out = F.batch_norm(
                lstm_out,
                self.batch_norm.running_mean,
                self.batch_norm.running_var,
                self.batch_norm.weight,
                self.batch_norm.bias,
                False,
                0.0,
                self.batch_norm.eps,
            )
        else:
            lstm_out = self.batch_norm(lstm_out)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [None]:
# Smoke test: build a single-layer tagger without character features and
# score one sample sentence.
# NOTE: prepare_sequence is defined in a later cell of this notebook; that
# cell has to be executed before this one.
model = LSTMTagger(
    num_layers=1,
    dropout=0,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=0,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
model(prepare_sequence('владимир путин подписал указ'.split(), word_to_ix))


torch.Size([1, 1, 32])
(tensor([[[-0.1815, -0.1963, -0.1283,  0.2885, -0.0506, -0.1373, -0.0612,
          -0.2137, -0.1211, -0.0150, -0.0266, -0.0449, -0.0657,  0.0955,
          -0.2151,  0.0258,  0.0532,  0.1148, -0.0974, -0.2156, -0.0323,
          -0.0307,  0.0501,  0.0022, -0.0049,  0.1623,  0.0714, -0.0957,
           0.0005, -0.2245,  0.0555, -0.0456]]], grad_fn=<StackBackward0>), tensor([[[-0.5585, -0.2714, -0.1935,  0.4536, -0.1659, -0.3295, -0.1334,
          -0.3712, -0.3664, -0.0274, -0.0492, -0.1367, -0.1448,  0.1830,
          -0.4336,  0.1374,  0.1699,  0.2477, -0.2210, -0.6391, -0.0972,
          -0.0836,  0.0878,  0.0099, -0.0148,  0.3259,  0.1221, -0.1960,
           0.0011, -0.3218,  0.0944, -0.1019]]], grad_fn=<StackBackward0>))


tensor([[-2.9502, -2.8971, -2.7784, -2.9296, -2.7739, -2.7993, -2.9435, -2.7453,
         -2.8358, -2.8090, -2.7595, -3.0376, -2.7293, -3.0440, -2.7459, -2.7698,
         -2.7066],
        [-2.8297, -2.9455, -2.7289, -2.8686, -2.7095, -2.7122, -3.0347, -2.8878,
         -2.8537, -2.9268, -2.7165, -3.0082, -2.7392, -2.9752, -2.7486, -2.8077,
         -2.7667],
        [-2.8623, -2.8829, -2.6673, -2.7777, -2.8344, -2.7539, -3.0727, -2.9762,
         -2.8701, -2.9392, -2.7511, -3.0859, -2.5838, -2.8990, -2.7067, -2.8867,
         -2.7595],
        [-2.8013, -2.8646, -2.7331, -2.7776, -2.7659, -2.7289, -3.0113, -3.0060,
         -2.8004, -2.8716, -2.8733, -3.0274, -2.7008, -2.9198, -2.7005, -2.9133,
         -2.7613]], grad_fn=<LogSoftmaxBackward0>)

In [None]:
def prepare_sequence(seq, to_ix, unk_ix=-1):
    """Convert a sequence of items into a 1-D LongTensor of their indices.

    Args:
        seq: iterable of hashable items (words, tags, or the characters of a
            string).
        to_ix: mapping from item to integer index.
        unk_ix: index substituted for items missing from ``to_ix``. The
            default of -1 preserves the previous behaviour; pass the index of
            a real "unknown" entry when out-of-vocabulary items are expected,
            since -1 is not a usable row for an embedding lookup.

    Returns:
        torch.LongTensor of shape (len(seq),).
    """
    idxs = [to_ix.get(item, unk_ix) for item in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [None]:
def train(model, epochs=1, lr=0.1):
    """Train ``model`` on ``train_data`` with SGD and NLL loss, one sentence per step.

    Args:
        model: tagger whose forward pass maps a LongTensor of word indices to
            per-token log-probabilities.
        epochs: number of passes over ``train_data`` (default 1, as before).
        lr: SGD learning rate (default 0.1, as before).
    """
    loss_function = torch.nn.NLLLoss()
    optimizer = optim.SGD(model.parameters(), lr=lr)

    # Make sure training-time layers are active even if the model was
    # previously switched to evaluation mode.
    model.train()

    for epoch in tqdm(range(epochs)):
        for sent, sent_tags in tqdm(train_data):
            # Gradients accumulate in PyTorch; clear them before each step.
            model.zero_grad()

            sentence_in = prepare_sequence([word.lower() for word in sent], word_to_ix)
            targets = prepare_sequence(sent_tags, tag_to_ix)

            tag_scores = model(sentence_in)

            loss = loss_function(tag_scores, targets)
            loss.backward()
            optimizer.step()

In [None]:
def test(model):
    """Evaluate ``model`` on ``test_data`` and print accuracy and mean loss.

    Prints token-level accuracy over all test tokens and the NLL loss averaged
    over test sentences. The model is switched to evaluation mode for the
    duration of the call and restored to its previous mode afterwards.

    Args:
        model: tagger whose forward pass maps a LongTensor of word indices to
            per-token log-probabilities.
    """
    # Defined locally: the loss object created inside train() is not visible here.
    loss_function = torch.nn.NLLLoss()
    total_loss = 0.0
    pred = []
    real = []

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for sent, sent_tags in tqdm(test_data):
                sentence_in = prepare_sequence([word.lower() for word in sent], word_to_ix)
                targets = prepare_sequence(sent_tags, tag_to_ix)

                tag_scores = model(sentence_in)

                # Accumulate (not overwrite) so the mean covers every sentence.
                total_loss += loss_function(tag_scores, targets).item()

                pred.extend(torch.argmax(tag_scores, dim=1).tolist())
                real.extend(targets.tolist())
    finally:
        model.train(was_training)

    # Guard against an empty test set instead of dividing by zero.
    acc = sum(p == r for p, r in zip(pred, real)) / len(pred) if pred else float('nan')
    loss = total_loss / len(test_data) if len(test_data) else float('nan')
    print('acc:', acc)
    print('loss:', loss)

In [None]:
# Experiment: 1 LSTM layer, no dropout.
model = LSTMTagger(
    num_layers=1,
    dropout=0,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
train(model)
test(model)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18800 [00:00<?, ?it/s]

  0%|          | 0/4700 [00:00<?, ?it/s]

acc: 0.758308855841062
loss: tensor(0.0001)


In [None]:
# Experiment: 2 stacked LSTM layers, no dropout.
model = LSTMTagger(
    num_layers=2,
    dropout=0,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
train(model)
test(model)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18800 [00:00<?, ?it/s]

  0%|          | 0/4700 [00:00<?, ?it/s]

acc: 0.7574646714619423
loss: tensor(0.0002)


In [None]:
# Experiment: 3 stacked LSTM layers, no dropout.
model = LSTMTagger(
    num_layers=3,
    dropout=0,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
train(model)
test(model)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18800 [00:00<?, ?it/s]

  0%|          | 0/4700 [00:00<?, ?it/s]

acc: 0.7183777782917409
loss: tensor(0.0002)


In [None]:
# Experiment: 1 LSTM layer, dropout 0.2.
model = LSTMTagger(
    num_layers=1,
    dropout=0.2,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
train(model)
test(model)

  "num_layers={}".format(dropout, num_layers))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18800 [00:00<?, ?it/s]

  0%|          | 0/4700 [00:00<?, ?it/s]

acc: 0.7631773712329718
loss: tensor(0.0001)


In [None]:
# Experiment: 1 LSTM layer, dropout 0.5.
model = LSTMTagger(
    num_layers=1,
    dropout=0.5,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
train(model)
test(model)

  "num_layers={}".format(dropout, num_layers))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18800 [00:00<?, ?it/s]

  0%|          | 0/4700 [00:00<?, ?it/s]

acc: 0.7562504336563591
loss: tensor(0.0001)


In [None]:
# Experiment: 2 stacked LSTM layers, dropout 0.2.
model = LSTMTagger(
    num_layers=2,
    dropout=0.2,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
train(model)
test(model)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18800 [00:00<?, ?it/s]

  0%|          | 0/4700 [00:00<?, ?it/s]

acc: 0.72968753613803
loss: tensor(0.0002)


In [None]:
# Experiment: 2 stacked LSTM layers, dropout 0.5.
model = LSTMTagger(
    num_layers=2,
    dropout=0.5,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
train(model)
test(model)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18800 [00:00<?, ?it/s]

  0%|          | 0/4700 [00:00<?, ?it/s]

acc: 0.6889585308878969
loss: tensor(0.0002)


In [None]:
# Experiment with batch normalization: 1 LSTM layer, dropout 0.2,
# character feature size set to 0.
model = LSTMTagger(
    num_layers=1,
    dropout=0.2,
    embedding_dim=EMBEDDING_DIM,
    char_emb_dim=0,
    hidden_dim=HIDDEN_DIM,
    vocab_size=len(word_to_ix),
    char_vocab_size=len(char_to_ix),
    tagset_size=len(tag_to_ix),
)
train(model)
test(model)

  "num_layers={}".format(dropout, num_layers))


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/18800 [00:00<?, ?it/s]

  0%|          | 0/4700 [00:00<?, ?it/s]

ValueError: ignored