<a href="https://colab.research.google.com/github/ummagumm-a/vorontsov_ml/blob/master/Task2LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install fasttext nerus

In [1]:
from tqdm.notebook import tqdm
import torch.nn.functional as F
from torch.utils.data import DataLoader
import torch
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import fasttext
import fasttext.util
import numpy as np
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence, pad_sequence

from nerus import load_nerus
from nltk.tokenize import RegexpTokenizer

In [6]:
!wget https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz

--2022-02-07 10:24:00--  https://storage.yandexcloud.net/natasha-nerus/data/nerus_lenta.conllu.gz
Resolving storage.yandexcloud.net (storage.yandexcloud.net)... 213.180.193.243, 2a02:6b8::1d9
Connecting to storage.yandexcloud.net (storage.yandexcloud.net)|213.180.193.243|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1961465886 (1.8G) [application/octet-stream]
Saving to: ‘nerus_lenta.conllu.gz.1’


2022-02-07 10:25:38 (19.3 MB/s) - ‘nerus_lenta.conllu.gz.1’ saved [1961465886/1961465886]



In [2]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
device

'cuda'

In [3]:
docs = load_nerus('nerus_lenta.conllu.gz')

In [4]:
sents = []
tags = []
#chars = {'а'}
counter = 0
lim = 50000
for doc in tqdm(docs, total=lim):
    if counter == lim:
        break
    counter += 1
    for sent in doc.sents:
        words = []
        tags_ = []
        for token in sent.tokens:
            words.append(token.text.lower())
            tags_.append(token.pos)
            #for ch in token.text:
            #    chars.add(ch)
        sents.append(words)
        tags.append(tags_)
data = list(zip(sents, tags))
train_data, test_data = train_test_split(data, test_size=0.3, shuffle=True)

  0%|          | 0/50000 [00:00<?, ?it/s]

In [5]:
def ret_map(lst):
    el_to_ix = {'<def>':0}
    for big_el in tqdm(lst):
        for el in big_el:
            if el not in el_to_ix:
                el_to_ix[el] = len(el_to_ix)

    return el_to_ix
#word_to_ix = ret_map(sents)
tag_to_ix = ret_map(tags)
#char_to_ix = ret_map([list(chars)])
#ix_to_word = {v:k for (k,v) in word_to_ix.items()}
ix_to_tag = {v:k for k,v in tag_to_ix.items()}
#ix_to_char = {v:k for k,v in char_to_ix.items()}

  0%|          | 0/572873 [00:00<?, ?it/s]

In [6]:
def prepare_sequence(seq, to_ix):
    idxs = [to_ix.get(w, 0) for w in seq]
    return torch.tensor(idxs, dtype=torch.long)

In [7]:
def train_on_batch(model, x_batch, y_batch, optimizer, loss_function):
    model.train()
    optimizer.zero_grad()
    
    output = model(x_batch.to(model.device))
    output = torch.transpose(output, 1, 2)
    
    y_batch, _ = pad_packed_sequence(y_batch, batch_first=True)

    loss = loss_function(output, y_batch.to(model.device))
    loss.backward()

    optimizer.step()
    return loss.cpu().item()

def train_epoch(train_generator, model, loss_function, optimizer, callback = None):
    epoch_loss = 0
    total = 0
    for it, (batch_of_x, batch_of_y) in enumerate(train_generator):
        batch_loss = train_on_batch(model, batch_of_x, batch_of_y, optimizer, loss_function)
        
        if callback is not None:
            with torch.no_grad():
                callback(model, batch_loss)
            
        epoch_loss += batch_loss*len(batch_of_x)
        total += len(batch_of_x)
    
    return epoch_loss/total
    
def trainer(count_of_epoch, 
            batch_size, 
            dataset,
            model, 
            loss_function,
            optimizer,
            lr = 0.001,
            callback = None):

    optima = optimizer(model.parameters(), lr=lr)
    
    iterations = tqdm(range(count_of_epoch), desc='epoch')
    iterations.set_postfix({'train epoch loss': np.nan})
    for it in iterations:
        batch_generator = tqdm(
            torch.utils.data.DataLoader(dataset=dataset, 
                                        batch_size=batch_size, 
                                        shuffle=True, pin_memory=False, 
                                        collate_fn=collate_batch), 
            leave=False, total=len(dataset)//batch_size+(len(dataset)%batch_size>0))
        
        epoch_loss = train_epoch(train_generator=batch_generator, 
                    model=model, 
                    loss_function=loss_function, 
                    optimizer=optima, 
                    callback=callback)
        
        iterations.set_postfix({'train epoch loss': epoch_loss})

In [8]:
def collate_batch(batch):
    label_list, text_list = [], []
    for (_text, _label) in batch:
        processed_text = text_pipeline(_text)
        text_list.append(processed_text)
        label_list.append(label_pipeline(_label))

    lengths = [len(el) for el in label_list]

    label_list = pack_padded_sequence(
        pad_sequence(label_list, batch_first=True),
        batch_first=True, lengths=lengths, enforce_sorted=False)
    text_list = pack_padded_sequence(
        pad_sequence(text_list, batch_first=True),
        batch_first=True, lengths=lengths, enforce_sorted=False)

    return text_list.to(device), label_list.to(device)

In [9]:
def test(model, data):
    batch_generator = torch.utils.data.DataLoader(dataset=data, 
                                              batch_size=64, 
                                              pin_memory=False,
                                              collate_fn=collate_batch)
            
    pred = []
    real = []
    model.eval()
    for it, (x_batch, y_batch) in tqdm(enumerate(batch_generator), total=int(len(data) / 64)):
        x_batch = x_batch.to(device)
        with torch.no_grad():
            output = model(x_batch)
        y_batch, _ = pad_packed_sequence(y_batch, batch_first=True)

        out = [el for sel in torch.argmax(output, dim=-1).cpu().numpy().tolist() for el in sel]
        rel = [el for sel in y_batch.cpu().numpy().tolist() for el in sel]

        pred.extend(out)
        real.extend(rel)

    print(classification_report(real, pred))

In [25]:
class LSTMTagger(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(
        self, 
        num_layers,
        dropout,
        embedding_dim, 
        hidden_dim, 
        vocab_size, 
        tagset_size,
        char_emb_dim=0, 
    ):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim) 
        self.lstm = torch.nn.LSTM(embedding_dim + char_emb_dim, 
                                  hidden_dim, 
                                  num_layers=num_layers,
                                  dropout=dropout)
        if char_emb_dim != 0:
            self.char_lstm = torch.nn.LSTM(1, char_emb_dim)

        self.hidden2tag = torch.nn.Linear(hidden_dim, tagset_size)

    def _char_emb(self, word):
        word = ix_to_word.get(word, ' ')
        chars = prepare_sequence(word, char_to_ix).view(len(word), 1, -1).type(torch.FloatTensor)
        char_lstm_out, _ = self.char_lstm(chars)
        char_lstm_out = char_lstm_out.view(len(word), -1)[-1]

        return char_lstm_out
        
    def _char_emb_for_each_word(self, sentence):
        otu = [self._char_emb(word.item()).view(1, -1) for word in sentence]
        return torch.cat(otu, dim=0)

    def _elementwise_apply(self, fn, *args):
        return torch.nn.utils.rnn.PackedSequence(fn(*[(arg.data if type(arg)==torch.nn.utils.rnn.PackedSequence else arg) for arg in args]), args[0].batch_sizes)

    def forward(self, sentence):
        embeds = self._elementwise_apply(self.word_embeddings, sentence)
        #char_embeds = self._elementwise_apply(self._char_emb_for_each_word, sentence)
        #sup_embeds = self._elementwise_apply(lambda x: torch.cat(x, dim=1), embeds, char_embeds)
        lstm_out, _ = self.lstm(embeds)
        lstm_out, output_lengths = pad_packed_sequence(lstm_out, batch_first=True)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [23]:
EMBEDDING_DIM = 64
CHAR_LEVEL_EMBEDDING_DIM = 0 
HIDDEN_DIM = 64

text_pipeline = lambda y: prepare_sequence(list(map(lambda x: x.lower(), y)), word_to_ix)
label_pipeline = lambda x: prepare_sequence(x, tag_to_ix)

In [27]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

model = LSTMTagger(
    num_layers=3,
    dropout=0.2,
    embedding_dim=EMBEDDING_DIM, 
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM, 
    vocab_size=len(word_to_ix), 
    tagset_size=len(tag_to_ix)
    )
model.to(device)

trainer(count_of_epoch=1,
        batch_size=64,
        model=model,
        dataset=train_data,
        loss_function=loss_function,
        optimizer=optimizer)
test(model, test_data)

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/12217 [00:00<?, ?it/s]

  0%|          | 0/5235 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.79      0.98      0.87  10057378
           1       0.27      0.49      0.34   1687211
           2       0.32      0.15      0.20    708707
           3       0.00      0.00      0.00    559804
           4       0.00      0.00      0.00    456850
           5       0.22      0.00      0.00    676336
           6       0.14      0.00      0.00   1067179
           7       0.00      0.00      0.00     90490
           8       0.00      0.00      0.00    148138
           9       0.17      0.00      0.00    198857
          10       0.00      0.00      0.00    135272
          11       0.00      0.00      0.00     92020
          12       0.00      0.00      0.00    121508
          13       0.00      0.00      0.00     65129
          14       0.00      0.00      0.00     41535
          15       0.00      0.00      0.00     69030
          16       0.00      0.00      0.00      1069
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

model = LSTMTagger(
    num_layers=2,
    dropout=0,
    embedding_dim=EMBEDDING_DIM, 
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM, 
    vocab_size=len(word_to_ix), 
    tagset_size=len(tag_to_ix)
    )
model.to(device)

trainer(count_of_epoch=1,
        batch_size=64,
        model=model,
        dataset=train_data,
        loss_function=loss_function,
        optimizer=optimizer)
test(model, test_data)

epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/12217 [00:00<?, ?it/s]

  0%|          | 0/5235 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.79      0.97      0.87  10057378
           1       0.27      0.51      0.35   1687211
           2       0.32      0.15      0.20    708707
           3       0.00      0.00      0.00    559804
           4       0.00      0.00      0.00    456850
           5       0.20      0.00      0.00    676336
           6       0.03      0.00      0.00   1067179
           7       0.00      0.00      0.00     90490
           8       0.08      0.00      0.00    148138
           9       0.11      0.00      0.00    198857
          10       0.02      0.00      0.00    135272
          11       0.03      0.00      0.00     92020
          12       0.00      0.00      0.00    121508
          13       0.00      0.00      0.00     65129
          14       0.00      0.00      0.00     41535
          15       0.01      0.00      0.00     69030
          16       0.00      0.00      0.00      1069
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [29]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

model = LSTMTagger(
    num_layers=1,
    dropout=0.2,
    embedding_dim=EMBEDDING_DIM, 
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM, 
    vocab_size=len(word_to_ix), 
    tagset_size=len(tag_to_ix)
    )
model.to(device)

trainer(count_of_epoch=1,
        batch_size=64,
        model=model,
        dataset=train_data,
        loss_function=loss_function,
        optimizer=optimizer)
test(model, test_data)

  "num_layers={}".format(dropout, num_layers))


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/12217 [00:00<?, ?it/s]

  0%|          | 0/5235 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.78      0.98      0.87  10057378
           1       0.27      0.44      0.34   1687211
           2       0.32      0.15      0.20    708707
           3       0.00      0.00      0.00    559804
           4       0.00      0.00      0.00    456850
           5       0.00      0.00      0.00    676336
           6       0.00      0.00      0.00   1067179
           7       0.00      0.00      0.00     90490
           8       0.00      0.00      0.00    148138
           9       0.00      0.00      0.00    198857
          10       0.00      0.00      0.00    135272
          11       0.00      0.00      0.00     92020
          12       0.00      0.00      0.00    121508
          13       0.00      0.00      0.00     65129
          14       0.00      0.00      0.00     41535
          15       0.00      0.00      0.00     69030
          16       0.00      0.00      0.00      1069
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [35]:
class LSTMTaggerBatchNorm(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(
        self, 
        num_layers,
        dropout,
        embedding_dim, 
        hidden_dim, 
        vocab_size, 
        tagset_size,
        char_emb_dim=0, 
    ):
        super(LSTMTaggerBatchNorm, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim) 
        self.lstm = torch.nn.LSTM(embedding_dim + char_emb_dim, 
                                  hidden_dim, 
                                  num_layers=num_layers,
                                  dropout=dropout)
        if char_emb_dim != 0:
            self.char_lstm = torch.nn.LSTM(1, char_emb_dim)

        self.batch_norm = torch.nn.BatchNorm1d(hidden_dim)
        self.hidden2tag = torch.nn.Linear(hidden_dim, tagset_size)

    def _char_emb(self, word):
        word = ix_to_word.get(word, ' ')
        chars = prepare_sequence(word, char_to_ix).view(len(word), 1, -1).type(torch.FloatTensor)
        char_lstm_out, _ = self.char_lstm(chars)
        char_lstm_out = char_lstm_out.view(len(word), -1)[-1]

        return char_lstm_out
        
    def _char_emb_for_each_word(self, sentence):
        otu = [self._char_emb(word.item()).view(1, -1) for word in sentence]
        return torch.cat(otu, dim=0)

    def _elementwise_apply(self, fn, *args):
        return torch.nn.utils.rnn.PackedSequence(fn(*[(arg.data if type(arg)==torch.nn.utils.rnn.PackedSequence else arg) for arg in args]), args[0].batch_sizes)

    def forward(self, sentence):
        embeds = self._elementwise_apply(self.word_embeddings, sentence)
        #char_embeds = self._elementwise_apply(self._char_emb_for_each_word, sentence)
        #sup_embeds = self._elementwise_apply(lambda x: torch.cat(x, dim=1), embeds, char_embeds)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self._elementwise_apply(self.batch_norm, lstm_out)
        lstm_out, output_lengths = pad_packed_sequence(lstm_out, batch_first=True)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [34]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

model = LSTMTaggerBatchNorm(
    num_layers=1,
    dropout=0.2,
    embedding_dim=EMBEDDING_DIM, 
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM, 
    vocab_size=len(word_to_ix), 
    tagset_size=len(tag_to_ix)
    )
model.to(device)

trainer(count_of_epoch=1,
        batch_size=64,
        model=model,
        dataset=train_data,
        loss_function=loss_function,
        optimizer=optimizer)
test(model, test_data)

  "num_layers={}".format(dropout, num_layers))


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/12217 [00:00<?, ?it/s]

  0%|          | 0/5235 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.77      0.99      0.86  10057378
           1       0.28      0.48      0.35   1687211
           2       0.00      0.00      0.00    708707
           3       0.00      0.00      0.00    559804
           4       0.00      0.00      0.00    456850
           5       0.00      0.00      0.00    676336
           6       0.17      0.00      0.00   1067179
           7       0.00      0.00      0.00     90490
           8       0.00      0.00      0.00    148138
           9       0.00      0.00      0.00    198857
          10       0.00      0.00      0.00    135272
          11       0.00      0.00      0.00     92020
          12       0.00      0.00      0.00    121508
          13       0.00      0.00      0.00     65129
          14       0.00      0.00      0.00     41535
          15       0.00      0.00      0.00     69030
          16       0.00      0.00      0.00      1069
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


In [36]:
class LSTMTaggerLayerNorm(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(
        self, 
        num_layers,
        dropout,
        embedding_dim, 
        hidden_dim, 
        vocab_size, 
        tagset_size,
        char_emb_dim=0, 
    ):
        super(LSTMTaggerLayerNorm, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = torch.nn.Embedding(vocab_size, embedding_dim) 
        self.lstm = torch.nn.LSTM(embedding_dim + char_emb_dim, 
                                  hidden_dim, 
                                  num_layers=num_layers,
                                  dropout=dropout)
        if char_emb_dim != 0:
            self.char_lstm = torch.nn.LSTM(1, char_emb_dim)

        self.layer_norm = torch.nn.LayerNorm(hidden_dim)
        self.hidden2tag = torch.nn.Linear(hidden_dim, tagset_size)

    def _char_emb(self, word):
        word = ix_to_word.get(word, ' ')
        chars = prepare_sequence(word, char_to_ix).view(len(word), 1, -1).type(torch.FloatTensor)
        char_lstm_out, _ = self.char_lstm(chars)
        char_lstm_out = char_lstm_out.view(len(word), -1)[-1]

        return char_lstm_out
        
    def _char_emb_for_each_word(self, sentence):
        otu = [self._char_emb(word.item()).view(1, -1) for word in sentence]
        return torch.cat(otu, dim=0)

    def _elementwise_apply(self, fn, *args):
        return torch.nn.utils.rnn.PackedSequence(fn(*[(arg.data if type(arg)==torch.nn.utils.rnn.PackedSequence else arg) for arg in args]), args[0].batch_sizes)

    def forward(self, sentence):
        embeds = self._elementwise_apply(self.word_embeddings, sentence)
        #char_embeds = self._elementwise_apply(self._char_emb_for_each_word, sentence)
        #sup_embeds = self._elementwise_apply(lambda x: torch.cat(x, dim=1), embeds, char_embeds)
        lstm_out, _ = self.lstm(embeds)
        lstm_out = self._elementwise_apply(self.layer_norm, lstm_out)
        lstm_out, output_lengths = pad_packed_sequence(lstm_out, batch_first=True)
        tag_space = self.hidden2tag(lstm_out)
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [37]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

model = LSTMTaggerLayerNorm(
    num_layers=1,
    dropout=0.2,
    embedding_dim=EMBEDDING_DIM, 
    char_emb_dim=CHAR_LEVEL_EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM, 
    vocab_size=len(word_to_ix), 
    tagset_size=len(tag_to_ix)
    )
model.to(device)

trainer(count_of_epoch=1,
        batch_size=64,
        model=model,
        dataset=train_data,
        loss_function=loss_function,
        optimizer=optimizer)
test(model, test_data)

  "num_layers={}".format(dropout, num_layers))


epoch:   0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/12217 [00:00<?, ?it/s]

  0%|          | 0/5235 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

           0       0.80      0.97      0.88  10057378
           1       0.26      0.50      0.34   1687211
           2       0.32      0.15      0.20    708707
           3       0.00      0.00      0.00    559804
           4       0.00      0.00      0.00    456850
           5       0.22      0.00      0.00    676336
           6       0.14      0.00      0.00   1067179
           7       0.00      0.00      0.00     90490
           8       0.00      0.00      0.00    148138
           9       0.00      0.00      0.00    198857
          10       0.00      0.00      0.00    135272
          11       0.00      0.00      0.00     92020
          12       0.00      0.00      0.00    121508
          13       0.00      0.00      0.00     65129
          14       0.00      0.00      0.00     41535
          15       0.00      0.00      0.00     69030
          16       0.00      0.00      0.00      1069
          17       0.00    

  _warn_prf(average, modifier, msg_start, len(result))


## With FastText

In [10]:
class LSTMTaggerFastText(torch.nn.Module):
    @property
    def device(self):
        return next(self.parameters()).device

    def __init__(self, 
                 output_dim,
                 num_layers=1, 
                 p=0, 
                 emb_dim=300, 
                 hidden_dim=64,
                 ):
        super(LSTMTaggerFastText, self).__init__()
        self.encoder = torch.nn.LSTM(emb_dim, hidden_dim, num_layers,
                                     dropout=p)
        self.hidden2tag = torch.nn.Linear(hidden_dim, 32)
        self.intermediate = torch.nn.Linear(32, output_dim)

    def forward(self, input):
        output, _ = self.encoder(input)
        output, output_lengths = pad_packed_sequence(output, batch_first=True)
        tag_space = self.intermediate(self.hidden2tag(output))
        tag_scores = F.log_softmax(tag_space, dim=1)

        return tag_scores

In [2]:
!wget https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz
!gunzip cc.ru.300.bin.gz

--2022-02-07 10:02:10--  https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.ru.300.bin.gz
Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.74.142, 104.22.75.142, 172.67.9.4, ...
Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.74.142|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4496459151 (4.2G) [application/octet-stream]
Saving to: ‘cc.ru.300.bin.gz’


2022-02-07 10:04:52 (26.6 MB/s) - ‘cc.ru.300.bin.gz’ saved [4496459151/4496459151]

gzip: cc.ru.300.bin already exists; do you wish to overwrite (y or n)? ^C


In [11]:
ft = fasttext.load_model('cc.ru.300.bin')
#fasttext.util.reduce_model(ft, 200)
ft.get_dimension()



300

In [14]:
class FastTextVectorizer(object):
    def __init__(self, ft):
        self.ft = ft

    def __call__(self, sentence):
        vectors = [self.ft.get_word_vector(w.lower()) for w in sentence]

        return torch.tensor(np.array(vectors))
ftv = FastTextVectorizer(ft)
text_pipeline = lambda x: ftv(x)
label_pipeline = lambda x: prepare_sequence(x, tag_to_ix)

In [15]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

model = LSTMTaggerFastText(len(tag_to_ix))
_ = model.to(device)
trainer(count_of_epoch=3,
        batch_size=64,
        dataset=train_data,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        lr=0.01,
        callback=None)
test(model, test_data)

epoch:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/6266 [00:00<?, ?it/s]

  0%|          | 0/6266 [00:00<?, ?it/s]

  0%|          | 0/6266 [00:00<?, ?it/s]

  0%|          | 0/2685 [00:00<?, ?it/s]

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   5192750
           1       0.97      0.99      0.98    849360
           2       0.98      0.99      0.99    356446
           3       0.95      0.94      0.95    279662
           4       0.93      0.93      0.93    223092
           5       0.98      0.83      0.90    352555
           6       1.00      0.99      1.00    540721
           7       0.72      0.86      0.78     47884
           8       0.89      0.84      0.86     80240
           9       0.91      0.80      0.85    107285
          10       0.79      0.97      0.87     71855
          11       0.70      0.44      0.54     49152
          12       0.93      0.93      0.93     60867
          13       0.90      0.92      0.91     35986
          14       0.74      0.86      0.79     20749
          15       0.91      0.75      0.82     34773
          16       0.08      0.75      0.14       669
          17       0.00    

In [None]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam

model = LSTMTaggerFastText(len(tag_to_ix), num_layers=3, p=0.2)
_ = model.to(device)
trainer(count_of_epoch=1,
        batch_size=64,
        dataset=train_data,
        model=model,
        loss_function=loss_function,
        optimizer=optimizer,
        lr=0.01,
        callback=None)
test(model, test_data)