In [1]:
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence as PACK, pad_packed_sequence as PAD
from torch.nn.utils.rnn import pad_sequence
from abc import ABC, abstractmethod
import nltk

In [2]:
from transformers import AutoTokenizer, AutoModel

# SMALL SBERT
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
nse_model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def batch_calc_docs_embs(batch_docs):
        list_docs_embs = []
        for doc in batch_docs:
            tokenized_docs = tokenizer(
                doc,
                padding=True,
                truncation=True,
                return_tensors='pt',
                return_token_type_ids=False,
                return_attention_mask=False
                )

            with torch.no_grad():
                tokenized_docs = {k: v.to(nse_model.device) for k, v in tokenized_docs.items()}
                model_output = nse_model(**tokenized_docs)

            docs_embs = 0
            docs_embs = model_output.last_hidden_state[:, 0, :]
            docs_embs = torch.nn.functional.normalize(docs_embs)
            list_docs_embs.append(docs_embs)
        batch_docs_embs = pad_sequence(list_docs_embs, batch_first=True)
        return batch_docs_embs

sample_text = """We use the Pk metric as defined in Beeferman
et al. (1999) to evaluate the performance of our
model. Pk is the probability that when passing a
sliding window of size k over sentences, the sentences at the boundaries of the window will be incorrectly classified as belonging to the same segment (or vice versa). To match the setup of Chen
et al. (2009), we also provide the Pk metric for a
sliding window over words when evaluating on the
datasets from their paper"""
sent_detector = nltk.data.load('tokenizers/punkt/russian.pickle')
sample_sents = sent_detector.tokenize(sample_text)
sample_sents = [sample_sents]
sample_lengths = [len(s) for s in sample_sents]
sample_lengths = torch.LongTensor(sample_lengths)
sample_embs = batch_calc_docs_embs(sample_sents)
sample_targets = torch.zeros(1, len(sample_sents[0]))
sample_targets[:, sample_targets.shape[1]//2] = 1 # split into two segments in the middle

# create dummy batch of copies
batch_size = 2
sample_embs = sample_embs.expand(batch_size, -1, -1)
sample_targets = sample_targets.expand(batch_size, -1)
sample_lengths = sample_lengths.expand(batch_size)

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [15]:
from train_fit import *

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brazen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from pandas.core.computation.check import NUMEXPR_INSTALLED





[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brazen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\brazen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
%load_ext autoreload
%autoreload 2

In [14]:
from models.CRF import BiLSTM

model = BiLSTM(2, 312, 256, 2, loss_fn='BinaryCrossEntropy')
loss = model.loss(sample_embs, sample_lengths, sample_targets)
loss

tensor([[True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        ...,
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True],
        [True, True, True,  ..., True, True, True]])


tensor(0.6904, grad_fn=<BinaryCrossEntropyBackward0>)