In [0]:
# !export IMAGE_FAMILY="pytorch-latest-gpu"
# !export ZONE="us-west1-b"
# !export INSTANCE_NAME="pytorch-colab-backend"
# !gcloud compute ssh --zone ZONE INSTANCE_NAME -- -L 8888:localhost:8888

In [0]:
!pip install transformers==2.5.0



In [0]:
import numpy as np
import pandas as pd
import transformers
from transformers import BertModel, AdamW, BertConfig, BertTokenizer, Model2Model, PreTrainedEncoderDecoder, BertPreTrainedModel
from torch.utils.data import Dataset
import torch
import re
from torch import nn

In [0]:
#HRbot w/o end and start answer tokens. Answer parts are located in different context sentences

In [0]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [0]:
data = pd.read_csv('drive/My Drive/hrbot.csv')
data.head()

Unnamed: 0,Question,Answer
0,Какое управленческое действие относится к функ...,"планирование, прогнозирование, мотивация, орга..."
1,Управленческий персонал включает:,"руководителей, специалистов"
2,К функциям менеджмента относят,"планирование, прогнозирование, мотивация, орга..."
3,К японскому менеджменту персонала относится:,продвижение зависит от возраста рабочего или ...
4,С какими дисциплинами связана система наук о т...,"экономика труда, психология труда, физиология ..."


In [0]:
#data tokenizetion for bert
class BotDataset(Dataset):
  def __init__(self, data, column, context_atn=False,max_leninf =False):
    self.df = data[column].copy()
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
    if max_leninf == False:
      self.maxlen =  max(data[column].apply(lambda x:len(x)))
    else:
      self.maxlen = max_leninf
    self.context_atn = context_atn

  def __len__(self):
    return len(self.df)

  def __getitem__(self, index):
    sentence = self.df.loc[index]
    tokens = self.tokenizer.tokenize(sentence)
    if self.context_atn == False:
      tokens = ['[CLS]'] + tokens + ['[SEP]']
    if len(tokens) < self.maxlen:
      tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]
    else:
      if self.context_atn == False:
        tokens = tokens[:(self.maxlen-1)] + ['[SEP]']
      else: 
        tokens = tokens[:self.maxlen]
    tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens)
    tokens_ids_tensor = torch.tensor(tokens_ids)
    attn_mask = (tokens_ids_tensor != 0).long()
    return tokens_ids_tensor, attn_mask

In [0]:
data_len = len(data)

In [0]:
border = int(0.8*data_len)
Question_dataset_train = BotDataset(data.iloc[:border], 'Question')
Question_dataset_test = BotDataset(data.iloc[border:], 'Question')
Answer_dataset_train = BotDataset(data.iloc[:border], 'Answer')
Answer_dataset_test = BotDataset(data.iloc[border:], 'Answer')

In [0]:
#Context = hr management issues database
context = open('drive/My Drive/context.txt', 'r').read()
print(len(context))
print(context[:30])
pattern = re.compile('[^А-Яа-яЁёA-Za-z0-9/./,/:/?/]')
context = pattern.sub(' ', context)
context = pd.DataFrame(context.split('.'), columns = ['sentences'])
context_len = len(context)


796689
﻿УПРАВЛЕНИЕ ПЕРСОНАЛОМ
Под ред


In [0]:
# context for choosing appropriate sentences for attention 
context_dataset = BotDataset(context, 'sentences', False,16)

# context for answer generation
context_dataset_answgen = BotDataset(context, 'sentences', True,16)

In [0]:
#get CLS embedding 
model_qc = BertModel.from_pretrained('bert-base-multilingual-uncased').to(device)
model_qc.eval()

#question
q_tensor_train = torch.stack([Question_dataset_train[x][0] for x in range(border)]).to(device)
q_tensor_test = torch.stack([Question_dataset_test[x][0] for x in range(border, data_len)]).to(device)
q_atn_train = torch.stack([Question_dataset_train[x][1] for x in range(border)]).to(device)
q_atn_test = torch.stack([Question_dataset_test[x][1] for x in range(border, data_len)]).to(device)
with torch.no_grad():
  q_hid_train, _ = model_qc(q_tensor_train, q_atn_train)
  q_hid_test, _ = model_qc(q_tensor_test, q_atn_test)
q_hid_train = q_hid_train[:, 0]
q_hid_test = q_hid_test[:, 0]


In [0]:
a_tensor_train = torch.stack([Answer_dataset_train[x][0] for x in range(border)]).to(device)
a_tensor_test = torch.stack([Answer_dataset_test[x][0] for x in range(border, data_len)]).to(device)

a_tensor_atn_train = torch.stack([Answer_dataset_train[x][1] for x in range(border)]).to(device)
a_tensor_atn_test = torch.stack([Answer_dataset_test[x][1] for x in range(border, data_len)]).to(device)


In [0]:
train_dataset = torch.utils.data.TensorDataset(q_hid_train, a_tensor_train, a_tensor_atn_train)
test_dataset = torch.utils.data.TensorDataset(q_hid_test, a_tensor_test, a_tensor_atn_test)

In [0]:
#context for choosing sentence
c_tensor = torch.stack([context_dataset[x][0] for x in range(context_len)]).to(device)
c_atn = torch.stack([context_dataset[x][1] for x in range(context_len)]).to(device)
with torch.no_grad():
  c_hid, _ = model_qc(c_tensor, c_atn)
  c_hid = c_hid[:,0]

In [0]:
#context for text generation
c_tensor = torch.stack([context_dataset_answgen[x][0] for x in range(context_len)]).to(device)
c_atn = torch.stack([context_dataset_answgen[x][1] for x in range(context_len)]).to(device)
with torch.no_grad():
  w_embed, _ = model_qc(c_tensor, c_atn)

In [0]:
del model_qc

In [0]:
c_tensor[0]

tensor([28069, 32623, 40705, 21015, 11484, 77377, 76854,   324,     0,     0,
            0,     0,     0,     0,     0,     0], device='cuda:0')

In [0]:
q_hid_train.size()

torch.Size([544, 768])

In [0]:
w_embed.size()

torch.Size([6009, 16, 768])

In [0]:
#settings
batch_size = 4 #max possible
embedding_dim = q_hid_train.size()[1]
attn_size = 16
learning_rate = 0.00002

In [0]:
train_loader = torch.utils.data.DataLoader(dataset=train_dataset,batch_size=batch_size, shuffle=True, drop_last=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset,batch_size=batch_size, shuffle=False, drop_last=True)

In [0]:
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
tokenizer.sep_token_id

102

In [0]:
# Source: https://github.com/huggingface/transformers/pull/1455/files  p.s. WIP code, so changes are:  mistakes fixed, prob list added
softm = torch.nn.Softmax(dim=1)
class TransformerBeamSearch(nn.Module):
    def __init__(
        self,
        model,
        vocab_size,
        tokenizer,
        batch_size,
        beam_size,
        min_length,
        max_length,
        alpha=0,
        block_repeating_trigram=True,
    ):
        """
        Attributes:
            mask_word_id: token id that corresponds to the mask
        """
        super(TransformerBeamSearch, self).__init__()
        self.model = model
        decoder_config = transformers.AutoConfig.from_pretrained('bert-base-multilingual-uncased', is_decoder=True)
        self.end_token_id = tokenizer.sep_token_id
        self.start_token_id = tokenizer.cls_token_id
        self.beam_size = beam_size
        self.min_length = min_length
        self.max_length = max_length
        self.batch_size = batch_size
        self.vocab_size = vocab_size

        self.block_repeating_trigram = block_repeating_trigram
        self.apply_length_penalty = False if alpha == 0 else True
        self.alpha = alpha

        # State of the beam
        self.hypotheses = [[] for _ in range(batch_size)]
        self.batch_offset = torch.arange(batch_size, dtype=torch.long).cuda()
        self.beam_offset = torch.arange(
            0, batch_size * self.beam_size, step=self.beam_size, dtype=torch.long
        ).cuda()
        self.growing_beam = torch.full(
            (batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long
        )
        self.growing_prob = torch.full(
            (batch_size,1,self.vocab_size), 0, dtype=torch.long
        ).cuda()

        self.topk_log_probabilities = torch.tensor(
            [0.0] + [float("-inf")] * (self.beam_size - 1), dtype=torch.float
        ).repeat(batch_size)
        self.results = {
            "predictions": [[] for _ in range(batch_size)],
            "scores": [[] for _ in range(batch_size)],
            "probs": [[] for _ in range(batch_size)],
        }
        self._step = 0
        self.is_done = False

    def step(self, log_probabilities):
        """ Grows the beam by one step. """
        self._step += 1

        # The batch size changes as some beams finish so we define _B
        vocab_size = log_probabilities.size(-1)
        _B = log_probabilities.size(0) // self.beam_size

        # Multiply each beam probability with the probability of the
        # next token (conditioned on the words in the beam).
      

        log_probabilities = log_probabilities.squeeze(1)
        
        log_probabilities += self.topk_log_probabilities.view(-1, 1).cuda()

        log_probabilities = self.enforce_min_length(log_probabilities)
        if self.block_repeating_trigram:
            self.remove_repeating_trigrams(log_probabilities, _B)

       
        # Find the `beam_size` (previous_beam + token) combinations with
        # the highest score
        topk_log_probabilities, topk_ids = torch.topk(
            log_probabilities.view(_B, self.beam_size * vocab_size),
            self.beam_size,
            dim=1) 
        prob = log_probabilities.view(_B, -1, vocab_size)
        prob = torch.max(prob, dim=1).values.unsqueeze(1).long()
        prob = (prob >= torch.min(topk_log_probabilities, dim=1).values.unsqueeze(1))*1
        
     
        

        # Apply the length penalty. The +1 accounts for the [EOS] token
        # that will be added if the beam ends.
        topk_scores = topk_log_probabilities / self.length_penalty()

        # Retrieve the corresponding respective beam and token id
        # topk_token_ids[i] will be added to topk_beam_ids[i]
        topk_beam_ids = topk_ids.div(vocab_size)
        topk_token_ids = topk_ids.fmod(vocab_size)

        # Retrieve the row index of the surviving beams in the original
        # view of the log_probabilities tensor
        surviving_beams_rows = (topk_beam_ids + self.beam_offset[:_B].view(-1, 1).cuda()).view(
            -1
        )

        # Append the last predictions
        self.growing_beam = torch.cat(
            [
                self.growing_beam.index_select(0, surviving_beams_rows),
                topk_token_ids.view(-1, 1),
            ],
            1,
        )
        self.growing_prob = torch.cat(
            [
                self.growing_prob,
                prob.cuda(),
            ],
            1,
        )


        # Check if any of the beam searches has ended during this
        # growth step. Also if top beam (most probable) has ended
        # for one element of the batch.
        is_finished = topk_token_ids.eq(self.end_token_id)
        is_finished = self.enforce_max_length(is_finished)
        is_top_beam_finished = is_finished[:, 0].eq(1)

        # Save the finished searches
        if is_finished.any():
            predictions = self.growing_beam.view(
                -1, self.beam_size, self.growing_beam.size(1)
            )
            for i in range(is_finished.size(0)):
                if is_top_beam_finished[i]:
                    is_finished[i].fill_(1)
                finished_hyp = is_finished[i].nonzero().view(-1)

                # Store finished hypotheses for this batch.
                b = self.batch_offset[i]
                for j in finished_hyp:
                    self.hypotheses[b].append((topk_scores[i, j], predictions[i, j, :]))

                # If the batch reached the end, save the best hypotheses
                # in terms of length-penalized score.
                if is_top_beam_finished[i]:
                    best_hyp = sorted(
                        self.hypotheses[b], key=lambda x: x[0], reverse=True
                    )
                    best_score, best_prediction = best_hyp[0]
                    self.results["scores"][b].append(best_score)
                    self.results["predictions"][b].append(best_prediction)
                    self.results["probs"][b].append(self.growing_prob[b])

            non_finished = is_top_beam_finished.eq(0).nonzero().view(-1).cuda()
            if len(non_finished) == 0:
                self.is_done = True

            # Remove finished batches for the next step.
            topk_log_probabilities = topk_log_probabilities.index_select(
                0, non_finished
            )
            self.batch_offset = self.batch_offset.index_select(0, non_finished)
            self.growing_beam = predictions.index_select(0, non_finished).view(
                -1, self.growing_beam.size(-1)
            )

            surviving_beams_rows = surviving_beams_rows.index_select(0, non_finished)

        return surviving_beams_rows

    def forward(self, encoder_input_ids,encoder_attention_mask, **kwargs):
        # keyword arguments come in 3 flavors: encoder-specific (prefixed by
        # `encoder_`), decoder-specific (prefixed by `decoder_`) and those
        # that apply to the model as whole.
        # We let the specific kwargs override the common ones in case of conflict.
        kwargs_encoder = {
            argument[len("encoder_"):]: value
            for argument, value in kwargs.items()
            if argument.startswith("encoder_")
        }
        kwargs_decoder = {
            argument[len("decoder_"):]: value
            for argument, value in kwargs.items()
            if argument.startswith("decoder_")
        }
        kwargs_common = {
            argument: value
            for argument, value in kwargs.items()
            if not (argument.startswith("encoder_") or argument.startswith("decoder_"))
        }
        kwargs_decoder = dict(kwargs_common, **kwargs_decoder)
        kwargs_encoder = dict(kwargs_common, **kwargs_encoder)

        # forward pass on the encoder
        encoder_outputs, _ = self.model.encoder.forward(encoder_input_ids, encoder_attention_mask)

        encoder_hidden_states = tile(
            encoder_outputs, self.beam_size, dim=0
        )

        # grow the beam by generating sequences in an autoregressive way
        self.growing_beam = torch.full(
            (self.batch_size * self.beam_size, 1), self.start_token_id, dtype=torch.long).cuda()
        for step in range(self.max_length):
            decoder_input = self.growing_beam[:, -1]
            outputs = self.model.decoder(decoder_input.view(-1,1).cuda(), encoder_hidden_states = encoder_hidden_states.cuda())
            
            log_probabilities = torch.nn.functional.log_softmax(outputs[0])
            
            surviving_beams_rows = self.step(log_probabilities)
            if self.is_done:
                break

            encoder_hidden_states = encoder_hidden_states.index_select(0, surviving_beams_rows)

        return self.results

    def remove_repeating_trigrams(self, log_probabilities, _B):
        if(self._step + 1 > 3):
            for i in range(_B * self.beam_size):
                tokens = [t for t in self.growing_beam[i]]
                trigrams = [(tokens[i-1], tokens[i], tokens[i+1]) for i in range(1, len(tokens) - 1)]
                last_trigram = tuple(trigrams[-1])
                if last_trigram in trigrams[:-1]:
                    log_probabilities[i] = -1e20

    def enforce_min_length(self, log_probabilities):
        if self._step < self.min_length:
            log_probabilities[:, self.end_token_id] = -1e20
        return log_probabilities

    def enforce_max_length(self, is_finished):
        if self._step + 1 == self.max_length:
            is_finished.fill_(1)
        return is_finished

    def length_penalty(self):
        return ((5.0 + (self._step + 1)) / 6.0) ** self.alpha


def tile(x, count, dim=0):
    """
    Tiles `x` along dimension `dim` `count` times.
    Example:
        >> ex = torch.tensor([1,2],[3,4])
        >> tile(ex, 2, 0)
        torch.Tensor([[1,2],[1,2],[3,4],[3,4]])
    """
    perm = list(range(len(x.size())))
    if dim != 0:
        perm[0], perm[dim] = perm[dim], perm[0]
        x = x.permute(perm).contiguous()
    out_size = list(x.size())
    out_size[0] *= count
    batch = x.size(0)
    x = (
        x.view(batch, -1)
        .transpose(0, 1)
        .repeat(count, 1)
        .transpose(0, 1)
        .contiguous()
        .view(*out_size)
    )
    if dim != 0:
        x = x.permute(perm).contiguous()
    return x

In [0]:

cos = nn.CosineSimilarity(dim=1, eps=1e-6)
class HRbotAnswerGen(nn.Module):
  def __init__(self):
    super(HRbotAnswerGen, self).__init__()
    self.tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-uncased', do_lower_case=True)
    decoder_config = transformers.AutoConfig.from_pretrained('bert-base-multilingual-uncased', is_decoder=True)
    self.model_gen = Model2Model.from_pretrained('bert-base-multilingual-uncased', decoder_config=decoder_config)
    self.Q = nn.Linear(embedding_dim, attn_size)
    torch.nn.init.xavier_uniform(self.Q.weight)
    self.K = nn.Linear(embedding_dim, attn_size)
    torch.nn.init.xavier_uniform(self.K.weight)
    self.V = nn.Linear(embedding_dim, embedding_dim)
    torch.nn.init.xavier_uniform(self.V.weight)
    
  def forward(self, w_embed, q_hid_batch, c_hid, c_tensor, c_atn, a_tensor = None, a_tensor_atn = None, test = False):
    
    q = 0
    for question in q_hid_batch:
      cos_q_tensor = question.repeat(c_hid.size(0),1)
      cos_qc_tensor = cos(cos_q_tensor, c_hid)
      _, attn_index = torch.sort(cos_qc_tensor, descending=True)
      attn_index = attn_index[:10]
      for index in attn_index:
        w_embed_at = w_embed[index]
        c_tensor_at = c_tensor[index]
        # c_atn_at = c_atn[index]
        if attn_index[0] == index:
          w_embed_at_total = w_embed_at.clone()
          c_tensor_at_total = c_tensor_at.clone()
          # c_atn_at_total = c_atn_at.clone()
        else:
          w_embed_at_total = torch.cat([w_embed_at_total, w_embed_at], dim=0)
          c_tensor_at_total = torch.cat([c_tensor_at_total, c_tensor_at], dim=0)
          # c_atn_at_total = torch.cat([c_atn_at_total, c_atn_at], dim=0)
      if q == 0:
        question_total = w_embed_at_total.unsqueeze(0).clone()
        question_context = c_tensor_at_total.unsqueeze(0).clone()
        # question_c_atn = c_atn_at_total.unsqueeze(0).clone()
        q = q + 1
      else:
        question_total = torch.cat([question_total, w_embed_at_total.unsqueeze(0)], dim=0)
        question_context = torch.cat([question_context, c_tensor_at_total.unsqueeze(0)], dim=0)
        # question_c_atn = torch.cat([question_c_atn, c_atn_at_total.unsqueeze(0)], dim=0)    

    q_hid_batch = q_hid_batch.unsqueeze(1)
    Q_question = self.Q(q_hid_batch)
    K_wembed = self.K(question_total)
    # V_embedgen = self.V(question_total)
    softm_val = softm(torch.matmul(Q_question, K_wembed.permute(0,2,1))/(attn_size**(1/2)))
    softm_val = softm_val.squeeze(1)


    # --------for embedding input--------------------------------
    # softm_matrix = torch.zeros(question_total.size(1),question_total.size(1)).repeat(batch_size,1,1).to(device)
    # for i, _ in enumerate(softm_matrix):
      # softm_matrix[i][torch.eye(question_total.size(1)).byte()] =softm_val[i]
    # question_context_embed = torch.matmul(softm_matrix, V_embedgen)
    


    #----------for index input----------------------------------
    softm_val = softm_val/torch.max(softm_val, dim=1).values.unsqueeze(1)
    softm_val = torch.round(softm_val)
    question_context = softm_val * question_context
   
    self.model_gen.encoder.requires_grad_=False
    self.model_gen.decoder.requires_grad_=False
    self.model_gen.decoder.cls.requires_grad_=True

    
    del cos_q_tensor
    del cos_qc_tensor
    del w_embed_at_total
    del c_tensor_at_total
    # del c_atn_at_total
    del w_embed_at
    del c_tensor_at
    # del c_atn_at  
    del question_total
    # del question_c_atn
   
    question_context_atn = ((question_context == 0)*1).to(device)
    torch.cuda.empty_cache()
    
    if test == False:
      model_kwargs = {"encoder_attention_mask": question_context_atn, "decoder_attention_mask": a_tensor_atn }
      out, _ , _= self.model_gen(encoder_input_ids=question_context.long(),decoder_input_ids=a_tensor.long(), **model_kwargs)
      # model.encoder.layer[3].attention.self.key.weight.reqieres_grad = True
      return out
    else:
      # model_kwargs = {"encoder_attention_mask": question_context_atn}
      tran_beams = TransformerBeamSearch(self.model_gen, self.tokenizer.vocab_size,self.tokenizer, batch_size = q_hid_batch.size(0),beam_size=5, min_length = 205 ,max_length = 205)
      out = tran_beams.forward(encoder_input_ids=question_context.long(), encoder_attention_mask = question_context_atn)
      score = torch.stack([x[0] for x in out['scores']])
      
      prob = torch.stack([x[0] for x in out['probs']])
   
      pred = torch.stack([x[0] for x in out['predictions']])
      return score, pred, prob

In [0]:
model = HRbotAnswerGen()

  # Remove the CWD from sys.path while we load stuff.
  if sys.path[0] == '':
  


In [0]:
epochs = 4

In [0]:
def loss_function(real, pred):
    """ Only consider non-zero inputs in the loss; mask needed """
    total_loss = 0
  
    for i in range(len(real[0])):
        
      mask = real[:,i].ge(1).type(torch.cuda.FloatTensor)
      crit = nn.CrossEntropyLoss()
      loss_ = crit(pred[:,i].type(torch.cuda.FloatTensor), real[:,i].long()) * mask 
      total_loss  =  total_loss + torch.mean(loss_)
    return total_loss/len(real[0])

In [0]:

def evaluate(model, test_loader, device, c_hid, c_tensor, c_atn, w_embed):
  model.eval()

  for i, (q_hid_batch_test, answer_batch_test, answer_atn_batch_test) in enumerate(test_loader):
    q_hid_batch_test, answer_batch_test, answer_atn_batch_test = q_hid_batch_test.to(device), answer_batch_test.to(device), answer_atn_batch_test.to(device)
    
    with torch.no_grad():

      test_output = model(w_embed, q_hid_batch_test, c_hid, c_tensor, c_atn, test=True)
    
    #use scores as logits
    
    test_loss = loss_function(answer_batch_test, test_output[2])
    return test_loss

In [0]:
def train(model, train_loader, test_loader, device, epochs,c_hid, c_tensor, c_atn, w_embed):
  model.to(device)
  model.train()
  c_hid, c_tensor, c_atn, w_embed = c_hid.to(device), c_tensor.to(device), c_atn.to(device), w_embed.to(device)
  optimizer = AdamW(model.parameters(), lr=learning_rate)
  scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.1)
  for epoch in range(epochs):
    
    total_loss = 0
    for i , (q_hid_batch, answer_batch, answer_atn_batch) in enumerate(train_loader):
      optimizer.zero_grad()
      
      q_hid_batch, answer_batch, answer_atn_batch = q_hid_batch.to(device), answer_batch.to(device), answer_atn_batch.to(device)

      logits = model(w_embed, q_hid_batch, c_hid, c_tensor, c_atn, answer_batch, answer_atn_batch)

      loss = loss_function(answer_batch, logits)
      loss.backward()
      total_loss = total_loss + loss
     
      nn.utils.clip_grad_norm_(model.parameters(),0.5)

      optimizer.step()
      
    val_loss = evaluate(model, test_loader, device, c_hid, c_tensor, c_atn, w_embed)
    print(f'Epoch {epoch}, Train_loss: {total_loss/(i+1)}, Val_loss: {val_loss}')
     
  return model


In [0]:
model = train(model, train_loader, test_loader, device, epochs,c_hid, c_tensor, c_atn, w_embed)



Epoch 0, Train_loss: 0.0617070272564888, Val_loss: 1.2739115953445435
Epoch 1, Train_loss: 0.0002073720534099266, Val_loss: 1.2754796743392944
Epoch 2, Train_loss: 3.130749973934144e-05, Val_loss: 1.274377703666687
Epoch 3, Train_loss: 0.00019379214791115373, Val_loss: 1.2688442468643188


In [0]:
q_hid_test.size()

torch.Size([136, 768])

In [0]:
 
 with torch.no_grad():
      check_logits = model(w_embed, q_hid_test[100:102], c_hid, c_tensor, c_atn, test=True)
 phrase = tokenizer.convert_ids_to_tokens(check_logits[1][0])    
#  P.S. sentence has no sence at the moment. what can we do- beam search check,  batch is too small, maybe should use T5 atten, check tokenizer, check pretrained model and maybe use another, better architecture wo concep change - maybe use t5 and reformer ideas

