In [48]:
from nltk.tokenize import word_tokenize, wordpunct_tokenize
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
import math
import json
import torch.nn.functional as F
import torch
from torch import nn as nn
from tqdm.auto import tqdm
import youtokentome as yttm
import time
import random
import pickle


In [13]:
from nn_code.transformer_classes import *

In [4]:
from nn_code.bucketing import SequenceBucketingData

In [5]:
tokenizer_path = 'models/saved_models/big_model/bpe_big.model'

In [6]:
tokenizer = yttm.BPE(model=tokenizer_path)

In [7]:
validation_loader_path = 'models/saved_models/big_model/valid_dataset_big.pickle'

In [8]:
with open(validation_loader_path, 'rb') as f:
    loader = pickle.load(f)

In [10]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [14]:
INPUT_DIM = 16000
OUTPUT_DIM = 16000
HID_DIM = 256
ENC_LAYERS = 4
DEC_LAYERS = 4
ENC_HEADS = 8
DEC_HEADS = 8
ENC_PF_DIM = 512
DEC_PF_DIM = 512
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1

enc = Encoder(INPUT_DIM, 
              HID_DIM, 
              ENC_LAYERS, 
              ENC_HEADS, 
              ENC_PF_DIM, 
              ENC_DROPOUT, 
              device)

dec = Decoder(OUTPUT_DIM, 
              HID_DIM, 
              DEC_LAYERS, 
              DEC_HEADS, 
              DEC_PF_DIM, 
              DEC_DROPOUT, 
              device)

In [17]:
model = Seq2Seq(enc, dec, 0, 0, device).to(device)

In [18]:
model_path = 'models/saved_models/big_model/big-model.pt'

In [None]:
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu') ))
model.eval()

In [31]:
def greedy(sentence, max_len = 30): # было sentence
    
    model.eval()
        
    tokens = tokenizer.encode(sentence)
    
    tokens = [2] + tokens + [2]
        
    src_tensor = torch.LongTensor(tokens).unsqueeze(0).to(device)
    
    src_mask = model.make_src_mask(src_tensor)
    
    with torch.no_grad():
        enc_src = model.encoder(src_tensor, src_mask)

    trg_indexes = [2]

    for i in range(max_len):

        trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)

        trg_mask = model.make_trg_mask(trg_tensor)
        
        with torch.no_grad():
            
            
            
            output, attention = model.decoder(trg_tensor, enc_src, trg_mask, src_mask)
        
        pred_token = output.argmax(2)[:,-1].item()
        
        trg_indexes.append(pred_token)

        if pred_token == 3:
            break
            
    

    trg_tokens = ''.join(tokenizer.decode(trg_indexes[1:-1]))
    return trg_tokens


In [32]:
greedy('Что вы хотели спросить у меня')

'Что вы хотите чтобы я сказал'

In [45]:

def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
            top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)

        # Remove tokens with cumulative probability above the threshold
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the indices to the right to keep also the first token above the threshold
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0

        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits



In [33]:
def sampling_next_token(dec_out, temperature, top_k,top_p):
    
    logits = dec_out[0, -1, :] / temperature
    filtered_logits=top_k_top_p_filtering(logits, top_k=top_k, top_p=top_p)
    
    probabilities = F.softmax(filtered_logits, dim=-1)
    next_token = torch.multinomial(probabilities, 1)
    
    return next_token

In [57]:
def sampling(sentence, temperature, top_k, top_p, max_len=30, tokenizer=tokenizer):
    
    scr = torch.tensor(tokenizer.encode(sentence))
    
    scr = scr.unsqueeze(0)
    
    src_mask = model.make_src_mask(scr)

    encoder_output = model.encoder(scr, src_mask)
    
    trg_indexes = [2]
    
    decoder_input = torch.LongTensor(trg_indexes).unsqueeze(0).to(device)
    
    trg_mask = model.make_trg_mask(decoder_input)
    
    
    decoded_tokens = [torch.LongTensor(trg_indexes)]
    for step in range(max_len):
        decoder_output, attention = model.decoder(decoder_input, encoder_output, trg_mask, src_mask)
        next_token = sampling_next_token(dec_out=decoder_output, temperature=temperature, top_k=top_k, top_p=top_p)
            
        decoded_tokens.append(next_token)
        if next_token == 3:
            break
            
        decoder_input=decoded_tokens[-1].unsqueeze(0).to(device)
        trg_mask = model.make_trg_mask(decoder_input)
    
    trg_tokens = ' '.join(tokenizer.decode(decoded_tokens[1:-1]))
    
    return trg_tokens

In [68]:
sampling('Пользуйся этим как хочешь', 0.9, 5, 0.9)

'Ты хочешь Польши с ешь'