In [60]:
import os

import pytorch_lightning as pl
import torch
from pytorch_lightning.callbacks import Callback
from sentence_transformers import SentenceTransformer, util
from torch.utils.data import TensorDataset, random_split
from tqdm import tqdm
from transformers import MBartTokenizerFast, MBartForConditionalGeneration, AutoTokenizer
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers.optimization import AdamW

# Length limits for source and target sequences. Neither constant is
# referenced in the visible cells; main() carries its own max_length default.
MAX_SOURCE_LENGTH = 200
MAX_TARGET_LENGTH = 200

In [70]:
# The tokenizer is resolved from the "vinai/bartpho-word" identifier; the
# weights come from a local checkpoint directory (the path suggests a
# PAWS/QQP fine-tune at epoch 5 -- confirm against the training run).
tokenizer = AutoTokenizer.from_pretrained("vinai/bartpho-word")
model = MBartForConditionalGeneration.from_pretrained("../models/bartpho_paws_qqp/epoch_5")

# Alternative mT5 backbone, kept for reference:
# model = MT5ForConditionalGeneration.from_pretrained('./models/mt5-base-newer/epoch_9')
# tokenizer = MT5Tokenizer.from_pretrained("google/mt5-base")

In [71]:
# Input tensors are placed on this device before generation.
device = torch.device('cpu')

def main(sent, num_outputs: int = 10, max_length: int = 200):
    """Generate paraphrase candidates for one sentence with beam search.

    Uses the notebook-level ``model`` and ``tokenizer``.

    Args:
        sent: Source sentence to paraphrase.
        num_outputs: Number of candidate sequences to return.
        max_length: Truncation limit for the tokenized input and maximum
            length passed to ``generate`` for the output.

    Returns:
        A list of ``num_outputs`` decoded strings with special tokens removed.
    """
    # A single sequence needs no padding; the attention mask is passed
    # explicitly rather than left for generate() to infer.
    encoded = tokenizer(
        sent,
        max_length=max_length,
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    generated = model.generate(
        input_ids=encoded["input_ids"].to(device),
        attention_mask=encoded["attention_mask"].to(device),
        # Beam search cannot return more sequences than it has beams, so the
        # beam count grows with num_outputs instead of staying fixed at 10.
        num_beams=max(10, num_outputs),
        num_return_sequences=num_outputs,
        max_length=max_length,
    )

    return tokenizer.batch_decode(generated, skip_special_tokens=True)

In [72]:
# Minimum word-level edit distance from the source sentence that a candidate
# must reach (inclusive, compared with >=) to be kept by filter_answer.
DISTANCE_THRESHOLD: int = 2

def levenshteinDistance(s1, s2):
    """Compute the word-level Levenshtein (edit) distance between two sentences.

    Both inputs are split on whitespace, so the unit of editing is a token,
    not a character.

    Args:
        s1: First sentence.
        s2: Second sentence.

    Returns:
        A dict with two keys:
            "raw_distance": minimum number of token insertions, deletions and
                substitutions that turn one sentence into the other.
            "score": ``raw_distance`` divided by the token count of the
                longer sentence; ``0.0`` when neither sentence has tokens.
    """
    shorter, longer = s1.split(), s2.split()
    if len(shorter) > len(longer):
        shorter, longer = longer, shorter

    # Two sentences without any tokens are identical. Returning here avoids
    # the 0 / 0 division the normalisation below would otherwise perform.
    if not longer:
        return {"raw_distance": 0, "score": 0.0}

    # Row-by-row dynamic programme: previous_row[j] is the distance between
    # the first j tokens of `shorter` and the tokens of `longer` seen so far.
    previous_row = list(range(len(shorter) + 1))
    for row_idx, long_tok in enumerate(longer, start=1):
        current_row = [row_idx]
        for col_idx, short_tok in enumerate(shorter):
            if short_tok == long_tok:
                current_row.append(previous_row[col_idx])
            else:
                current_row.append(
                    1 + min(
                        previous_row[col_idx],      # substitution
                        previous_row[col_idx + 1],  # skip a token of `longer`
                        current_row[-1],            # skip a token of `shorter`
                    )
                )
        previous_row = current_row

    final_distance = previous_row[-1]
    return {
        "raw_distance": final_distance,
        "score": final_distance / len(longer),
    }

def filter_answer(raw_sentence, paraphrase_sentences, threshold=None):
    """Keep only the paraphrases that differ enough from the source sentence.

    Args:
        raw_sentence: The original sentence.
        paraphrase_sentences: Iterable of candidate paraphrase strings.
        threshold: Minimum word-level edit distance (inclusive) a candidate
            must have from ``raw_sentence`` to be kept. When ``None``, the
            module-level ``DISTANCE_THRESHOLD`` is used.

    Returns:
        A list of the candidates whose ``raw_distance`` from ``raw_sentence``
        is at least the threshold, in their original order.
    """
    if threshold is None:
        # Resolved at call time so later changes to the global are honoured.
        threshold = DISTANCE_THRESHOLD

    return [
        candidate
        for candidate in paraphrase_sentences
        if levenshteinDistance(raw_sentence, candidate)["raw_distance"] >= threshold
    ]

In [73]:
# Source sentence used for the demonstration below.
text = "Người ra đi đầu không ngoảnh lại, sau lưng thềm nắng lá rơi đầy."

# Generate candidates, then drop those whose word-level edit distance from the
# source falls below the threshold.
paraphrases = main(text)
output = filter_answer(raw_sentence=text, paraphrase_sentences=paraphrases)

In [74]:
# Index 0 holds the source sentence; the filtered paraphrases follow it.
sentences = [text, *output]

# NOTE(review): the all-* sentence-transformers checkpoints appear to be
# trained mainly on English data, so similarity scores on Vietnamese text may
# be unreliable -- consider a multilingual checkpoint; confirm before relying
# on these numbers.
compare = SentenceTransformer('all-MiniLM-L12-v2')
embeddings = compare.encode(sentences)

In [75]:
# Print every sentence next to its cosine similarity with the source sentence
# (embeddings[0]); cos_sim returns a matrix, so [0][0] picks the single value.
for sentence_text, sentence_embedding in zip(sentences, embeddings):
    similarity = util.cos_sim(embeddings[0], sentence_embedding)[0][0]
    print(sentence_text, similarity)

Người ra đi đầu không ngoảnh lại, sau lưng thềm nắng lá rơi đầy. tensor(1.0000)
Đi đầu không ngoảnh lại, sau lưng thềm nắng lá rơi đầy. tensor(0.9829)
Đi đầu không ngoảnh lại, lưng thềm nắng lá rơi đầy. tensor(0.9689)
