In [None]:
import torch
import transformers
import metrics
import spacy
import pandas as pd

# Load the spaCy pipeline once at import time; the resulting `nlp` callable is
# reused by every later cell that parses sentences before scoring them.
nlp = spacy.load("en_core_web_trf")

class ParaphraseModel(object):
    """Paraphrase generator built on a Hugging Face seq2seq checkpoint.

    The tokenizer and model weights are loaded in ``__init__``; ``generate``
    runs group beam search with a diversity penalty and returns the distinct
    cleaned-up outputs.
    """

    def __init__(self, model_name, device="cuda", cache_dir="./cache/models/"):
        """Load the tokenizer and model and move the model to ``device``.

        Args:
            model_name: Checkpoint name or local path. A case-insensitive
                "pegasus" or "bart" substring selects the dedicated model
                class; every other name (including "t5") is loaded through
                ``AutoModelForSeq2SeqLM``.
            device: Device passed to ``.to()`` for the model and the inputs.
            cache_dir: Directory handed to ``from_pretrained`` as ``cache_dir``.
        """
        self.model_name = model_name
        self.device = device
        self.tokenizer = transformers.AutoTokenizer.from_pretrained(self.model_name, cache_dir=cache_dir)

        lowered_name = self.model_name.lower()
        if "pegasus" in lowered_name:
            model_cls = transformers.PegasusForConditionalGeneration
        elif "bart" in lowered_name:
            model_cls = transformers.BartForConditionalGeneration
        else:
            # Previously an unrecognised name left `self.model` undefined and
            # failed later with an AttributeError; the auto class covers "t5"
            # and lets Transformers report unsupported checkpoints itself.
            model_cls = transformers.AutoModelForSeq2SeqLM
        self.model = model_cls.from_pretrained(self.model_name, cache_dir=cache_dir).to(self.device)

    def post_process(self, output_texts):
        """Strip leading/trailing spaces, bullets, dashes and double quotes from each text."""
        return [text.strip(' •-"') for text in output_texts]

    def generate(self, input_text, output_count=10, max_length=256, beam_width=2, num_beam_groups=2):
        """Generate paraphrases of ``input_text``.

        Args:
            input_text: Sentence to paraphrase. For model names containing
                "t5" the prefix "paraphrase: " is prepended.
            output_count: Number of sequences requested from the model.
            max_length: Truncation length applied by the tokenizer to the input.
            beam_width: Multiplier applied to ``output_count`` to get the
                number of beams.
            num_beam_groups: Number of beam groups used for diverse decoding.

        Returns:
            Alphabetically sorted list of unique, post-processed paraphrases.
            It may be shorter than ``output_count`` because duplicates are
            removed.
        """
        if "t5" in self.model_name.lower():
            input_text = "paraphrase: " + input_text
        model_inputs = self.tokenizer([input_text],
                                      truncation="longest_first",
                                      padding="longest",
                                      max_length=max_length,
                                      return_tensors="pt").to(self.device)

        # The beam count must be at least the number of returned sequences and
        # a multiple of the number of groups; otherwise generation is rejected.
        num_beams = max(int(output_count * beam_width), output_count)
        leftover = num_beams % num_beam_groups
        if leftover:
            num_beams += num_beam_groups - leftover

        translated = self.model.generate(**model_inputs,
                                         num_return_sequences=output_count,
                                         # number of beams for beam search
                                         num_beams=num_beams,
                                         # number of groups to divide num_beams into in order to ensure diversity
                                         num_beam_groups=num_beam_groups,
                                         repetition_penalty=1.2,
                                         # higher the penalty, the more diverse are the outputs
                                         diversity_penalty=0.3,
                                         early_stopping=True,)
        tgt_text = self.tokenizer.batch_decode(translated, skip_special_tokens=True)
        # De-duplicate after cleaning, since different raw outputs can collapse
        # to the same text once decorations are stripped.
        return sorted(set(self.post_process(tgt_text)))

In [None]:
# Pick the checkpoint to experiment with. The alternatives stay commented out
# so they can be swapped in; only one `model` is instantiated at a time.
#model = ParaphraseModel("./pegasus-model")
#model = ParaphraseModel("./bart-model")
model = ParaphraseModel("./t5-large-model")

In [None]:
def _has_word(doc, word):
    """Return True if any token of ``doc`` equals ``word`` by lower-cased text or lemma."""
    return any(word in (token.text.lower(), token.lemma_.lower()) for token in doc)


def generate(input_text, output_count=32, keep_word=None, sort_by="wpd"):
    """Generate paraphrases of ``input_text`` and score them against it.

    Each paraphrase from the module-level ``model`` is parsed once with
    ``nlp`` and scored with ``metrics.wpd`` and ``metrics.ld``. Paraphrases
    whose WPD is not strictly positive are dropped.
    NOTE(review): the exact meaning and range of WPD / LD are defined in the
    project ``metrics`` module, which is not visible here — confirm there.

    Args:
        input_text: Source sentence.
        output_count: Number of sequences requested from the model.
        keep_word: Optional word that must appear in a paraphrase. Matching is
            case-insensitive and accepts either a token's surface form or its
            lemma, so "land" also keeps paraphrases containing "landed".
        sort_by: "wpd" sorts ascending by WPD; any other value sorts ascending
            by LD. Ties are broken by the paraphrase text.

    Returns:
        Tuple ``(out_para, out_wpd, out_ld)`` of three parallel lists. All are
        empty when no paraphrase survives filtering.
    """
    paraphrase_list = model.generate(input_text, output_count=output_count, max_length=64)
    nlp_input_text = nlp(input_text)
    wanted = keep_word.lower() if keep_word else None

    scored = []
    for para in paraphrase_list:
        # Parse each candidate once and reuse the doc for both metrics and
        # the keep_word filter.
        nlp_para = nlp(para)
        wpd = metrics.wpd(nlp_input_text, nlp_para)
        if not wpd > 0.0:
            continue
        if wanted is not None and not _has_word(nlp_para, wanted):
            continue
        scored.append((wpd, metrics.ld(nlp_input_text, nlp_para), para))

    # Filtering before sorting gives the same order as sorting first, because
    # the sort key depends only on each item's own values.
    metric_index = 0 if sort_by == "wpd" else 1
    scored.sort(key=lambda item: (item[metric_index], item[2]))

    out_para = [para for _, _, para in scored]
    out_wpd = [wpd for wpd, _, _ in scored]
    out_ld = [ld for _, ld, _ in scored]
    return out_para, out_wpd, out_ld

bed
* There's a lot of trash on the bed of the river.
* I keep a glass of water next to my bed when I sleep.

In [None]:
# Generate a large candidate pool for one example sentence, ordered by WPD.
# The three result lists are reused by the plotting and printing cells below.
input_text = "I keep a glass of water next to my bed when I sleep."
out_para, out_wpd, out_ld = generate(input_text, output_count=128, sort_by="wpd")

In [None]:
# Plotting setup for the histogram cells that follow.
import matplotlib.pyplot as plt
# Render figures inline in the notebook, at retina resolution.
%matplotlib inline
%config InlineBackend.figure_format='retina'
import seaborn as sns
# NOTE(review): pandas is already imported in the first cell; this repeat is redundant.
import pandas as pd

In [None]:
# Histogram of the WPD scores of the kept paraphrases (`out_wpd` from the
# generation cell above), with the x-axis pinned to [0, 1].
plt.title("T5 Model Output WPD")
plt.xlim([0, 1.0])
sns.histplot(out_wpd)

In [None]:
# Histogram of the LD scores of the kept paraphrases (`out_ld` from the
# generation cell above), with the x-axis pinned to [0, 1].
plt.title("T5 Model Output LD")
plt.xlim([0, 1.0])
sns.histplot(out_ld)

In [None]:
# List every kept paraphrase next to its WPD score (3 decimals).
# Reuses `out_para` / `out_wpd` produced by the WPD-sorted generation cell above.
for para, score in zip(out_para, out_wpd):
    print(para, round(score, 3))

In [None]:
# Regenerate for the same sentence, this time ordered by LD, and list each
# paraphrase next to its LD score (3 decimals).
input_text = "I keep a glass of water next to my bed when I sleep."
out_para, out_wpd, out_ld = generate(input_text, output_count=128, sort_by="ld")
for para, score in zip(out_para, out_ld):
    print(para, round(score, 3))

In [None]:
# Spot-check LD for a hand-written paraphrase that moves the time clause to the front.
s1, s2 = (
    nlp(sentence)
    for sentence in (
        "I keep a glass of water next to my bed when I sleep.",
        "During the night, I keep a glass of water next to my bed.",
    )
)
metrics.ld(s1, s2)

In [None]:
# Spot-check LD for a hand-written paraphrase that reorders and rewords the sentence.
s1, s2 = (
    nlp(sentence)
    for sentence in (
        "I keep a glass of water next to my bed when I sleep.",
        "When I sleep, I keep a glass of water by my bedside.",
    )
)
metrics.ld(s1, s2)

In [None]:
# Paraphrases that keep the word "bed" (river sense): print the three
# lowest-WPD and then the three highest-WPD results.
input_text = "There's a lot of trash on the bed of the river."
out_para = generate(input_text, keep_word="bed")[0]
for line in out_para[:3] + out_para[-3:]:
    print(line)

In [None]:
# Paraphrases that keep the word "bed" (furniture sense): print the three
# lowest-WPD and then the three highest-WPD results.
input_text = "I keep a glass of water next to my bed when I sleep."
out_para = generate(input_text, keep_word="bed")[0]
for line in out_para[:3] + out_para[-3:]:
    print(line)

In [None]:
# Paraphrases that keep the word "window" (first example sentence): print the
# three lowest-WPD and then the three highest-WPD results.
input_text = "The expanded window will give us time to catch the thieves."
out_para = generate(input_text, keep_word="window")[0]
for line in out_para[:3] + out_para[-3:]:
    print(line)

In [None]:
# Paraphrases that keep the word "window" (second example sentence): print the
# three lowest-WPD and then the three highest-WPD results.
input_text = "You have a two-hour window of clear weather to finish working on the lawn."
out_para = generate(input_text, keep_word="window")[0]
for line in out_para[:3] + out_para[-3:]:
    print(line)

In [None]:
# Paraphrases that keep the word "land" (first example sentence): print the
# three lowest-WPD and then the three highest-WPD results.
input_text = "The pilot managed to land the airplane safely."
out_para = generate(input_text, keep_word="land")[0]
for line in out_para[:3] + out_para[-3:]:
    print(line)

In [None]:
# Paraphrases that keep the word "land" (second example sentence): print the
# three lowest-WPD and then the three highest-WPD results.
input_text = "The enemy landed several of our aircrafts."
out_para = generate(input_text, keep_word="land")[0]
for line in out_para[:3] + out_para[-3:]:
    print(line)