In [1]:
%load_ext autoreload
%autoreload 2

import sys
import os

sys.path.append('..')

In [3]:
# create a spacy processor; this basically calls en_core_web_sm, but can do white tokenization if needed

from polyjuice import Polyjuice
pj = Polyjuice(is_cuda=True)

In [43]:
# sample doc
text = "At the beginning of this episode, we welcome Dr. Faiz Eddine, political writer and researcher."
pj.perturb(
    orig_sent=text,
    blanked_sent="At the beginning",
    is_complete_blank=False,
    ctrl_code="ne",
    perplex_thred=None)

ERROR:polyjuice.polyjuice_wrapper:{'ne'} is not a valid ctrl code. Please choose from {'resemantic', 'restructure', 'negation', 'insert', 'lexical', 'shuffle', 'quantifier', 'delete'}.


[]

In [33]:
pj.get_random_blanked_sentences(text, max_blank_block=3)

{'At [BLANK] beginning of this episode, we welcome Dr. Faiz Eddine, political writer and researcher.',
 'At [BLANK] we welcome Dr. Faiz Eddine, political [BLANK] writer and researcher.',
 'At the beginning of this episode, we [BLANK] welcome Dr. Faiz Eddine, political writer and researcher.'}

In [45]:
# the underlying search function is:

get_random_idxes(
    doc, 
    # only allow selecting from a preset range
    pre_selected_idxes=None,
    # only select from a subset of dep tags
    deps=None, #["subj"],
    # blank sub-spans or just single tokens
    is_token_only=False,
    # maximum number of returned index tuple
    max_count=3,
    # maximum number of blanks per returned sentence
    max_blank_block=1
)

[[[0, 18]], [[3, 6]], [[15, 16]]]

In [48]:
# get prompt calls create_blanked_sents

prompts = get_prompts(doc, 
    # control tag; If none, then everything
    tags=["lexical"], 
    indexes=None
)

prompts

['At the beginning of this episode, we welcome Dr. Faiz Eddine, political writer and researcher. <|perturb|> [lexical] [BLANK] [SEP]',
 'At the beginning of this episode, we welcome Dr. Faiz Eddine, political writer and researcher. <|perturb|> [lexical] At the beginning of this episode, we welcome Dr. [BLANK] Eddine, political writer and researcher. [SEP]',
 'At the beginning of this episode, we welcome Dr. Faiz Eddine, political writer and researcher. <|perturb|> [lexical] At the beginning of this episode, [BLANK] welcome Dr. Faiz Eddine, political writer and researcher. [SEP]']

In [49]:
# get the generator

polyjuice = load_generator(
    # these are default values
    model_path="uw-hai/polyjuice", is_cuda=True
)

In [52]:
# there are other kwargs that can config the model, e.g. top_p, num_beams, etc
generated = generate_on_prompts(generator=polyjuice, prompts=prompts)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [53]:
generated

[['At the beginning of this episode, we found out that Majel Barrett was involved in a scheme to get Dr. Elias Magnus to murder Guillaume and to have Guillaume bring down the Guillaumeans.',
  "At the beginning of this episode, we don't welcome Dr. Kifoski, a political operative who at first turns out to be a pleasant being.",
  "At the beginning of the episode, we don't welcome Dr. Weldon, political junkie and the real estate developer."],
 ['At the beginning of this episode, we welcome Dr. Maximilian Eddine, political writer and researcher.',
  'At the beginning of this episode, we welcome Dr. Daphne Von Rey, auxiliary station Eddine, political writer and researcher.',
  'At the beginning of this episode, we welcome Dr. Dr. Eddine, political writer and researcher.'],
 ["At the beginning of this episode, we don't welcome Dr. Faiz Eddine, political writer and researcher.",
  'At the beginning of this episode, I found it welcome Dr. Faiz Eddine, political writer and researcher.']]

In [59]:
import numpy as np
merged = list(np.concatenate(generated))
generated_docs = [processor(str(g)) for g in merged]

In [63]:
# compute editing distances
from polyjuice.compute_perturbs import compute_edit_ops

eops = [compute_edit_ops(doc, generated_doc) for generated_doc in generated_docs]
eops[0]

[Munch({'op': 'equal', 'fromz_core': At the beginning of this episode, we, 'toz_core': At the beginning of this episode, we, 'fromz_full': At the beginning of this episode, we welcome, 'toz_full': At the beginning of this episode, we found out that Majel Barrett was involved in a scheme to get}),
 Munch({'op': 'replace', 'fromz_core': welcome, 'toz_core': found out that Majel Barrett was involved in a scheme to get, 'fromz_full': welcome, 'toz_full': found out that Majel Barrett was involved in a scheme to get}),
 Munch({'op': 'equal', 'fromz_core': Dr., 'toz_core': Dr., 'fromz_full': Dr. Faiz Eddine, 'toz_full': Dr. Elias Magnus}),
 Munch({'op': 'replace', 'fromz_core': Faiz Eddine, political writer, 'toz_core': Elias Magnus to murder Guillaume, 'fromz_full': welcome Dr. Faiz Eddine, political writer, 'toz_full': found out that Majel Barrett was involved in a scheme to get Dr. Elias Magnus to murder Guillaume}),
 Munch({'op': 'equal', 'fromz_core': and, 'toz_core': and, 'fromz_full': 

In [54]:
# filter by perplexity

from polyjuice.filters_and_selectors import load_perplex_scorer, compute_delta_perplexity
perplex_scorer = load_perplex_scorer(is_cuda=True)

In [66]:
# compute perplexities
pps = [compute_delta_perplexity(eop, perplex_scorer) for eop in eops]
# the computation of this part is briefly described in the paper. 

for pp, generated_doc in zip(pps, generated_docs):
    print(generated_doc)
    print(pp)
    print()

At the beginning of this episode, we found out that Majel Barrett was involved in a scheme to get Dr. Elias Magnus to murder Guillaume and to have Guillaume bring down the Guillaumeans.
Munch({'pr_sent': 70.07771301269531, 'pr_phrase': 1.1444091796875e-05})

At the beginning of this episode, we don't welcome Dr. Kifoski, a political operative who at first turns out to be a pleasant being.
Munch({'pr_sent': 42.57447052001953, 'pr_phrase': 2.288818359375e-05})

At the beginning of the episode, we don't welcome Dr. Weldon, political junkie and the real estate developer.
Munch({'pr_sent': 18.77056884765625, 'pr_phrase': 0.0})

At the beginning of this episode, we welcome Dr. Maximilian Eddine, political writer and researcher.
Munch({'pr_sent': 0.4716033935546875, 'pr_phrase': 0.0})

At the beginning of this episode, we welcome Dr. Daphne Von Rey, auxiliary station Eddine, political writer and researcher.
Munch({'pr_sent': 54.58534240722656, 'pr_phrase': 0.0})

At the beginning of this epis

In [73]:
# I usually require both sentence level and phrase level perplexity to be < 10
perplex_thred = 10
valid_idxes = np.where([pp.pr_sent < perplex_thred and pp.pr_phrase < perplex_thred for pp in pps])[0]
valid_eops = [eops[idx] for idx in valid_idxes]
valid_generations = [generated_docs[idx] for idx in valid_idxes]
valid_generations

[At the beginning of this episode, we welcome Dr. Maximilian Eddine, political writer and researcher.,
 At the beginning of this episode, we welcome Dr. Dr. Eddine, political writer and researcher.]

In [76]:
# can also filter by the match of the tag. the code for this part is quite messy though
from polyjuice.compute_perturbs import SentenceMetadata
meta = SentenceMetadata(valid_eops[0])
meta

[equal] At the beginning of this episode, we welcome Dr. -> At the beginning of this episode, we welcome Dr.
	tag: None
	dep: ROOT
	sem_dist: -1.000
[replace] Faiz -> Maximilian
	tag: None
	dep: modifier
	sem_dist: -1.000
[equal] Eddine, political writer and researcher. -> Eddine, political writer and researcher.
	tag: None
	dep: obj
	sem_dist: -1.000

In [77]:
# compute the control code
# first, load sentence similarity measurer

from polyjuice.filters_and_selectors import load_distance_scorer, compute_sent_cosine_distance
from functools import partial
distance_scorer = load_distance_scorer(is_cuda=True)
compute_sent_cosine_distance_model = partial(compute_sent_cosine_distance, similarity_scorer=distance_scorer)

In [79]:
compute_sent_cosine_distance_model("the door is open.", "the door is closed.")

0.85998318420456

In [82]:
meta.compute_metadata(sentence_similarity=compute_sent_cosine_distance_model)
meta

[replace] Faiz -> Maximilian
	tag: lexical
	dep: modifier
	sem_dist: 0.662
[equal] At the beginning of this episode, we welcome Dr. -> At the beginning of this episode, we welcome Dr.
	tag: equal
	dep: ROOT
	sem_dist: 0.010
[equal] Eddine, political writer and researcher. -> Eddine, political writer and researcher.
	tag: equal
	dep: obj
	sem_dist: 0.010

In [84]:
# this is the final tag
meta.primary.tag

'lexical'