# Demo code for iterative text revision on the IteraTeR dataset

In [None]:
!pip install datasets
!pip install transformers
!pip install torch==1.5.1
!pip install torchvision==0.6.1
!pip install levenshtein

In [None]:
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from rapidfuzz.distance import Levenshtein

# Run on the first CUDA GPU when one is available, otherwise fall back to CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [None]:
# Fetch the IteraTeR_human_sent dataset by its Hugging Face Hub id, then end
# the cell on the bare name so the notebook displays the loaded object.
dataset = load_dataset(path="wanyu/IteraTeR_human_sent")
dataset

Using custom data configuration wanyu--IteraTeR_human_sent-7ef9360fff9b86ec
Reusing dataset json (/root/.cache/huggingface/datasets/json/wanyu--IteraTeR_human_sent-7ef9360fff9b86ec/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 3254
    })
    test: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 364
    })
    validation: Dataset({
        features: ['before_sent', 'before_sent_with_intent', 'after_sent', 'labels', 'doc_id', 'revision_depth'],
        num_rows: 400
    })
})

In [None]:
# Load the tokenizer and the seq2seq revision model from the same checkpoint.
checkpoint = "wanyu/IteraTeR-PEGASUS-Revision-Generator"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# Move the weights to the selected device as part of loading.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint).to(device)
# Switch to evaluation mode; as the last expression this also displays the model.
model.eval()

In [None]:
# Build the first model input from test example 0.
# The model input has the form: <intention> before_sent
test_example = dataset['test'][0]  # look the row up once and reuse it
before_text = test_example['before_sent']
before_input = test_example['before_sent_with_intent']
edit_intent = test_example['labels']
print('Model input:', before_input)
# Tokenize to PyTorch tensors and place them on the same device as the model.
model_input = tokenizer(before_input, return_tensors='pt').to(device)
print('Tokenized model input:', model_input)

Model input: <clarity> In this paper , we present a new sequence-to-sequence pre-training model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism.
Tokenized model input: {'input_ids': tensor([[96103,   222,   136,   800,   110,   108,   145,   799,   114,   177,
          5936,   121,   497,   121, 69987,  1133,   121, 18006,   861,   568,
         17857,  7614,   108,   162,  9905,   114,  2794,   813,   121, 83465,
          4129,  1729,   533,  3178,   121, 14506, 11955,   111,   109,  2962,
          3178,   121, 12342,   813,   121, 65167,  5661,   107,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1]])}


In [None]:
# start iterative text revision
MAX_ITER = 10
for t in range(MAX_ITER):
  # get current model prediction: after_sent
  model_outputs = model.generate(**model_input, num_beams=8, max_length=1024)
  after_text = tokenizer.batch_decode(model_outputs, skip_special_tokens=True)[0]
  print(f'Model output at revision depth {t+1}: {after_text}')
  edit_dist = Levenshtein.distance(before_text, after_text)
  print(f'Edit distance at revision depth {t+1}: {edit_dist}')
  print('============================================')
  # check stopping criteria
  if edit_dist == 0:
    print(f'Model final output: {after_text}')
    break
  else:
    before_text = after_text
    before_input = f'<{edit_intent}> {before_text}'
    model_input = tokenizer(before_input, return_tensors='pt').to(device)

Model output at revision depth 1: In this paper, we present a novel sequence-to-sequence pre-training model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism.
Edit distance at revision depth 1: 4
Model output at revision depth 2: In this paper, we present a sequence-to-sequence pre-training model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism.
Edit distance at revision depth 2: 6
Model output at revision depth 3: In this paper, we present a sequence-to-sequence pre-training model called ProphetNet, which introduces a novel self-supervised objective named future n-gram prediction and the proposed n-stream self-attention mechanism.
Edit distance at revision depth 3: 0
Model final output: In this paper, we present a sequence-to-sequence pre-training model called ProphetNet, which intro