**Data Augmentation by Paraphrasing**

In [4]:
# Install required packages
!pip install torch torchvision transformers==2.10.0 rasa==1.10.0 input_reader



In [5]:
# Import required libraries
import ipywidgets as widgets
import requests, os
from IPython.display import display
from ipywidgets import interact

from rasa.nlu.training_data import TrainingData,Message

**Download Model**

In [9]:
def download_file(id, destination):
    url = "https://docs.google.com/uc?export=download"
    sess = requests.Session()
    response = sess.get(url, params = { 'id' : id }, stream = True)
    token = get_confirmation_token(response)
    if token:
        params = { 'id' : id, 'confirm' : token }
        response = sess.get(url, params = params, stream = True)
    save_response_content(response, destination)    

def get_confirmation_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value
    return None

def save_response_content(response, destination):
    chunk_size = 32768
    with open(destination, "wb") as f:
        for chunk in response.iter_content(chunk_size):
            if chunk:
                f.write(chunk)

model_class_file_id = '1N1kn2b7i2ND7eNefzyJM-k13IM8tqZvr'
checkpoint_file_id = '1G0nwXlvzGsb8Ar-OAnYBQKFvY97WMzBy'
model_class_destination = 'model.py'
checkpoint_destination = 'model.zip'
checkpoint_unzipped_destination = 'package_models'

if not os.path.exists(checkpoint_unzipped_destination):
  download_file(checkpoint_file_id, checkpoint_destination)
  !unzip {checkpoint_destination}

if not os.path.exists(model_class_destination):
  download_file(model_class_file_id, model_class_destination)

Archive:  model.zip
   creating: package_models/
  inflating: package_models/.DS_Store  
   creating: __MACOSX/
   creating: __MACOSX/package_models/
  inflating: __MACOSX/package_models/._.DS_Store  
   creating: package_models/lm_finetune_8/
   creating: package_models/lm_finetune_8/checkpoint-56000/
  inflating: package_models/lm_finetune_8/checkpoint-56000/added_tokens.json  
  inflating: package_models/lm_finetune_8/checkpoint-56000/tokenizer_config.json  
  inflating: package_models/lm_finetune_8/checkpoint-56000/special_tokens_map.json  
  inflating: package_models/lm_finetune_8/checkpoint-56000/optimizer.pt  
  inflating: package_models/lm_finetune_8/checkpoint-56000/config.json  
  inflating: package_models/lm_finetune_8/checkpoint-56000/scheduler.pt  
  inflating: package_models/lm_finetune_8/checkpoint-56000/merges.txt  
  inflating: package_models/lm_finetune_8/checkpoint-56000/training_args.bin  
  inflating: package_models/lm_finetune_8/checkpoint-56000/pytorch_model.bin 

**Setup Model**

In [10]:
from model import ParaphraseModel
model_path = 'package_models/lm_finetune_8/checkpoint-56000/'

complete_td = TrainingData()
model = ParaphraseModel(model_path)

**Paraphrase**

In [12]:
input_phrase = input("Enter a message for which you would like to generate paraphrase: ")

Enter a message for which you would like to generate paraphrase: At launch, Amazon SageMaker was an easy onramp to machine learning for folks without formal data science training.


In [13]:
number_of_samples = int(input("Number of paraphrases to generate: "))
stop_words = input("Stop words to be constrained with(multiple semi-colon separated): ")

Number of paraphrases to generate: 2
Stop words to be constrained with(multiple semi-colon separated): .


In [14]:
paraphrase = model.get_paraphrases(input_phrase, number_of_samples, stop_words)

100%|██████████| 20/20 [00:06<00:00,  2.87it/s]


In [15]:
print("Steps:\n1. Read all proposed paraphrases below.\n2. Select valid paraphrases that you would like\
 to add to your NLU training data. Use Ctrl/Cmd + Click to select multiple.\n\
3. Enter the name of the intent under which these messages should be categorized\n\
4. Click 'Add to training data'\n\
5. Copy the training data displayed in Rasa Markdown format to your existing training data file.\n\
6. You can go back to 3 cells above this to enter new messages for which you want to generate paraphrases.")


Steps:
1. Read all proposed paraphrases below.
2. Select valid paraphrases that you would like to add to your NLU training data. Use Ctrl/Cmd + Click to select multiple.
3. Enter the name of the intent under which these messages should be categorized
4. Click 'Add to training data'
5. Copy the training data displayed in Rasa Markdown format to your existing training data file.
6. You can go back to 3 cells above this to enter new messages for which you want to generate paraphrases.


In [19]:
paraphrase_input_widget = widgets.SelectMultiple(
    options=paraphrase,
    value=[],
    rows=number_of_samples,
    description='Paraphrases',
    disabled=False,
    layout= widgets.Layout(width='100%')
)
display(paraphrase_input_widget)

intent = widgets.Text(description="Intent")
display(intent)

button = widgets.Button(description="Add to Training Data")
widget_output = widgets.Output()

display(button, widget_output)

def on_button_click(b):
    global complete_td
    with widget_output:
        intent_value = intent.value
        selected_paraphrase = paraphrase_input_widget.value
        if not len(selected_paraphrase):
            print("Error: You haven't selected any paraphrases")
            return
        if not intent_value:
            print("Error: Please enter the intent name under which these messages should be categorized.")
            return
        all_messages = [Message.build(text=input_phrase, intent=intent_value)]
        for p in selected_paraphrase:
            all_messages.append(Message.build(text=p,intent=intent_value))
        complete_td = complete_td.merge(TrainingData(training_examples=all_messages)) 
        print(complete_td.nlu_as_markdown())
button.on_click(on_button_click)

SelectMultiple(description='Paraphrases', layout=Layout(width='100%'), options=('the amazon sagemaker was an e…

Text(value='', description='Intent')

Button(description='Add to Training Data', style=ButtonStyle())

Output()