# Import libraries

In [19]:
import warnings 
warnings.filterwarnings('ignore')
import torch
import torchvision
import pandas as pd
import nlpaug.augmenter.word as naw
from nlpaug.util import Action
pd.options.display.max_colwidth = 1000

# Read data

In [6]:
# Read the Excel workbook into a pandas DataFrame
df = pd.read_excel(io='./data-collected/ic-question-assessment-v2.xlsx')

# Preview the first rows as a quick sanity check
df.head()

Unnamed: 0,sentence,class
0,Help me find pharmacies around here,Find pharmacy


# Data Augmentation 

### Synonym replacement

In [7]:
def generate_synonym(df, num_sentences=1, aug_src = 'wordnet', aug_min = 1, aug_max = 3):
    """Expand every row of ``df`` with synonym-replaced variants of its sentence.

    For each input row the result holds the original sentence followed by
    ``num_sentences`` augmented copies, all carrying the row's ``class``.

    Args:
        df: DataFrame with ``sentence`` and ``class`` columns.
        num_sentences: Number of augmented sentences generated per input row.
        aug_src: Forwarded to ``naw.SynonymAug`` as ``aug_src``.
        aug_min: Forwarded to ``naw.SynonymAug`` as ``aug_min``.
        aug_max: Forwarded to ``naw.SynonymAug`` as ``aug_max``.

    Returns:
        DataFrame with columns ``sentence``, ``class`` and ``method`` and a
        fresh 0..n-1 index (no duplicated index labels).
    """
    augmenter_synonym = naw.SynonymAug(aug_src=aug_src, aug_min=aug_min, aug_max=aug_max)

    def _as_text(result):
        # augment() hands back a list of strings; unwrap it directly instead of
        # stringifying the list and stripping "[", "]" and quotes, which also
        # destroyed genuine apostrophes/brackets inside the sentence.
        if isinstance(result, (list, tuple)):
            return ', '.join(str(item) for item in result)
        return str(result)

    # Collect plain records and build the frame once: DataFrame.append is gone
    # in pandas 2.0 and re-copies the whole frame on every iteration.
    records = []
    for sentence, label in zip(df['sentence'], df['class']):
        # Every row, including the untouched original, is tagged 'synonym'.
        records.append({'sentence': str(sentence), 'class': label, 'method': 'synonym'})
        for _ in range(num_sentences):
            records.append({
                'sentence': _as_text(augmenter_synonym.augment(sentence)),
                'class': label,
                'method': 'synonym',
            })

    return pd.DataFrame(records, columns=['sentence', 'class', 'method'])

In [9]:
# Generate 10 synonym variants per input sentence and preview the first rows
data_synonym = generate_synonym(df, num_sentences=10)
data_synonym.head(n=10)

Unnamed: 0,sentence,class,method
0,Help me find pharmacies around here,Find pharmacy,synonym
1,Help me find pharmacies around hither,Find pharmacy,synonym
2,Assist me find pharmacy around here,Find pharmacy,synonym
3,Help pine tree state find pharmacies around hither,Find pharmacy,synonym
4,Help maine find pharmacies around hither,Find pharmacy,synonym
5,Help me find apothecarys shop around hither,Find pharmacy,synonym
6,Help me find apothecarys shop around hither,Find pharmacy,synonym
7,Facilitate maine find pharmacies around here,Find pharmacy,synonym
8,Help me discover chemists shop around here,Find pharmacy,synonym
9,Help pine tree state find apothecarys shop around here,Find pharmacy,synonym


### Random swap

In [10]:
def generate_swap(df, num_sentences=1, aug_min = 1, aug_max = 1):
    """Expand every row of ``df`` with word-swapped variants of its sentence.

    For each input row the result holds the original sentence followed by
    ``num_sentences`` augmented copies, all carrying the row's ``class``.
    Any other input columns (e.g. ``method``) are not carried over.

    Args:
        df: DataFrame with ``sentence`` and ``class`` columns.
        num_sentences: Number of augmented sentences generated per input row.
        aug_min: Forwarded to ``naw.RandomWordAug`` as ``aug_min``.
        aug_max: Forwarded to ``naw.RandomWordAug`` as ``aug_max``.

    Returns:
        DataFrame with columns ``sentence`` and ``class`` and a fresh
        0..n-1 index (no duplicated index labels).
    """
    augmenter_swap = naw.RandomWordAug(action=Action.SWAP, aug_min=aug_min, aug_max=aug_max)

    def _as_text(result):
        # Unwrap the list returned by augment() instead of stringifying it and
        # stripping brackets/quotes, which also removed real apostrophes.
        if isinstance(result, (list, tuple)):
            return ', '.join(str(item) for item in result)
        return str(result)

    # Build all rows first and create the frame once; DataFrame.append was
    # removed in pandas 2.0 and is quadratic when called in a loop.
    sentences, labels = [], []
    for sentence, label in zip(df['sentence'], df['class']):
        variants = [str(sentence)]
        variants.extend(_as_text(augmenter_swap.augment(sentence)) for _ in range(num_sentences))
        sentences.extend(variants)
        labels.extend([label] * len(variants))

    return pd.DataFrame({'sentence': sentences, 'class': labels}, columns=['sentence', 'class'])

In [11]:
# Generate 10 word-swap variants per input sentence and preview the first rows
data_swap = generate_swap(df, num_sentences=10)
data_swap.head(n=10)

Unnamed: 0,sentence,class
0,Help me find pharmacies around here,Find pharmacy
1,Me help find pharmacies around here,Find pharmacy
2,Me help find pharmacies around here,Find pharmacy
3,Me help find pharmacies around here,Find pharmacy
4,Help me find pharmacies here around,Find pharmacy
5,Help find me pharmacies around here,Find pharmacy
6,Help me pharmacies find around here,Find pharmacy
7,Help me pharmacies find around here,Find pharmacy
8,Help me find around pharmacies here,Find pharmacy
9,Help me find pharmacies here around,Find pharmacy


### Random deletion

In [12]:
def generate_deletion(df, num_sentences=1, aug_p=0.3, aug_min = 1, aug_max = 1):
    """Expand every row of ``df`` with variants that have random words deleted.

    For each input row the result holds the original sentence followed by
    ``num_sentences`` augmented copies, all carrying the row's ``class``.

    Args:
        df: DataFrame with ``sentence`` and ``class`` columns.
        num_sentences: Number of augmented sentences generated per input row.
        aug_p: Forwarded to ``naw.RandomWordAug`` as ``aug_p``.
        aug_min: Forwarded to ``naw.RandomWordAug`` as ``aug_min``.
        aug_max: Forwarded to ``naw.RandomWordAug`` as ``aug_max``.

    Returns:
        DataFrame with columns ``sentence`` and ``class`` and a 0..n-1 index.
    """
    augmenter_delete = naw.RandomWordAug(action=Action.DELETE, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max)

    def _as_text(result):
        # augment() returns a list of strings; take its content directly rather
        # than str()-ing the list and stripping brackets/quotes, which also
        # erased apostrophes that belong to the sentence.
        if isinstance(result, (list, tuple)):
            return ', '.join(str(item) for item in result)
        return str(result)

    # Accumulate records and build the frame once: DataFrame.append no longer
    # exists in pandas 2.0 and copied the whole frame on each call.
    records = []
    for sentence, label in zip(df['sentence'], df['class']):
        records.append((str(sentence), label))
        records.extend(
            (_as_text(augmenter_delete.augment(sentence)), label)
            for _ in range(num_sentences)
        )

    return pd.DataFrame(records, columns=['sentence', 'class'])


In [13]:
# Generate 10 random-deletion variants per input sentence and preview the first rows
data_deleted = generate_deletion(df, num_sentences=10)
data_deleted.head(n=10)

Unnamed: 0,sentence,class
0,Help me find pharmacies around here,Find pharmacy
1,Help me find pharmacies around,Find pharmacy
2,Me find pharmacies around here,Find pharmacy
3,Help me find pharmacies here,Find pharmacy
4,Help me find pharmacies around,Find pharmacy
5,Me find pharmacies around here,Find pharmacy
6,Help find pharmacies around here,Find pharmacy
7,Help find pharmacies around here,Find pharmacy
8,Help find pharmacies around here,Find pharmacy
9,Help me find around here,Find pharmacy


### Pipeline

In [14]:
# Chain the three augmenters: synonym -> swap -> deletion.
df1 = generate_synonym(df, num_sentences=10)
df2 = generate_swap(df1, num_sentences=2)
# NOTE(review): the positional form of this call bound 1 to aug_p and 2 to
# aug_min (aug_max keeps its default of 1). Confirm that aug_min=1, aug_max=2
# was not what was actually intended.
df3 = generate_deletion(df2, num_sentences=2, aug_p=1, aug_min=2)

In [15]:
# Report the size of the fully augmented frame, then preview its first 20 rows
print(df3.shape)
df3.head(n=20)

(99, 2)


Unnamed: 0,sentence,class
0,Help me find pharmacies around here,Find pharmacy
1,Help me find around here,Find pharmacy
2,Help me pharmacies around here,Find pharmacy
3,Me help find pharmacies around here,Find pharmacy
4,Me help find pharmacies here,Find pharmacy
5,Me find pharmacies around here,Find pharmacy
6,Help find me pharmacies around here,Find pharmacy
7,Find me pharmacies around here,Find pharmacy
8,Help find me pharmacies around,Find pharmacy
9,Help pine tree state find pharmacies around hither,Find pharmacy


### Insert word by contextual word embeddings (BERT, DistilBERT, RoBERTa or XLNet)

In [23]:
# Demo: insert words chosen by bert-base-uncased into a sample sentence
text = "The quick brown fox jumps over the lazy dog"
aug = naw.ContextualWordEmbsAug(action="insert", model_path='bert-base-uncased')
augmented_text = aug.augment(text)
# One print call, one item per line (same output as four separate prints)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
['except the lazy quick brown fox jumps over the lazy yellow dog']


In [16]:
def generate_insert(df, num_sentences=1, aug_p=0.3, aug_min = 1, aug_max = 1):
    """Expand every row of ``df`` with variants that have words inserted by BERT.

    Uses ``naw.ContextualWordEmbsAug`` with ``bert-base-uncased`` and
    ``action="insert"``. For each input row the result holds the original
    sentence followed by ``num_sentences`` augmented copies, all carrying the
    row's ``class``.

    Args:
        df: DataFrame with ``sentence`` and ``class`` columns.
        num_sentences: Number of augmented sentences generated per input row.
        aug_p: Accepted for signature parity; not forwarded to the augmenter.
        aug_min: Accepted for signature parity; not forwarded to the augmenter.
        aug_max: Accepted for signature parity; not forwarded to the augmenter.

    Returns:
        DataFrame with columns ``sentence`` and ``class`` and a 0..n-1 index.
    """
    augmenter_insert = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="insert")

    def _as_text(result):
        # augment() returns a list of strings; unwrap it instead of str()-ing
        # the list and stripping brackets/quotes, which also deleted genuine
        # apostrophes and brackets from the text.
        if isinstance(result, (list, tuple)):
            return ', '.join(str(item) for item in result)
        return str(result)

    # Collect rows and build the frame once: DataFrame.append was removed in
    # pandas 2.0 and is quadratic inside a loop.
    records = []
    for sentence, label in zip(df['sentence'], df['class']):
        records.append((str(sentence), label))
        for _ in range(num_sentences):
            records.append((_as_text(augmenter_insert.augment(sentence)), label))

    return pd.DataFrame(records, columns=['sentence', 'class'])


In [20]:
# Generate 10 BERT-insertion variants per input sentence and display the result
df4 = generate_insert(df, num_sentences=10)
df4

Unnamed: 0,sentence,class
0,Help me find pharmacies around here,Find pharmacy
1,his help helps me find the pharmacies around here,Find pharmacy
2,you help me find nice pharmacies around the here,Find pharmacy
3,come help me find pharmacies scattered around over here,Find pharmacy
4,just help with me find pharmacies here around here,Find pharmacy
5,help me and find pharmacies poking around over here,Find pharmacy
6,christ help me... find pharmacies around around here,Find pharmacy
7,so help me find your pharmacies all around here,Find pharmacy
8,please help me find the pharmacies around up here,Find pharmacy
9,help guides me in find pharmacies around suburban here,Find pharmacy


### Substitute word by contextual word embeddings (BERT, DistilBERT, RoBERTa or XLNet)

In [24]:
# Demo: substitute words using bert-base-uncased.
# Relies on `text` already being defined by a previously run cell.
aug = naw.ContextualWordEmbsAug(action="substitute", model_path='bert-base-uncased')
augmented_text = aug.augment(text)
# One print call, one item per line (same output as four separate prints)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
['the little brown man jumps over his lazy dog']


In [25]:
def generate_substitute_bert_model(df, num_sentences=1, aug_p=0.3, aug_min = 1, aug_max = 1):
    """Expand every row of ``df`` with variants whose words are substituted by BERT.

    Uses ``naw.ContextualWordEmbsAug`` with ``bert-base-uncased`` and
    ``action="substitute"``. For each input row the result holds the original
    sentence followed by ``num_sentences`` augmented copies, all carrying the
    row's ``class``.

    Args:
        df: DataFrame with ``sentence`` and ``class`` columns.
        num_sentences: Number of augmented sentences generated per input row.
        aug_p: Accepted for signature parity; not forwarded to the augmenter.
        aug_min: Accepted for signature parity; not forwarded to the augmenter.
        aug_max: Accepted for signature parity; not forwarded to the augmenter.

    Returns:
        DataFrame with columns ``sentence`` and ``class`` and a 0..n-1 index.
    """
    # Contextual-embedding augmenter in substitution mode
    augmenter_substitute = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', action="substitute")

    def _as_text(result):
        # augment() returns a list of strings; unwrap it instead of str()-ing
        # the list and stripping brackets/quotes, which also deleted genuine
        # apostrophes and brackets from the text.
        if isinstance(result, (list, tuple)):
            return ', '.join(str(item) for item in result)
        return str(result)

    # Gather all rows, then build the frame once: DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop.
    sentences, labels = [], []
    for sentence, label in zip(df['sentence'], df['class']):
        sentences.append(str(sentence))
        sentences.extend(_as_text(augmenter_substitute.augment(sentence)) for _ in range(num_sentences))
        labels.extend([label] * (num_sentences + 1))

    return pd.DataFrame({'sentence': sentences, 'class': labels}, columns=['sentence', 'class'])


In [26]:
# Generate 10 BERT-substitution variants per input sentence and display the result
df5 = generate_substitute_bert_model(df, num_sentences=10)
df5

Unnamed: 0,sentence,class
0,Help me find pharmacies around here,Find pharmacy
1,make me open pharmacies right here,Find pharmacy
2,help me show him around...,Find pharmacy
3,help him find things like here,Find pharmacy
4,help to find out down here,Find pharmacy
5,help me guide pharmacies online because,Find pharmacy
6,help me catch anything around will,Find pharmacy
7,help me keep something around ॥,Find pharmacy
8,help people find pharmacies clean and,Find pharmacy
9,these people find pharmacies around 。,Find pharmacy


### Substitute word by contextual word embeddings (DistilBERT)

In [27]:
# Demo: substitute words using distilbert-base-uncased.
# Relies on `text` already being defined by a previously run cell.
aug = naw.ContextualWordEmbsAug(action="substitute", model_path='distilbert-base-uncased')
augmented_text = aug.augment(text)
# One print call, one item per line (same output as four separate prints)
print("Original:", text, "Augmented Text:", augmented_text, sep="\n")

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.0kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 483/483 [00:00<00:00, 473kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 286kB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:01<00:00, 301kB/s]
Downloading pytorch_model.bin: 100%|██████████| 268M/268M [01:41<00:00, 2.65MB/s] 


Original:
The quick brown fox jumps over the lazy dog
Augmented Text:
['the quick brown fox jumps whenever the hopping mouse']


In [None]:
def generate_substitute_distilbert_model(df, num_sentences=1, aug_p=0.3, aug_min = 1, aug_max = 1):
    """Expand every row of ``df`` with variants whose words are substituted by DistilBERT.

    Uses ``naw.ContextualWordEmbsAug`` with ``distilbert-base-uncased`` and
    ``action="substitute"``. For each input row the result holds the original
    sentence followed by ``num_sentences`` augmented copies, all carrying the
    row's ``class``.

    Args:
        df: DataFrame with ``sentence`` and ``class`` columns.
        num_sentences: Number of augmented sentences generated per input row.
        aug_p: Accepted for signature parity; not forwarded to the augmenter.
        aug_min: Accepted for signature parity; not forwarded to the augmenter.
        aug_max: Accepted for signature parity; not forwarded to the augmenter.

    Returns:
        DataFrame with columns ``sentence`` and ``class`` and a 0..n-1 index.
    """
    # Contextual-embedding augmenter in substitution mode
    augmenter_substitute = naw.ContextualWordEmbsAug(model_path='distilbert-base-uncased', action="substitute")

    def _as_text(result):
        # augment() returns a list of strings; unwrap it instead of str()-ing
        # the list and stripping brackets/quotes, which also deleted genuine
        # apostrophes and brackets from the text.
        if isinstance(result, (list, tuple)):
            return ', '.join(str(item) for item in result)
        return str(result)

    # Gather all rows, then build the frame once: DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop.
    records = []
    for sentence, label in zip(df['sentence'], df['class']):
        variants = [str(sentence)]
        variants += [_as_text(augmenter_substitute.augment(sentence)) for _ in range(num_sentences)]
        records.extend({'sentence': variant, 'class': label} for variant in variants)

    return pd.DataFrame(records, columns=['sentence', 'class'])


In [None]:
# Generate 10 DistilBERT-substitution variants per input sentence and display the result
df6 = generate_substitute_distilbert_model(df, num_sentences=10)
df6

### Back Translation Augmenter

In [32]:
import nlpaug.augmenter.word as naw

# Demo: round-trip a sample sentence through the wmt19 en-de / de-en models,
# requesting 10 outputs from a single augment() call
text = 'The quick brown fox jumped over the lazy dog'
back_translation_aug = naw.BackTranslationAug(
    to_model_name='facebook/wmt19-de-en',
    from_model_name='facebook/wmt19-en-de',
)
back_translation_aug.augment(text, n=10)

['The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog',
 'The speedy brown fox leapt over the lazy dog']

In [33]:
def generate_back_translation(df, num_sentences=1, aug_p=0.3, aug_min = 1, aug_max = 1):
    """Expand every row of ``df`` with back-translated variants of its sentence.

    Uses ``naw.BackTranslationAug`` with the ``facebook/wmt19-en-de`` and
    ``facebook/wmt19-de-en`` models. For each input row the result holds the
    original sentence followed by ``num_sentences`` augmented copies, all
    carrying the row's ``class``.

    Args:
        df: DataFrame with ``sentence`` and ``class`` columns.
        num_sentences: Number of augmented sentences generated per input row.
        aug_p: Accepted for signature parity; not forwarded to the augmenter.
        aug_min: Accepted for signature parity; not forwarded to the augmenter.
        aug_max: Accepted for signature parity; not forwarded to the augmenter.

    Returns:
        DataFrame with columns ``sentence`` and ``class`` and a 0..n-1 index.
    """
    # Back-translation augmenter (source -> pivot -> source language)
    back_translation_aug = naw.BackTranslationAug(
        from_model_name='facebook/wmt19-en-de',
        to_model_name='facebook/wmt19-de-en',
    )

    def _as_text(result):
        # augment() returns a list of strings; unwrap it instead of str()-ing
        # the list and stripping brackets/quotes, which also deleted genuine
        # apostrophes and brackets from the text.
        if isinstance(result, (list, tuple)):
            return ', '.join(str(item) for item in result)
        return str(result)

    # Gather all rows, then build the frame once: DataFrame.append was removed
    # in pandas 2.0 and is quadratic inside a loop.
    # NOTE(review): the notebook's sample output shows the same translation for
    # every repeat of a sentence; consider de-duplicating if that always holds.
    records = []
    for sentence, label in zip(df['sentence'], df['class']):
        records.append((str(sentence), label))
        for _ in range(num_sentences):
            records.append((_as_text(back_translation_aug.augment(sentence)), label))

    return pd.DataFrame(records, columns=['sentence', 'class'])


In [34]:
# Generate 10 back-translated variants per input sentence and display the result
df7 = generate_back_translation(df, num_sentences=10)
df7

Unnamed: 0,sentence,class
0,Help me find pharmacies around here,Find pharmacy
1,Help me find pharmacies here,Find pharmacy
2,Help me find pharmacies here,Find pharmacy
3,Help me find pharmacies here,Find pharmacy
4,Help me find pharmacies here,Find pharmacy
5,Help me find pharmacies here,Find pharmacy
6,Help me find pharmacies here,Find pharmacy
7,Help me find pharmacies here,Find pharmacy
8,Help me find pharmacies here,Find pharmacy
9,Help me find pharmacies here,Find pharmacy


# Save 