# **Import/Installation requirements**

In [None]:
import pandas as pd
import numpy as np
import json
import math
import re
from collections import Counter
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import wordnet

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package biocreative_ppi to
[nltk_data]    |     /root/nltk_data...
[nltk_data]    |   Unzipping corpora/biocreative_ppi.zip.
[nltk_data]    | Downloading package brown to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown.zip.
[nltk_data]    | Downloading package brown_tei to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/brown_tei.zip.
[nltk_data]    | Downloading package cess_cat to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_cat.zip.
[nltk_data]    | Downloading package cess_esp to /root/nltk_data...
[nltk_data]    |   Unzipping corpora/cess_esp.zip.
[nltk_data]    | Downloading package chat80 to /root/nltk_data...
[nltk_data]    |   Unzipp

In [None]:
!pip install nlpaug

Collecting nlpaug
[?25l  Downloading https://files.pythonhosted.org/packages/eb/f8/b11caecdd19aa2b1b2cb46c6cbbec692abd621aad884e653e459a8546add/nlpaug-1.1.3-py3-none-any.whl (394kB)
[K     |▉                               | 10kB 21.0MB/s eta 0:00:01[K     |█▋                              | 20kB 29.6MB/s eta 0:00:01[K     |██▌                             | 30kB 21.3MB/s eta 0:00:01[K     |███▎                            | 40kB 17.1MB/s eta 0:00:01[K     |████▏                           | 51kB 15.9MB/s eta 0:00:01[K     |█████                           | 61kB 18.1MB/s eta 0:00:01[K     |█████▉                          | 71kB 14.0MB/s eta 0:00:01[K     |██████▋                         | 81kB 14.9MB/s eta 0:00:01[K     |███████▌                        | 92kB 13.4MB/s eta 0:00:01[K     |████████▎                       | 102kB 14.4MB/s eta 0:00:01[K     |█████████▏                      | 112kB 14.4MB/s eta 0:00:01[K     |██████████                      | 122kB 14.4MB

In [None]:
import nlpaug.augmenter.char as nac
import nlpaug.augmenter.word as naw
import nlpaug.augmenter.sentence as nas
import nlpaug.flow as naf

from nlpaug.util import Action

# **Original Data**

In [None]:
# Load the data
!gsutil cp gs://boolq/train.jsonl .
!gsutil cp gs://boolq/dev.jsonl .
!gsutil cp gs://boolq/test.jsonl .

trainData = pd.read_json("/content/train.jsonl", lines=True, orient='records')
valData = pd.read_json("/content/dev.jsonl", lines=True, orient="records")
testData = pd.read_json("/content/test.jsonl", lines=True, orient="records")

# Train 
trainPassages = trainData.passage.values
trainQuestions = trainData.question.values
trainAnswers = trainData.answer.values.astype(int)

# Validation
valPassages = valData.passage.values
valQuestions = valData.question.values
valAnswers = valData.answer.values.astype(int)

# Test
testPassages = testData.passage.values
testQuestions = testData.question.values

Copying gs://boolq/train.jsonl...
- [1 files][  6.2 MiB/  6.2 MiB]                                                
Operation completed over 1 objects/6.2 MiB.                                      
Copying gs://boolq/dev.jsonl...
- [1 files][  2.1 MiB/  2.1 MiB]                                                
Operation completed over 1 objects/2.1 MiB.                                      
Copying gs://boolq/test.jsonl...
/ [1 files][  2.1 MiB/  2.1 MiB]                                                
Operation completed over 1 objects/2.1 MiB.                                      


# **Cosine Similarity**

In [None]:
# Tokenizer pattern: each maximal run of "word" characters (\w+) is one token.
WORD = re.compile(r"\w+")

def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse count vectors.

    Args:
        vec1: mapping from token to numeric weight (e.g. a ``Counter``).
        vec2: mapping from token to numeric weight.

    Returns:
        The dot product over the shared keys divided by the product of the
        two Euclidean norms, as a float. ``0.0`` is returned when that
        product is zero (for example when either mapping is empty).
    """
    shared_keys = set(vec1) & set(vec2)
    dot_product = sum(vec1[key] * vec2[key] for key in shared_keys)

    squared_norm1 = sum(weight ** 2 for weight in vec1.values())
    squared_norm2 = sum(weight ** 2 for weight in vec2.values())
    norm_product = math.sqrt(squared_norm1) * math.sqrt(squared_norm2)

    # Guard against division by zero for all-zero / empty vectors.
    if norm_product:
        return float(dot_product) / norm_product
    return 0.0


def text_to_vector(text):
    """Turn a string into a bag-of-words ``Counter``.

    Tokens are the matches of the module-level ``WORD`` pattern; the returned
    ``Counter`` maps each token to the number of times it occurs in ``text``.
    """
    token_counts = Counter()
    token_counts.update(WORD.findall(text))
    return token_counts


# Sanity check of the two helpers on a pair of similar sentences.
text1 = "This is a foo bar sentence ."
text2 = "This sentence is similar to a foo bar sentence ."

# Bag-of-words count vectors for both sentences.
vector1 = text_to_vector(text1)
vector2 = text_to_vector(text2)

# Similarity of the two count vectors.
cosine = get_cosine(vector1, vector2)

print("Cosine:", cosine)

Cosine: 0.8616404368553293


# **Data using pattern matching**

In [None]:
def data_maker_pattern_question(dataset):
  """Reduce every question to its tagged "content" words.

  Each question is tokenized with ``word_tokenize`` and tagged with
  ``nltk.pos_tag``; only the words whose tag is one of
  ``NN``, ``VB``, ``VBP``, ``VBZ`` or ``JJ`` are kept, in their original
  order, and joined with single spaces.

  Args:
    dataset: iterable of question strings.

  Returns:
    A list with one reduced string per input question. A question with no
    matching words (including an empty or whitespace-only question) yields
    an empty string.
  """
  keep_tags = ('NN', 'VB', 'VBP', 'VBZ', 'JJ')
  reduced_questions = []
  for sent in dataset:
      # The previous version also evaluated `sent.split()[0]` into an unused
      # variable, which raised IndexError for empty questions; it is gone.
      tagged = nltk.pos_tag(word_tokenize(sent))
      kept_words = [word for word, tag in tagged if tag in keep_tags]
      reduced_questions.append(" ".join(kept_words))
  return reduced_questions

def data_maker_pattern_passage(dataset, new_questions):
  """Keep only the passage sentences that mention a word of the question.

  For every index ``i`` the passage ``dataset[i]`` is split into sentences
  with ``sent_tokenize``; a sentence is kept when at least one
  whitespace-separated word of ``new_questions[i]`` occurs in it
  (case-sensitive substring test).

  Args:
    dataset: indexable collection of passage strings, at least as long as
      ``new_questions``.
    new_questions: sequence of reduced question strings (space-separated
      words), aligned with ``dataset``.

  Returns:
    A list with one string per question: the kept sentences joined with a
    single space, or an empty string when nothing matched.
  """
  reduced_passages = []
  for i, question in enumerate(new_questions):
    # Split into words. Iterating the string directly would test single
    # characters, which makes nearly every sentence match.
    keywords = question.split()
    kept_sentences = [
        sent for sent in sent_tokenize(dataset[i])
        if any(word in sent for word in keywords)
    ]
    # Join with a space so consecutive sentences do not run together.
    reduced_passages.append(" ".join(kept_sentences))
  return reduced_passages

# **Data Augmentation using WordNet**

In [None]:
def data_maker_wordnet(questions, n_candidates=5):
  """Augment each question with WordNet synonyms via ``naw.SynonymAug``.

  For every question ``n_candidates`` augmented variants are generated and
  the one with the highest ``get_cosine`` bag-of-words similarity to the
  original question is kept.

  Args:
    questions: sequence of question strings.
    n_candidates: number of augmented variants generated per question
      (defaults to 5, the value previously hard-coded).

  Returns:
    A numpy array with one selected string per question. When no variant
    scores above 0 the original question is returned unchanged (previously
    an empty string was emitted in that case).
  """
  aug_syn = naw.SynonymAug(aug_src='wordnet')
  new_data = []
  for i, text in enumerate(questions):
    if i % 500 == 0:
      print(i)  # coarse progress indicator
    # The original question's vector does not change between candidates.
    original_vector = text_to_vector(text)
    best_score = 0
    best_sentence = text  # fallback when every candidate scores 0
    for _ in range(n_candidates):
      candidate = aug_syn.augment(text)
      # NOTE(review): some nlpaug releases appear to return a list of strings
      # rather than a single string; unwrap so the tokenizer gets a str.
      if isinstance(candidate, (list, tuple)):
        candidate = candidate[0] if candidate else ""
      score = get_cosine(original_vector, text_to_vector(candidate))
      if best_score < score:
        best_score = score
        best_sentence = candidate
    new_data.append(best_sentence)
  return np.array(new_data)

# **Data Augmentation using BERT embeddings**

In [None]:
# Install Hugging Face transformers before the BERT-based augmenter below is
# used. The version is not pinned; the recorded run of this cell resolved to
# transformers 4.5.1.
!pip install transformers
def data_maker_bert(questions, top_k=20, action='insert', n_candidates=5):
  """Augment each question with ``naw.ContextualWordEmbsAug`` (bert-base-uncased).

  For every question ``n_candidates`` augmented variants are generated and
  the one with the highest ``get_cosine`` bag-of-words similarity to the
  original question is kept.

  Args:
    questions: sequence of question strings.
    top_k: ``top_k`` value forwarded to the augmenter (defaults to 20, the
      value previously hard-coded; the original note says the library
      default is 100).
    action: augmenter action, ``'insert'`` by default; the original code
      mentions ``"substitute"`` as the alternative.
    n_candidates: number of augmented variants generated per question
      (defaults to 5, the value previously hard-coded).

  Returns:
    A numpy array with one selected string per question. When no variant
    scores above 0 the original question is returned unchanged (previously
    an empty string was emitted in that case).
  """
  aug_bert = naw.ContextualWordEmbsAug(
      model_path='bert-base-uncased',
      #device='cuda',
      action=action, top_k=top_k)
  new_data = []
  for i, text in enumerate(questions):
    if i % 500 == 0:
      print(i)  # coarse progress indicator
    # The original question's vector does not change between candidates.
    original_vector = text_to_vector(text)
    best_score = 0
    best_sentence = text  # fallback when every candidate scores 0
    for _ in range(n_candidates):
      candidate = aug_bert.augment(text)
      # NOTE(review): some nlpaug releases appear to return a list of strings
      # rather than a single string; unwrap so the tokenizer gets a str.
      if isinstance(candidate, (list, tuple)):
        candidate = candidate[0] if candidate else ""
      score = get_cosine(original_vector, text_to_vector(candidate))
      if best_score < score:
        best_score = score
        best_sentence = candidate
    new_data.append(best_sentence)
  return np.array(new_data)

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 13.5MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 54.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 50.5MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


# **Making the Json**

In [None]:
def json_file(trainQuestions, trainPassages, trainAnswers, file):
  """Write aligned questions, passages and answers to a JSON-lines file.

  The output file is ``"Augmented_" + file + ".jsonl"`` in the current
  working directory; an existing file of that name is overwritten. Each line
  is a JSON object with the keys ``question``, ``passage`` and ``answer``
  (the answer converted with ``int``).

  Args:
    trainQuestions: iterable of question strings.
    trainPassages: iterable of passage strings, aligned with the questions.
    trainAnswers: iterable of answers convertible with ``int``.
    file: name fragment inserted into the output file name.

  Returns:
    None. The three iterables are combined with ``zip``, so writing stops at
    the shortest one.
  """
  out_path = "Augmented_" + file + ".jsonl"
  # The context manager closes (and therefore flushes) the file even if a
  # record fails to serialize; the handle was previously never closed.
  with open(out_path, 'w', encoding='utf-8') as augData:
    for question, passage, answers in zip(trainQuestions, trainPassages, trainAnswers):
      record = {
          'question': question,
          'passage': passage,
          'answer': int(answers),
      }
      # One JSON object per line.
      augData.write(json.dumps(record) + '\n')

# **Make the Data**

In [None]:
#for data using pattern matching
# Each split: reduce the questions to their tagged content words, then keep
# only the passage sentences that mention one of those words.
new_data_questions = np.array(data_maker_pattern_question(trainQuestions))
new_data_passages = np.array(data_maker_pattern_passage(trainPassages, new_data_questions))

new_val_questions = np.array(data_maker_pattern_question(valQuestions))
new_val_passages = np.array(data_maker_pattern_passage(valPassages, new_val_questions))

# Built from the *test* split; these two lines previously re-used the
# validation questions and passages (copy-paste slip).
new_test_questions = np.array(data_maker_pattern_question(testQuestions))
new_test_passages = np.array(data_maker_pattern_passage(testPassages, new_test_questions))

# Only train and validation are written: json_file needs an answer per row.
json_file(new_data_questions, new_data_passages, trainAnswers, "pattern_matching_train")
json_file(new_val_questions, new_val_passages, valAnswers, "pattern_matching_val")


# Data augmentation using WordNet synonyms.
# Only the questions are augmented; the original passages and answers are
# written alongside them.
new_wordnet_data_questions = data_maker_wordnet(trainQuestions)
new_wordnet_val_questions = data_maker_wordnet(valQuestions)
new_wordnet_test_questions = data_maker_wordnet(testQuestions)

# Writes Augmented_wordnet_train.jsonl / Augmented_wordnet_val.jsonl.
# No file is written for the augmented test questions here.
json_file(new_wordnet_data_questions, trainPassages, trainAnswers, "wordnet_train")
json_file(new_wordnet_val_questions, valPassages, valAnswers, "wordnet_val")


# Data augmentation using the BERT contextual-embedding augmenter.
# Only the questions are augmented; the original passages and answers are
# written alongside them.
new_bert_data_questions = data_maker_bert(trainQuestions)
new_bert_val_questions = data_maker_bert(valQuestions)
new_bert_test_questions = data_maker_bert(testQuestions)

# Writes Augmented_bert_train.jsonl / Augmented_bert_val.jsonl.
# No file is written for the augmented test questions here.
json_file(new_bert_data_questions, trainPassages, trainAnswers, "bert_train")
json_file(new_bert_val_questions, valPassages, valAnswers, "bert_val")


# Data augmentation using WordNet first, then BERT: the BERT augmenter is
# applied to the already WordNet-augmented questions computed earlier in this
# cell, so the WordNet step must have run before these lines.
new_word_BERT_data_questions = data_maker_bert(new_wordnet_data_questions)
new_word_BERT_val_questions = data_maker_bert(new_wordnet_val_questions)
new_word_BERT_test_questions = data_maker_bert(new_wordnet_test_questions)

# Writes Augmented_wordnet_bert_train.jsonl / Augmented_wordnet_bert_val.jsonl.
# No file is written for the augmented test questions here.
json_file(new_word_BERT_data_questions, trainPassages, trainAnswers, "wordnet_bert_train")
json_file(new_word_BERT_val_questions, valPassages, valAnswers, "wordnet_bert_val")



0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
0
500
1000
1500
2000
2500
3000
0
500
1000
1500
2000
2500
3000


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…


0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
8000
8500
9000
0
500
1000
1500
2000
2500
3000
0
500
1000
1500
2000
2500
3000
