### Import libraries and utilities

In [19]:
! pip install transformers
! pip install underthesea
! pip install pattern

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 14.0 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 80.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 76.8 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 13.9 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [20]:
from nltk.tag import pos_tag

import nltk

nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')

from transformers import pipeline
from underthesea import word_tokenize
from gensim.models import KeyedVectors
from string import punctuation
import random
from math import ceil, floor
from collections import Counter
from tqdm import tqdm
import random
from nltk.corpus import wordnet as wn
from pattern.en import wordnet
from pattern.en import singularize, pluralize, comparative, superlative, conjugate
import re
import time

random.seed(42)
import numpy as np
np.random.seed(42)

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [21]:
from google.colab import drive
drive.mount('/gdrive')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


In [22]:
%cd /gdrive/My\ Drive/NLP_Project

/gdrive/My Drive/NLP_Project


In [23]:
# punctuation characters
extend_punc = punctuation + "…"

def check_punctuation(word):
    """
    Check if the word contains any punctuation or number. Return False if it does, True otherwise
    """
    # check punctuation 
    for sub_word in word.split("_"): # split "_" for 2-syllable vn words
        for spec_char in extend_punc:
            if spec_char in sub_word:
                return False

    # check number
    for char in word:
        if char.isdigit():
            return False

    return True

### Import pretrained models

In [24]:
# vietnamese word2vec model
vi_model = KeyedVectors.load_word2vec_format('src/Augmentation/Synonym replacement/word2vec/baomoi.model.bin', binary=True)

# english word2vec model
import gensim.downloader
en_model = gensim.downloader.load('word2vec-google-news-300')
# en_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True) 

# phobert model
unmasker = pipeline('fill-mask', model="vinai/phobert-base", top_k=5)



Downloading:   0%|          | 0.00/557 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/518M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### English data augmentation

#### Find a synonym of an english word using WordNet

In [25]:
def get_synonym_wn(word, tag):
    """
    Get synonym using synset
    word: str
    tag: one of these: [wn.ADJ, wn.NOUN, wn.VERB. wn.ADV]
    """
    syn_l = set()

    # Look up words in synset
    s = wn.synsets(word, tag)
    # print(s)
    
    # check if there are any synonyms in the set
    if len(s) > 0:
        base = s[0].lemmas()[0].name()
    else:
        return
    
    # black list words
    black_list = ["be"]
    if base in black_list:
        return
    
    # Iterate through synonyms
    for syns in s:
        # print("Synonym",syns)
        for lemma in syns.lemmas():
            # print("Lemma",lemma)
            synonym = lemma.name()
            syn_l.add(synonym)

    # if there are no synonyms then return None. Equals 0 if word is not in wordnet
    if len(syn_l) == 1 or len(syn_l) == 0:
        return
    # remove the original word from the set and pick a random synonym
    syn_l.remove(base)
    # if print_syn:
    # print(syn_l)
    similarities_dict = dict()
    for sub_word in syn_l:
      if sub_word  in en_model.vocab and word in  en_model.vocab:
        # print(sub_word)
        similarities_dict[sub_word.replace("_", " ")]  = en_model.similarity(word, sub_word)
      else:
        similarities_dict[sub_word] = 0
    
    sorted_words = sorted(similarities_dict.items(), key=lambda x: x[1], reverse = True)
    # print(sorted_words)
    chosen_word = sorted_words[0][0]
        # print(word)
    return chosen_word

In [26]:
get_synonym_wn('intelligent',wn.ADJ)

'levelheaded'

#### Substitute words in a list of words

In [27]:
def en_sub(word_list, sub_num):
    # Apply pos tagging on each words
    wl_with_pos = pos_tag(word_list)
    word_num = len(word_list)
	# choose random words to substitute
    repl_order = [i for i in range(word_num)]
    random.shuffle(repl_order)

	# substitute them
    repl_counter = 0
    for index in repl_order:
        
        # Stop replacing if number of substituted words are over sub_num
        if repl_counter >= sub_num:
            break
      
        word = wl_with_pos[index][0].lower()
        tag = wl_with_pos[index][1]

        # Only replace if the original word doesn't contain punctuations
        if check_punctuation(word):
            if tag == "JJ":                     #adjective
                if index != 0:
                    word_prev = wl_with_pos[index-1][0].lower()
                    tag_prev = wl_with_pos[index-1][1]

                    # check if this adj follows "more" e.g "more skillful"
                    if tag_prev == "RBR":  # comparative adverb (greater, more skillful)
                        synonym = get_synonym_wn(word, wn.ADJ)
                        if synonym:
                          # Convert to comparative form
                            synonym_comp = comparative(synonym)
                            # if synonym is 2 syllable or more then assign synonym, e.g "more skillful" -> "more refined", not "more more refined"
                            if "more" in synonym_comp:
                                word_list[index] = synonym
                                repl_counter += 1
                            # synonym is 1 syllable, remove preceeding "more" and assign synonym_comp, e.g "more skillful" -> "better"
                            else:
                                word_list[index-1] = ""
                                word_list[index] = synonym_comp
                                repl_counter += 1

                    # same with "most"
                    elif tag_prev == "RBS":  #adverb, superlative (biggest)
                        synonym = get_synonym_wn(word, wn.ADJ)
                        if synonym:
                            synonym_sup = superlative(synonym)
                            if "most" in synonym_sup:
                                word_list[index] = synonym
                                repl_counter += 1
                            else:
                                word_list[index-1] = ""
                                word_list[index] = synonym_sup
                                repl_counter += 1
                    else:
                        synonym = get_synonym_wn(word, wn.ADJ)
                        if synonym:
                            word_list[index] = synonym
                            repl_counter += 1
                else:
                    synonym = get_synonym_wn(word, wn.ADJ)
                    if synonym:
                        word_list[index] = synonym
                        repl_counter += 1
            
            elif tag == "JJR":     #adjective, comparative (larger)
                synonym = get_synonym_wn(word, wn.ADJ)
                if synonym:
                    # Switch to comparative
                    synonym_comp = comparative(synonym)
                    word_list[index] = synonym_comp
                    repl_counter += 1
            elif tag == "JJS":      #adjective, comparative (larger)
                synonym = get_synonym_wn(word, wn.ADJ)
                if synonym:
                    # Switch to superlative
                    synonym_sup = superlative(synonym)
                    word_list[index] = synonym_sup
                    repl_counter += 1
            elif tag == "NN":     # noun   
                synonym = get_synonym_wn(word, wn.NOUN)
                if synonym:
                    word_list[index] = synonym
                    repl_counter += 1
            elif tag == "NNS":   # plural nouns
                # If the original word is plural, pluralize the replacement as well
                synonym = get_synonym_wn(word, wn.NOUN)
                if synonym:
                    synonym_plur = pluralize(synonym)
                    word_list[index] = synonym_plur
                    repl_counter += 1
            elif tag == "RB":   #adverb
                # dont handle comparative and superlative adverb
                # 1-syllable comparative and superlative since pos_tag tags them as JJR and JJS :(
                synonym = get_synonym_wn(word, wn.ADV)
                if synonym:
                    word_list[index] = synonym
                    repl_counter += 1
            elif tag == "VB": # vanilla verb
                synonym = get_synonym_wn(word, wn.VERB)
                if synonym:
                    word_list[index] = synonym
                    repl_counter += 1
            elif tag == "VBD": # past tense
                synonym = get_synonym_wn(word, wn.VERB)
                if synonym:
                    # conjugate the replacement to past tense
                    synonym_past = conjugate(synonym, tense='p')
                    word_list[index] = synonym_past
                    repl_counter += 1
            elif tag == "VBG":  # verb ing (judging)
                synonym = get_synonym_wn(word, wn.VERB)
                if synonym:
                    # conjugate to verb ing
                    synonym_ing = conjugate(synonym, tense='part')
                    word_list[index] = synonym_ing
                    repl_counter += 1
            elif tag == "VBN":  # verb ing (judging)
                synonym = get_synonym_wn(word, wn.VERB)
                if synonym:
                    synonym_part = conjugate(synonym, tense='ppart')
                    word_list[index] = synonym_part
                    repl_counter += 1
            elif tag == "VBP":  # verb past participle (reunified)
                synonym = get_synonym_wn(word, wn.VERB)
                if synonym:
                    synonym_inf = conjugate(synonym, tense='inf')
                    word_list[index] = synonym_inf
                    repl_counter += 1
            elif tag == "VBZ":  # verb, present tense with 3rd person singular (bases)
                synonym = get_synonym_wn(word, wn.VERB)
                if synonym:
                    synonym_3rd = conjugate(synonym, tense='3sg')
                    word_list[index] = synonym_3rd
                    repl_counter += 1
    
    for i in range(len(word_list)):
      word = word_list[i].replace('_',' ')
      word_list[i] = word
    
    # remove extra space resulting from removing "more"    
    return re.sub(" +", " ", " ".join(word_list))

In [28]:
sentence = "I &apos;d like to talk to you today about the scale of the scientific effort that goes into making the headlines you see in the paper ."
# sentence = "He is very level-headed ."

sentence = sentence.replace("&#91;", "[")
sentence = sentence.replace("&#93;", "]")
sentence = sentence.replace("&amp;", "&")
sentence = sentence.replace("&apos;", "'")
sentence = sentence.replace("&quot;", "\"")
sentence = sentence.strip().split()
print(sentence)
sentence = en_sub(sentence, floor(len(sentence) * 0.3))
sentence

['I', "'d", 'like', 'to', 'talk', 'to', 'you', 'today', 'about', 'the', 'scale', 'of', 'the', 'scientific', 'effort', 'that', 'goes', 'into', 'making', 'the', 'headlines', 'you', 'see', 'in', 'the', 'paper', '.']


RuntimeError: ignored

### Vietnamese data augmentation

In [29]:
def vi_sub(word_list, sub_num):
    """
    Use BERT and Word2Vec to subsitute words in the word_list
    This is done by replacing a word in the word list with <mask> token and pass this to BERT to generate the candidates. These candidates are ranked by how similar they are to the original word using Word2Vec.
    Input:
            word_list: the input word list
            sub_num: the number of words to be substituted
    Output:
            the string with substituted words
    """
	# add _ between 2 or more syllable words to match Word2Vec vocab
    word_num = len(word_list)
    for i in range(word_num):
        word_list[i] = word_list[i].replace(" ", "_")
	# choose random words to substitute
    replace_index = random.sample(range(0, word_num), sub_num)

	# substitute them
    for index in replace_index:
        original_word = word_list[index].lower()
        # print("Chosen word ",original_word)
        
        # Only mask if word doesn't contain punctuations
        if check_punctuation(original_word):
            # Mask the word to be replaced
            word_list[index] = "<mask>"
            # print(" ".join(word_list) )
            candidates = {}
            try:
				        # compute similarity of each candidates from phoBERT
                for entry in unmasker(" ".join(word_list)):
                    cand = "".join(entry["token_str"].split(" ")).lower()
                    # print(cand)
                    if check_punctuation(cand):
                        score = vi_model.similarity(cand, original_word)
                        candidates[cand] = score

			        	# find the best one that is not the original word
                for cand in Counter(candidates).most_common(len(candidates)):
                    best_cand = cand[0].replace("_", " ")
                    if best_cand != original_word:
                        break

                word_list[index] = best_cand
            except Exception as e:
				# keep the original word if an error occurs
                word_list[index] = original_word
                # print(e)

    return " ".join(word_list).replace("_", " ")

In [30]:
sentence = "Tôi muốn cho các bạn biết về sự to lớn của những nỗ lực khoa học đã góp phần làm nên các dòng tít bạn thường thấy trên báo ."

sentence = sentence.replace("&#91;", "[")
sentence = sentence.replace("&#93;", "]")
sentence = sentence.replace("&amp;", "&")
sentence = sentence.replace("&apos;", "'")
sentence = sentence.replace("&quot;", "\"")
sentence = sentence.strip().split()
print(sentence)
sentence = vi_sub(sentence, floor(len(sentence) * 0.3))
sentence

['Tôi', 'muốn', 'cho', 'các', 'bạn', 'biết', 'về', 'sự', 'to', 'lớn', 'của', 'những', 'nỗ', 'lực', 'khoa', 'học', 'đã', 'góp', 'phần', 'làm', 'nên', 'các', 'dòng', 'tít', 'bạn', 'thường', 'thấy', 'trên', 'báo', '.']


'em muốn cho các bạn thấy về sự ảnh hưởng lớn đối với những nỗ sĩ khoa học đã góp sức làm nên các bài tít em hay thấy trên báo .'

### Augment the data

In [36]:
def augment(sentence, mode='vi'):
    """
    Augment the original sentecnnce
    """
    # replace these with the actual characters so that they dont mess up the word tokenizer
    sentence = sentence.replace("&#91;", "[")
    sentence = sentence.replace("&#93;", "]")
    sentence = sentence.replace("&amp;", "&")
    sentence = sentence.replace("&apos;", "'")
    sentence = sentence.replace("&quot;", "\"")

    if mode == "vi":
        sentence = word_tokenize(sentence)

        # the number of substitutions depends on the sentence length 
        word_num = len(sentence)
        if 3 <= word_num < 5:
            sentence = vi_sub(sentence, 1)
        elif 5 <= word_num < 7:
            sentence = vi_sub(sentence, 2)
        elif 7 <= word_num < 15:
            sentence = vi_sub(sentence, ceil(word_num * 0.3))
        elif 15 <= word_num:
            sentence = vi_sub(sentence, ceil(word_num * 0.1))
        else:
            sentence = " ".join(sentence).replace("_", " ")
    
    elif mode == "en":
        sentence = sentence.strip().split()
        sentence = en_sub(sentence, floor(len(sentence) * 0.3))

    # revert the inital changes
    # sentence = sentence.replace("[", "&#91;")
    # sentence = sentence.replace("]", "&#93;")
    # sentence = sentence.replace("&", "&amp;")
    # sentence = sentence.replace("'", "&apos;")
    # sentence = sentence.replace("\"", "&quot;")

    return sentence

#### We only augment 65k sentences of our data

In [34]:
vi[0], en[0]

('Không làm được gì hết "\n',
 'And to get there , we have to change our workplaces , our policies and our culture .\n')

In [46]:
random.seed(42)
with open('./data/processed/train/train.vi') as f:
    vi = f.readlines()
# random.shuffle(vi)
with open('./data/processed/train/train.en') as f:
    en = f.readlines()
# random.shuffle(en)
chosen_index = random.sample([i for i in range(len(vi))], 65000)
vi  = [vi[i] for i in chosen_index]
en  = [en[i] for i in chosen_index]
print(vi[0], en[0])
print(vi[-1], en[-1])

# with open('./data/processed/augmented/synonym replacement/vi_augment.txt', 'a') as f_vi:
#     with open('./data/processed/augmented/synonym replacement/en_middle.txt', 'a') as f_en:
#         for i in tqdm(range(len(vi))):
#             vi_sent = augment(vi[i].strip())
#             en_sent = en[i].strip()
#             f_vi.write(vi_sent + "\n")
#             f_en.write(en_sent + "\n")
#             # stop when 65k sentences is reached
#             if i == 65000:
#                 break

# with open('./data/original/augmented/synonym replacement/en_augment.txt', 'w') as f:
#     with open('./data/original/augmented/synonym replacement/en_middle.txt', 'r') as f_en:
#         en = f_en.read().split("\n")[:-1]
#     for line in tqdm(en):
#         f.write(augment(line, 'en') + "\n")

Chưa ai từng làm việc đó , thế nên tôi sẽ làm nó .
 Nobody 's ever done it before , so I 'm going to go do it .

Nhưng thật đáng ngạc nhiên là chúng ta lại dành ra quá ít thời gian để chăm lo cho điều quan trọng bậc nhất : thể cách mà tâm trí chúng ta hoạt động . Điều này , một lần nữa , lại là điều tối hậu xác định chất lượng kinh nghiệm sống của chúng ta .
 Yet , we spend surprisingly little time taking care of what matters most -- the way our mind functions -- which , again , is the ultimate thing that determines the quality of our experience .



In [48]:
with open('./data/processed/augmented/synonym_replacement/vi_augment.txt', 'w') as f_vi:
    with open('./data/processed/augmented/synonym_replacement/en_middle.txt', 'w') as f_en:
        for i in tqdm(range(len(vi))):
            vi_sent = augment(vi[i].strip())
            en_sent = en[i].strip()
            f_vi.write(vi_sent + "\n")
            f_en.write(en_sent + "\n")
            # stop when 65k sentences is reached
            if i == 65000:
                break

with open('./data/processed/augmented/synonym_replacement/en_augment.txt', 'w') as f:
    with open('./data/processed/augmented/synonym_replacement/en_middle.txt', 'r') as f_en:
        en = f_en.read().split("\n")[:-1]
    for line in tqdm(en):
        f.write(augment(line, 'en') + "\n")

  2%|▏         | 1241/65000 [04:10<4:14:16,  4.18it/s]Token indices sequence length is longer than the specified maximum sequence length for this model (310 > 256). Running this sequence through the model will result in indexing errors
100%|██████████| 65000/65000 [3:35:57<00:00,  5.02it/s]
100%|██████████| 65000/65000 [02:50<00:00, 382.04it/s]


In [None]:
en[0], vi[0]

('And that changed my mindset forever .\n', 'Tôi tên Zamin .\n')

#### Concatenate the original data with the augmented data

In [None]:
len(en), len(vi)

(133317, 133317)

In [57]:
with open('./data/processed/train/train.vi') as f:
    vi = f.read().split("\n")[:-1]
# random.shuffle(vi)
with open('./data/processed/train/train.en') as f:
    en = f.read().split("\n")[:-1]
# random.shuffle(en)



with open('./data/processed/augmented/synonym_replacement/en_augment.txt', "r") as f:
    en_aug_65 = f.read().split("\n")[:-1]
with open('./data/processed/augmented/synonym_replacement/vi_augment.txt', "r") as f:
    vi_aug_65 = f.read().split("\n")[:-1]

final_en = en + en_aug_65
final_vi = vi + vi_aug_65

In [58]:
vi[0], en[0]

('Khoa học đằng sau một tiêu đề về khí hậu',
 'Rachel Pike : The science behind a climate headline')

In [59]:
vi_aug_65[0:10], en_aug_65[0:10]

(['Chưa ai bao giờ làm việc đó , thế nhưng mình sẽ làm nó .',
  'Bây giờ chúng ta không biết chính xác con ong nhìn cái gì , bạn chỉ biết cái tôi đang nhìn thấy và tôi gọi nó là màu đỏ .',
  'Và vào thời điểm đó , con phật đã được cảm hoá trở thành Phật của tình yêu và lòng vị tha .',
  'Khi tôi bắt đầu làm việc ở đó , chỉ có khoảng 1 % trẻ em được tiêm chủng đầy đủ .',
  'Quý vị có thể sử dụng mỗi phút trong phần thưởng ý nghĩa này như thế nào ?',
  'đây theo thứ tự có bốn gốc bazơ nitơ .',
  'Tất nhiên không ! Bạn phải thực hành 24 / 7 .',
  'TS : Các thỏi nam châm là sự kết hợp giữa trọng lực và từ trường , nên nó đại khái là sự pha trộn của các nguồn lực quanh ta , các nguồn lực có ảnh hưởng lên vạn vật .',
  'và sau năm 2050 , nó sẽ tăng lên hơn một nghìn tỷ dollar .',
  'con người đơn giản chỉ thích làm điều mà họ đã làm xong trước đấy .'],
 ["nobody 's ever done it before , so I 'm going to go do it .",
  "Now we don 't know exactly what a bee sees , any more more than than you 

In [61]:
len(final_en), len(final_vi)

(198317, 198317)

In [62]:
final_en[0], final_vi[0]

('Rachel Pike : The science behind a climate headline',
 'Khoa học đằng sau một tiêu đề về khí hậu')

In [63]:
with open('./data/processed/augmented/synonym_replacement/en_train_synonym_full.txt', "w") as f:
    for line in final_en:
      f.write(line + '\n')
with open('./data/processed/augmented/synonym_replacement/vi_train_synonym_full.txt', "w") as f:
    for line in final_vi:
      f.write(line + '\n')


In [None]:
with open("data/en_concat", "w") as f:
	with open("data/train.en", "r") as en:
		en_copy = en.read().split("\n")[:-1]
	en_concat_syn = en_aug_syn_65 + en_copy
	print(en_concat_syn[0])
	random.seed(15)
	random.shuffle(en_concat_syn)
	print(en_concat_syn[0])
	for sent in en_concat_syn:
		f.write(sent+"\n")


with open("data/vi_concat", "w") as f:
	with open("data/train.vi", "r") as vi:
		vi_copy = vi.read().split("\n")[:-1]
	vi_concat = vi_aug_65 + vi_copy
	print(vi_concat[0])
	random.seed(15)
	random.shuffle(vi_concat)
	print(vi_concat[0])
	for sent in vi_concat:
		f.write(sent+"\n")