# This notebook demonstrates different augmentation strategy
  - [x] random swap 
  - [x] random delete
  - [x] google translate
  - [ ] random insertion
    
check out https://github.com/maelfabien/maelfabien.github.io/blob/9f2eb5092748a7cd8a7c964a7bda4968b65c4935/_posts/2019-10-28-NLP_8.md


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import Multi30k
from torchtext.data import Field, BucketIterator

import spacy
import numpy as np

import random
import math
import time

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [3]:
spacy_de = spacy.load('de')
spacy_en = spacy.load('en')

In [4]:
def tokenize_de(text):
    """
    Tokenizes German text from a string into a list of strings (tokens) and reverses it
    """
    return [tok.text for tok in spacy_de.tokenizer(text)][::-1]

def tokenize_en(text):
    """
    Tokenizes English text from a string into a list of strings (tokens)
    """
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [5]:
SRC = Field(tokenize = tokenize_de, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [6]:
train_data, valid_data, test_data = Multi30k.splits(exts = ('.de', '.en'), 
                                                    fields = (SRC, TRG))

In [7]:
print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

Number of training examples: 29000
Number of validation examples: 1014
Number of testing examples: 1000


In [8]:
print(vars(train_data.examples[0]))

{'src': ['.', 'büsche', 'vieler', 'nähe', 'der', 'in', 'freien', 'im', 'sind', 'männer', 'weiße', 'junge', 'zwei'], 'trg': ['two', 'young', ',', 'white', 'males', 'are', 'outside', 'near', 'many', 'bushes', '.']}


In [9]:
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)

In [10]:
print(f"Unique tokens in source (de) vocabulary: {len(SRC.vocab)}")
print(f"Unique tokens in target (en) vocabulary: {len(TRG.vocab)}")

Unique tokens in source (de) vocabulary: 7854
Unique tokens in target (en) vocabulary: 5893


In [11]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [12]:
# randomly delete words from a sentence with a given probability

def random_deletion(sentence, p=0.5): 
    # return if single word
    if len(sentence) == 1: 
        return sentence
    # delete words
    remaining = list(filter(lambda x: random.uniform(0,1) > p, sentence)) 
    # if nothing left, sample a random word
    if len(remaining) == 0: 
        return [] 
    else:
        return remaining
    

# randomly swap a pair of words in a sentence for a given # of times

def random_swap(sentence, n=5): 
    if len(sentence) < 2:
        return sentence
    length = range(len(sentence)) 
    for _ in range(n):
        idx1, idx2 = random.sample(length, 2)
        sentence[idx1], sentence[idx2] = sentence[idx2], sentence[idx1] 
    return sentence

In [13]:
index = random.randint(0,len(train_data))
sentence = train_data.examples[index].src
list(filter(lambda x: random.uniform(0,1) > 0, sentence))

['.',
 'vordergrund',
 'im',
 'spielerin',
 'einer',
 'mit',
 'roller-derby-spiel',
 'einem',
 'bei',
 'actionfoto',
 'ein']

In [14]:
import google_trans_new
from google_trans_new import google_translator
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
# translate a sentence to a random language,
# and translate back to original language

def back_translate(sentence, p=0.1, lang="en"):
    sentence = [str(i) for i in sentence]
    # do nothing with probability of 1-p
    if random.uniform(0,1) > p:
        return sentence
    
    # combine tokenized sentence into one string
    sentence = " ".join(sentence)
    print(sentence)
    # instantiate translator
    translator = google_translator()
    
    # choose a target language
    available_langs = list(google_trans_new.LANGUAGES.keys())
    trans_lang = random.choice(available_langs)
    
    # translate to the target language
    translations = translator.translate(sentence, lang_tgt=trans_lang)
    
    # translate back to original language
    translations_en_random = translator.translate(translations, lang_src=trans_lang, lang_tgt=lang)
    
    # select only one translation
    if len(translations_en_random) > 1:
        translations_en_random = translations_en_random[0]
        
    return word_tokenize(translations_en_random)

[nltk_data] Downloading package punkt to /home/deep/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
back_translate(sentence[1:], p=.9, lang="en")

vordergrund im spielerin einer mit roller-derby-spiel einem bei actionfoto ein


['i']