In [1]:
!pip install nltk pytest pytest-cov -q

In [2]:
import nltk
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
import random
import re
from typing import List, Optional
import numpy as np


In [3]:
nltk.download('wordnet', quiet=True)
nltk.download('omw-1.4', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)
nltk.download('punkt', quiet=True)

True

In [4]:
resources = [
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'punkt',
    'punkt_tab'
]

for resource in resources:
    try:
        nltk.download(resource, quiet=True)
        print(f"{resource}")
    except Exception as e:
        print(f"{resource} - {e}")

wordnet
omw-1.4
averaged_perceptron_tagger
averaged_perceptron_tagger_eng
punkt
punkt_tab


In [5]:
def set_seed(seed: int = 42):
    random.seed(seed)
    np.random.seed(seed)

In [6]:
def tokenize(text: str) -> List[str]:
    from nltk.tokenize import word_tokenize
    return word_tokenize(text)

In [7]:
def detokenize(tokens: List[str]) -> str:
    text = " ".join(tokens)
    # Fix spacing around punctuation
    text = re.sub(r'\s+([.,!?;:])', r'\1', text)  # Remove space before punctuation
    text = re.sub(r'\(\s+', r'(', text)  # Fix opening parentheses
    text = re.sub(r'\s+\)', r')', text)  # Fix closing parentheses
    text = re.sub(r'\s+\'', r'\'', text)  # Fix apostrophes

    return text

In [8]:
def get_wordnet_pos(treebank_tag: str) -> Optional[str]:
    from nltk.corpus import wordnet

    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return None


**Testing helper functions**

In [9]:
test_text = "Hello, UJ! You are amazing."
tokens = tokenize(test_text)
print(f"Original: {test_text}")
print(f"Tokens: {tokens}")
print(f"Detokenized: {detokenize(tokens)}")
print()


set_seed(42)
print(f"Random number (seed=42): {random.random()}")
set_seed(42)
print(f"Random number (seed=42 again): {random.random()} ← Same!")
print()


Original: Hello, UJ! You are amazing.
Tokens: ['Hello', ',', 'UJ', '!', 'You', 'are', 'amazing', '.']
Detokenized: Hello, UJ! You are amazing.

Random number (seed=42): 0.6394267984578837
Random number (seed=42 again): 0.6394267984578837 ← Same!



In [10]:
print("Testing POS tag conversion:")
test_tags = ['NN', 'VB', 'JJ', 'RB', 'DT']
from nltk.corpus import wordnet
for tag in test_tags:
    wn_pos = get_wordnet_pos(tag)
    print(f"  {tag} → {wn_pos}")

Testing POS tag conversion:
  NN → n
  VB → v
  JJ → a
  RB → r
  DT → None


# **AUGMENTATION METHOD 1: SYNONYM REPLACEMENT**


*   Replace random words with their synonyms using WordNet.



In [11]:
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.tokenize import word_tokenize
def get_synonyms(word: str, pos: Optional[str] = None) -> List[str]:

    synonyms = set()
    synsets = wordnet.synsets(word, pos=pos)

    for syn in synsets[:2]:
        for lemma in syn.lemmas():
            synonym = lemma.name()


            if '_' in synonym:
                continue

            if synonym.lower() == word.lower():
                continue

            if not synonym.isalpha():
                continue

            if len(synonym) > 12 or len(synonym) < 3:
                continue

            synonyms.add(synonym.lower())

    return list(synonyms)[:5]

In [12]:
def synonym_replacement(text: str, n: int = 1, seed: Optional[int] = None) -> str:

    if seed is not None:
        set_seed(seed)

    tokens = tokenize(text)
    pos_tags = pos_tag(tokens)

    replaceable_indices = []

    for idx, (word, tag) in enumerate(pos_tags):

        if tag in ('NNP', 'NNPS'):
            continue

        if word.isupper() and len(word) <= 5:
            continue

        if not word.isalpha():
            continue


        if len(word) < 4:
            continue

        wordnet_pos = get_wordnet_pos(tag)
        if wordnet_pos is None:
            continue

        synonyms = get_synonyms(word.lower(), pos=wordnet_pos)

        if len(synonyms) > 0:
            replaceable_indices.append(idx)

    n = min(n, len(replaceable_indices))
    if n == 0:
        return text

    indices_to_replace = random.sample(replaceable_indices, n)
    new_tokens = tokens.copy()

    for idx in indices_to_replace:
        word = tokens[idx]
        tag = pos_tags[idx][1]
        wordnet_pos = get_wordnet_pos(tag)

        synonyms = get_synonyms(word.lower(), pos=wordnet_pos)

        if synonyms:
            synonym = random.choice(synonyms)
            if word[0].isupper():
                synonym = synonym.capitalize()

            new_tokens[idx] = synonym

    return detokenize(new_tokens)

**TESTING SYNONYM REPLACEMENT**

In [13]:
print("Testing Synonym Replacement")
print("-" * 30)

text = "The movie was really good."
print("Original:")
print(text)
print()

print("Augmented:")
aug = synonym_replacement(text, n=2, seed=42)
print(aug)
print()

print("-" * 30)

print("Reproducibility check:")
aug1 = synonym_replacement(text, n=2, seed=42)
aug2 = synonym_replacement(text, n=2, seed=42)
print(aug1)
print(aug2)
print("Same output:", aug1 == aug2)

print("-" * 30)

print("Synonym inspection (WordNet)")


words = ["good", "happy", "fast"]

for word in words:
    syns = get_synonyms(word)
    print(f"{word}: {syns[:5]}")


Testing Synonym Replacement
------------------------------
Original:
The movie was really good.

Augmented:
The flick was really full.

------------------------------
Reproducibility check:
The flick was really full.
The flick was really full.
Same output: True
------------------------------
Synonym inspection (WordNet)
good: ['goodness']
happy: ['felicitous']
fast: ['fasting']


# **Random Deletion Augmentation**

In this section, I implemented the **Random Deletion** text augmentation technique.
Random Deletion creates augmented samples by **randomly removing words** from a sentence with a fixed probability. This simulates incomplete or noisy text commonly found in real-world NLP data.

**Method overview:**

*   The input text is tokenized into words.
*   Each token is deleted with probability p.

*   A safety check ensures that at least one word remains.
*   The remaining tokens are reconstructed into a clean sentence.

This method complements synonym replacement by altering **sentence length** rather than word meaning.









In [14]:
def random_deletion(text: str, p: float = 0.1, seed: Optional[int] = None) -> str:
    if seed is not None:
        set_seed(seed)

    tokens = tokenize(text)

    if len(tokens) == 1:
        return text

    new_tokens = []

    for token in tokens:

        if token.isalpha() and token.isupper() and len(token) <= 4:
            new_tokens.append(token)
            continue

        if random.random() >= p:
            new_tokens.append(token)

    if len(new_tokens) == 0:
        alphabetic_tokens = [t for t in tokens if t.isalpha()]
        if alphabetic_tokens:
            new_tokens = [random.choice(alphabetic_tokens)]
        else:
            new_tokens = [tokens[0]]

    return detokenize(new_tokens)

In [15]:


print("Random Deletion Demo")
print("-" * 40)

# -------------------------------------------------
# DEMO 1: Basic Usage
# -------------------------------------------------
text = "The movie was really good and entertaining to watch."
print("Original text:")
print(text)
print()

print("Random deletion with different probabilities:")
for p in [0.1, 0.3, 0.5]:
    aug = random_deletion(text, p=p, seed=42)
    print(f"p={p}: {aug}")
print()


# -------------------------------------------------
# DEMO 2: Reproducibility
# -------------------------------------------------
print("Reproducibility check:")
aug1 = random_deletion(text, p=0.3, seed=42)
aug2 = random_deletion(text, p=0.3, seed=42)
print(aug1)
print(aug2)
print("Same output:", aug1 == aug2)
print()


# -------------------------------------------------
# DEMO 3: Punctuation and Short Text
# -------------------------------------------------
print("Edge cases:")
punct_text = "Wow! This is really good, isn't it?"
short_text = "Hello!"

print("With punctuation:")
print(random_deletion(punct_text, p=0.3, seed=1))

print("Single word:")
print(random_deletion(short_text, p=0.5, seed=1))
print()


# -------------------------------------------------
# DEMO 4: Realistic Example
# -------------------------------------------------
print("Realistic example:")
message = "URGENT: Your account will be suspended unless you verify now."
print("Original:")
print(message)

print("Augmented:")
print(random_deletion(message, p=0.3, seed=10))

print("\nRandom Deletion demo complete.")


Random Deletion Demo
----------------------------------------
Original text:
The movie was really good and entertaining to watch.

Random deletion with different probabilities:
p=0.1: The was really good and entertaining watch
p=0.3: The good and entertaining watch
p=0.5: The good and entertaining

Reproducibility check:
The good and entertaining watch
The good and entertaining watch
Same output: True

Edge cases:
With punctuation:
! This really good, is?
Single word:
!

Realistic example:
Original:
URGENT: Your account will be suspended unless you verify now.
Augmented:
URGENT: Your will be suspended you verify.

Random Deletion demo complete.


**Random Deletion Note**
Random Deletion may produce grammatically incomplete sentences, especially around punctuation. This is intentional and simulates noisy or truncated real-world text such as chat messages or corrupted inputs.

# **Random Insertion Augmentation**

In this section, I implemented the **Random Insertion** text augmentation technique.

Random Insertion works by **adding new words into a sentence at random positions.**
The inserted words are typically synonyms of existing words, which helps expand the sentence while preserving its original meaning.

This method increases sentence diversity and is useful for improving model generalization in NLP tasks.

In [16]:

def random_insertion(text: str, n: int = 1, seed: Optional[int] = None) -> str:

    if seed is not None:
        set_seed(seed)

    tokens = tokenize(text)

    if len(tokens) == 0:
        return text

    pos_tags = pos_tag(tokens)

    words_with_synonyms = []

    for token, (word, tag) in zip(tokens, pos_tags):

        if tag in ('NNP', 'NNPS'):
            continue

        if token.isalpha() and token.isupper() and len(token) <= 5:
            continue

        if not token.isalpha():
            continue

        if len(token) < 4:
            continue

        wordnet_pos = get_wordnet_pos(tag)
        if wordnet_pos is None:
            continue

        synonyms = get_synonyms(token.lower(), pos=wordnet_pos)
        if synonyms:
            words_with_synonyms.append((token, synonyms))

    if not words_with_synonyms:
        return text

    new_tokens = tokens.copy()

    for _ in range(n):
        word, synonyms = random.choice(words_with_synonyms)
        synonym = random.choice(synonyms)

        if word[0].isupper():
            synonym = synonym.capitalize()

        insert_position = random.randint(0, len(new_tokens))
        new_tokens.insert(insert_position, synonym)

    return detokenize(new_tokens)

In [17]:
print("=" * 70)
print("TESTING RANDOM INSERTION")
print("=" * 70)
print()

sentence = "The movie was really good and entertaining."
print(f"Original: {sentence}")
print()

for n_val in [1, 2, 3]:
    print(f"n={n_val}: {random_insertion(sentence, n=n_val, seed=42)}")
print()

r1 = random_insertion(sentence, n=2, seed=100)
r2 = random_insertion(sentence, n=2, seed=100)

print("Reproducibility check:")
print(r1)
print(r2)
print(r1 == r2)
print()

print("Different seeds:")
for i in range(3):
    print(f"Seed {i}: {random_insertion(sentence, n=2, seed=i)}")

print()
print("Random Insertion complete.")


TESTING RANDOM INSERTION

Original: The movie was really good and entertaining.

n=1: full The movie was really good and entertaining.
n=2: full The movie full was really good and entertaining.
n=3: full film The movie full was really good and entertaining.

Reproducibility check:
The movie was really good picture and entertaining picture.
The movie was really good picture and entertaining picture.
True

Different seeds:
Seed 0: genuinely The movie was really good and actually entertaining.
Seed 1: The movie was really pic good and picture entertaining.
Seed 2: The pic movie was truly really good and entertaining.

Random Insertion complete.


# **Random Swap Augmentation**

This section implements **Random Swap**, where two words in a sentence are randomly selected and their positions are swapped. This introduces variation in word order while keeping sentence length and vocabulary unchanged.

In [18]:
def random_swap(text: str, n: int = 1, seed: Optional[int] = None) -> str:
    if seed is not None:
        set_seed(seed)

    tokens = tokenize(text)

    if len(tokens) < 2:
        return text

    pos_tags = pos_tag(tokens)
    new_tokens = tokens.copy()

    swappable_indices = []

    for i, (token, (word, tag)) in enumerate(zip(tokens, pos_tags)):

        if tag in ('NNP', 'NNPS'):
            continue

        if token.isalpha() and token.isupper() and len(token) <= 5:
            continue

        if not token.isalpha():
            continue

        if len(token) < 4:
            continue

        swappable_indices.append(i)

    if len(swappable_indices) < 2:
        return text

    for _ in range(n):
        i, j = random.sample(swappable_indices, 2)
        new_tokens[i], new_tokens[j] = new_tokens[j], new_tokens[i]

    return detokenize(new_tokens)


In [19]:
print("=" * 70)
print("TESTING RANDOM SWAP")
print("=" * 70)
print()

sentence = "The movie was really good and entertaining."
print(f"Original: {sentence}")
print()

for n_val in [1, 2, 3]:
    print(f"n={n_val}: {random_swap(sentence, n=n_val, seed=42)}")
print()

r1 = random_swap(sentence, n=2, seed=100)
r2 = random_swap(sentence, n=2, seed=100)

print("Reproducibility check:")
print(r1)
print(r2)
print(r1 == r2)
print()

print("Different seeds:")
for i in range(3):
    print(f"Seed {i}: {random_swap(sentence, n=2, seed=i)}")

print()
print("Random Swap complete.")


TESTING RANDOM SWAP

Original: The movie was really good and entertaining.

n=1: The entertaining was really good and movie.
n=2: The good was really entertaining and movie.
n=3: The really was good entertaining and movie.

Reproducibility check:
The really was entertaining good and movie.
The really was entertaining good and movie.
True

Different seeds:
Seed 0: The entertaining was movie good and really.
Seed 1: The good was movie really and entertaining.
Seed 2: The really was entertaining good and movie.

Random Swap complete.


# **Back Translation Augmentation**

This section implements **Back Translation**, a text augmentation technique where a sentence is translated to another language and then translated back to the original language. This often produces a natural paraphrase while preserving meaning.

Back Translation is commonly used in NLP tasks to generate high-quality, semantically similar training data.

In [20]:
!pip install googletrans==3.1.0a0 -q

In [21]:
from googletrans import Translator

def back_translation(
    text: str,
    src_lang: str = "en",
    mid_lang: str = "fr",
    seed: Optional[int] = None
) -> str:

    translator = Translator()

    try:
        e
        translated = translator.translate(text, src=src_lang, dest=mid_lang)

        if translated is None:
            print(f"Translation to {mid_lang} failed")
            return text

        back_translated = translator.translate(translated.text, src=mid_lang, dest=src_lang)

        if back_translated is None:
            print(f"Back translation from {mid_lang} failed")
            return text

        return back_translated.text

    except Exception as e:
        print(f"Back translation error: {type(e).__name__}: {str(e)}")
        return text

print("Back Translation function implemented!")

Back Translation function implemented!


In [22]:
print("=" * 70)
print("TESTING BACK TRANSLATION")
print("=" * 70)
print()

sentence = "The movie was really good and entertaining."
print(f"Original: {sentence}")
print()

for lang in ["fr", "de", "es"]:
    augmented = back_translation(sentence, mid_lang=lang)
    print(f"Via {lang}: {augmented}")
print()

print("Reproducibility check:")
r1 = back_translation(sentence, mid_lang="fr", seed=42)
r2 = back_translation(sentence, mid_lang="fr", seed=42)
print(r1)
print(r2)
print(r1 == r2)
print()

print("Back Translation complete.")


TESTING BACK TRANSLATION

Original: The movie was really good and entertaining.

Back translation error: UnboundLocalError: cannot access local variable 'e' where it is not associated with a value
Via fr: The movie was really good and entertaining.
Back translation error: UnboundLocalError: cannot access local variable 'e' where it is not associated with a value
Via de: The movie was really good and entertaining.
Back translation error: UnboundLocalError: cannot access local variable 'e' where it is not associated with a value
Via es: The movie was really good and entertaining.

Reproducibility check:
Back translation error: UnboundLocalError: cannot access local variable 'e' where it is not associated with a value
Back translation error: UnboundLocalError: cannot access local variable 'e' where it is not associated with a value
The movie was really good and entertaining.
The movie was really good and entertaining.
True

Back Translation complete.


# **Text augmentation library combining all 5 methods.**

In [23]:
class TextAugmentor:

    def __init__(self, methods: Optional[List[str]] = None, sr_n: int = 2, rd_p: float = 0.1, ri_n: int = 2, rs_n: int = 2, bt_lang: str = "fr"):
        if methods is None:
            self.methods = ['sr', 'rd', 'ri', 'rs']
        else:
            self.methods = methods
        self.sr_n = sr_n
        self.rd_p = rd_p
        self.ri_n = ri_n
        self.rs_n = rs_n
        self.bt_lang = bt_lang

    def augment(self, text: str, num_aug: int = 1, seed: Optional[int] = None) -> List[str]:
        if seed is not None:
            set_seed(seed)
        augmented_texts = []
        attempts = 0
        max_attempts = num_aug * 3
        while len(augmented_texts) < num_aug and attempts < max_attempts:
            method = random.choice(self.methods)
            if method == 'sr':
                aug_text = synonym_replacement(text, n=self.sr_n, seed=seed+attempts if seed else None)
            elif method == 'rd':
                aug_text = random_deletion(text, p=self.rd_p, seed=seed+attempts if seed else None)
            elif method == 'ri':
                aug_text = random_insertion(text, n=self.ri_n, seed=seed+attempts if seed else None)
            elif method == 'rs':
                aug_text = random_swap(text, n=self.rs_n, seed=seed+attempts if seed else None)
            elif method == 'bt':
                aug_text = back_translation(text, mid_lang=self.bt_lang, seed=seed+attempts if seed else None)
            else:
                aug_text = text
            if aug_text != text and aug_text not in augmented_texts:
                augmented_texts.append(aug_text)
            attempts += 1
        return augmented_texts

    def augment_batch(self, texts: List[str], num_aug: int = 1, seed: Optional[int] = None) -> List[List[str]]:
        results = []
        for idx, text in enumerate(texts):
            text_seed = seed + idx if seed is not None else None
            augmented = self.augment(text, num_aug=num_aug, seed=text_seed)
            results.append(augmented)
        return results

    def augment_with_method(self, text: str, method: str, seed: Optional[int] = None) -> str:
        if method == 'sr':
            return synonym_replacement(text, n=self.sr_n, seed=seed)
        elif method == 'rd':
            return random_deletion(text, p=self.rd_p, seed=seed)
        elif method == 'ri':
            return random_insertion(text, n=self.ri_n, seed=seed)
        elif method == 'rs':
            return random_swap(text, n=self.rs_n, seed=seed)
        elif method == 'bt':
            return back_translation(text, mid_lang=self.bt_lang, seed=seed)
        else:
            raise ValueError(f"Unknown method: {method}. Valid: ['sr', 'rd', 'ri', 'rs', 'bt']")

    def __repr__(self) -> str:
        method_names = {'sr': 'Synonym Replacement', 'rd': 'Random Deletion', 'ri': 'Random Insertion', 'rs': 'Random Swap', 'bt': 'Back Translation'}
        methods_str = ", ".join([method_names[m] for m in self.methods])
        return f"TextAugmentor(methods=[{methods_str}])"


print("TextAugmentor class created!")
print()
print("Quick start:")
print("  aug = TextAugmentor()")
print('  results = aug.augment("Your text", num_aug=5)')

TextAugmentor class created!

Quick start:
  aug = TextAugmentor()
  results = aug.augment("Your text", num_aug=5)


In [24]:
print("Text Augmentation Demo")
print("-" * 40)

text = "The movie was really good and entertaining."

# Create augmentor with default settings
augmentor = TextAugmentor()
print("Original text:")
print(text)
print()

print("Automatic augmentation (mixed methods):")
augmented = augmentor.augment(text, num_aug=3, seed=42)
for i, t in enumerate(augmented, 1):
    print(f"{i}. {t}")
print()


print("Specific method (Synonym Replacement):")
print(augmentor.augment_with_method(text, method="sr", seed=42))
print()


texts = [
    "AI is transforming the world.",
    "Natural language processing is powerful."
]

print("Batch augmentation:")
batch_results = augmentor.augment_batch(texts, num_aug=2, seed=42)
for original, augs in zip(texts, batch_results):
    print(f"\nOriginal: {original}")
    for aug in augs:
        print(f"  - {aug}")

print("\nDemo complete.")


Text Augmentation Demo
----------------------------------------
Original text:
The movie was really good and entertaining.

Automatic augmentation (mixed methods):
1. The flick was really full and entertaining.
2. movie was really good and entertaining.
3. The picture was truly good and entertaining.

Specific method (Synonym Replacement):
The flick was really full and entertaining.

Batch augmentation:

Original: AI is transforming the world.
  - AI is transform the cosmos.
  - AI transforming the world.

Original: Natural language processing is powerful.
  - Natural speech processing is powerful.
  - Natural language processing is powerful

Demo complete.


In [25]:


augmentor = TextAugmentor()

augmentations = augmentor.augment(
    "The movie was really good and entertaining.",
    num_aug=3
)

for text in augmentations:
    print(text)


The film was really full and entertaining.
The movie was really good full and full entertaining.
The movie was actually full and entertaining.
