Data augmentation techniques for Natural Language Processing (NLP) involve generating new training examples or modifying existing ones to expand the dataset. These techniques help improve the performance and generalization of NLP models. It's essential to ensure that the augmented data remains semantically and grammatically correct.

In [2]:
!pip install transformers
!pip install sentencepiece
from transformers import MarianMTModel, MarianTokenizer

Collecting transformers
  Downloading transformers-4.33.1-py3-none-any.whl (7.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m19.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.15.1 (from transformers)
  Downloading huggingface_hub-0.17.1-py3-none-any.whl (294 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.8/294.8 kB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m61.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.1 MB/s[0m eta [36m0:00:0

## Models Configuration
### Configuration of the first model
This model translates from English to French




In [3]:
# Get the name of the first model
first_model_name = 'Helsinki-NLP/opus-mt-en-fr'

# Get the tokenizer
first_model_tkn = MarianTokenizer.from_pretrained(first_model_name)

# Load the pretrained model based on the name
first_model = MarianMTModel.from_pretrained(first_model_name)

Downloading (…)olve/main/source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]



Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

## Configuration of the second model
This model translates from French to English



In [4]:
# Get the name of the second model
second_model_name = 'Helsinki-NLP/opus-mt-fr-en'

# Get the tokenizer
second_model_tkn = MarianTokenizer.from_pretrained(second_model_name)

# Load the pretrained model based on the name
second_model = MarianMTModel.from_pretrained(second_model_name)

Downloading (…)olve/main/source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

Downloading (…)olve/main/target.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

In [5]:
original_texts = ["The sunsets in this coastal town are breathtaking, painting the sky in hues of orange and pink."
                  "She had never tasted a dish so spicy before, and her taste buds were on fire."
                  "In the quiet of the night, the sound of crickets and distant laughter filled the air."
                  "The old bookstore on the corner is a hidden gem, filled with dusty tomes and forgotten stories."]

original_texts

['The sunsets in this coastal town are breathtaking, painting the sky in hues of orange and pink.She had never tasted a dish so spicy before, and her taste buds were on fire.In the quiet of the night, the sound of crickets and distant laughter filled the air.The old bookstore on the corner is a hidden gem, filled with dusty tomes and forgotten stories.']

In [6]:
def format_batch_texts(language_code, batch_texts):

  formated_bach = [">>{}<< {}".format(language_code, text) for text in batch_texts]

  return formated_bach

In [7]:
# Test of the function
format_batch_texts("fr", original_texts)

['>>fr<< The sunsets in this coastal town are breathtaking, painting the sky in hues of orange and pink.She had never tasted a dish so spicy before, and her taste buds were on fire.In the quiet of the night, the sound of crickets and distant laughter filled the air.The old bookstore on the corner is a hidden gem, filled with dusty tomes and forgotten stories.']

In [8]:
def perform_translation(batch_texts, model, tokenizer, language="fr"):
    # Prepare the text data into appropriate format for the model
    formated_batch_texts = format_batch_texts(language, batch_texts)

    # Generate translation using model
    translated = model.generate(**tokenizer(formated_batch_texts, return_tensors="pt", padding=True))

    # Convert the generated tokens indices back into text
    translated_texts = [tokenizer.decode(t, skip_special_tokens=True) for t in translated]

    return translated_texts

In [9]:
translated_texts = perform_translation(original_texts, first_model, first_model_tkn)


In [10]:
translated_texts


["Les couchers de soleil dans cette ville côtière sont à couper le souffle, peignant le ciel en teintes d'orange et de rose.Elle n'avait jamais goûté un plat si épicé auparavant, et ses papilles étaient en feu.Dans le calme de la nuit, le bruit de criquets et de rires lointains a rempli l'air.L'ancienne librairie au coin est un joyau caché, rempli de tomes poussiéreux et d'histoires oubliées."]

In [11]:
back_translated_texts = perform_translation(translated_texts, second_model, second_model_tkn)


In [12]:
back_translated_texts


['The sunsets in this coastal city are breathtaking, painting the sky in shades of orange and pink.She had never tasted such a spicy dish before, and her taste buds were on fire.In the quiet of the night, the noise of distant locusts and laughter filled the air.The old bookstore at the corner is a hidden gem, filled with dusty volumes and forgotten stories.']

In [13]:
def perform_back_translation(batch_texts, original_language="en", temporary_language="fr"):

  # Translate from Original to Temporary Language
  tmp_translated_batch = perform_translation(batch_texts, first_model, first_model_tkn, temporary_language)

  # Translate Back to English
  back_translated_batch = perform_translation(tmp_translated_batch, second_model, second_model_tkn, original_language)

  # Return The Final Result
  return back_translated_batch

In [14]:
def combine_texts(original_texts, back_translated_batch):

  return set(original_texts + back_translated_batch)

In [15]:
back_translated_batch = perform_back_translation(original_texts)
back_translated_batch

['The sunsets in this coastal city are breathtaking, painting the sky in shades of orange and pink.She had never tasted such a spicy dish before, and her taste buds were on fire.In the quiet of the night, the noise of distant locusts and laughter filled the air.The old bookstore at the corner is a hidden gem, filled with dusty volumes and forgotten stories.']

### Final Augmented Text data.
Modifying the back translation function by creating the augmentation



In [16]:
def perform_back_translation_with_augmentation(batch_texts, original_language="en", temporary_language="fr"):

  # Translate from Original to Temporary Language
  tmp_translated_batch = perform_translation(batch_texts, first_model, first_model_tkn, temporary_language)

  # Translate Back to English
  back_translated_batch = perform_translation(tmp_translated_batch, second_model, second_model_tkn, original_language)

  # Return The Final Result
  return combine_texts(original_texts, back_translated_batch)

In [17]:
final_augmented = perform_back_translation_with_augmentation(original_texts)
final_augmented

{'The sunsets in this coastal city are breathtaking, painting the sky in shades of orange and pink.She had never tasted such a spicy dish before, and her taste buds were on fire.In the quiet of the night, the noise of distant locusts and laughter filled the air.The old bookstore at the corner is a hidden gem, filled with dusty volumes and forgotten stories.',
 'The sunsets in this coastal town are breathtaking, painting the sky in hues of orange and pink.She had never tasted a dish so spicy before, and her taste buds were on fire.In the quiet of the night, the sound of crickets and distant laughter filled the air.The old bookstore on the corner is a hidden gem, filled with dusty tomes and forgotten stories.'}