## Language Translation

In [21]:
from transformers import MarianMTModel, MarianTokenizer


In [22]:
def get_model_and_tokenizer(source_lang, target_lang):
    model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer


In [23]:
def translate_text(text, model, tokenizer):
    # Tokenize the input text
    tokenized_text = tokenizer.prepare_seq2seq_batch([text], return_tensors='pt')

    # Perform translation
    translated_tokens = model.generate(**tokenized_text)

    # Decode and return the translated text
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)

    return translated_text


In [24]:
# Get user input for language and text
source_lang = input("Enter the source language code (e.g., 'en' for English, 'fr' for French): ")
target_lang = input("Enter the target language code (e.g., 'de' for German, 'es' for Spanish): ")
text_to_translate = input("Enter the text to translate: ")

# Load the model and tokenizer for the given language pair
model, tokenizer = get_model_and_tokenizer(source_lang, target_lang)

# Translate the text
translated_text = translate_text(text_to_translate, model, tokenizer)

# Print the result
print(f"Original text: {text_to_translate}")
print(f"Translated text: {translated_text}")


Enter the source language code (e.g., 'en' for English, 'fr' for French): en
Enter the target language code (e.g., 'de' for German, 'es' for Spanish): hi
Enter the text to translate: you are a boy


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Original text: you are a boy
Translated text: आप एक लड़का हैं


In [12]:
!pip install gTTS


Collecting gTTS
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Installing collected packages: gTTS
Successfully installed gTTS-2.5.4


## Text to audio conversion

In [26]:
from gtts import gTTS
import os


# Language in which you want to convert
language = 'es'  # 'en' for English, you can change this to another language code

# Passing the text and language to the engine
tts = gTTS(text=translated_text, lang=language, slow=False)

# Save the audio file
tts.save("output.mp3")

# Play the audio (if you're on a local machine)
os.system("start output.mp3")  # For Windows
# For MacOS, use: os.system("afplay output.mp3")
# For Linux, use: os.system("mpg321 output.mp3")


32512

## Combined code

In [19]:
# Step 1: Install necessary libraries
!pip install transformers gTTS

# Step 2: Import required libraries
from transformers import MarianMTModel, MarianTokenizer
from gtts import gTTS
from IPython.display import Audio

# Step 3: Define function to load the MarianMT model and tokenizer
def get_model_and_tokenizer(source_lang, target_lang):
    model_name = f'Helsinki-NLP/opus-mt-{source_lang}-{target_lang}'
    model = MarianMTModel.from_pretrained(model_name)
    tokenizer = MarianTokenizer.from_pretrained(model_name)
    return model, tokenizer

# Step 4: Define the translation function
def translate_text(text, model, tokenizer):
    tokenized_text = tokenizer.prepare_seq2seq_batch([text], return_tensors='pt')
    translated_tokens = model.generate(**tokenized_text)
    translated_text = tokenizer.decode(translated_tokens[0], skip_special_tokens=True)
    return translated_text

# Step 5: Define function to convert text to speech
def text_to_speech(text, language):
    tts = gTTS(text=text, lang=language, slow=False)
    tts.save("translated_output.mp3")
    return Audio("translated_output.mp3")

# Step 6: Get user input for language pair and text to translate
source_lang = input("Enter the source language code (e.g., 'en' for English, 'fr' for French): ")
target_lang = input("Enter the target language code (e.g., 'de' for German, 'es' for Spanish): ")
text_to_translate = input("Enter the text to translate: ")

# Step 7: Load model and tokenizer for the specified language pair
model, tokenizer = get_model_and_tokenizer(source_lang, target_lang)

# Step 8: Translate the text
translated_text = translate_text(text_to_translate, model, tokenizer)

# Step 9: Display the translated text
print(f"Original Text: {text_to_translate}")
print(f"Translated Text: {translated_text}")

# Step 10: Convert the translated text to speech
audio = text_to_speech(translated_text, target_lang)

# Step 11: Play the audio in Google Colab
audio


Enter the source language code (e.g., 'en' for English, 'fr' for French): en
Enter the target language code (e.g., 'de' for German, 'es' for Spanish): hi
Enter the text to translate: sri is a girl


`prepare_seq2seq_batch` is deprecated and will be removed in version 5 of HuggingFace Transformers. Use the regular
`__call__` method to prepare your inputs and targets.

Here is a short example:

model_inputs = tokenizer(src_texts, text_target=tgt_texts, ...)

If you either need to use different keyword arguments for the source and target texts, you should do two calls like
this:

model_inputs = tokenizer(src_texts, ...)
labels = tokenizer(text_target=tgt_texts, ...)
model_inputs["labels"] = labels["input_ids"]

See the documentation of your specific tokenizer for more details on the specific arguments to the tokenizer of choice.
For a more complete example, see the implementation of `prepare_seq2seq_batch`.



Original Text: sri is a girl
Translated Text: यूरी लड़की है
