In [None]:
!pip install googletrans==4.0.0-rc1

!pip install asyncio
!pip install PyICU




**Translation Using LLMs**

In [None]:
!pip install transformers



English (opus-mt-nl-en)

German (opus-mt-nl-de)

French (opus-mt-nl-fr)

Spanish (opus-mt-nl-es)

**Improving translation quality:**

Key Improvements:
Beam Search:

Increased the num_beams (default: 1) to 5, which enhances translation quality by exploring multiple candidate translations.
Repetition Penalty:

Added repetition_penalty=1.2 to discourage repetitive phrases in the output.
Length Penalty:

Set length_penalty explicitly (1.5 in the first cell below, 1.2 in the later ones). With beam search, values above 0 favor longer translations and values below 0 favor shorter ones.
Early Stopping:

Enabled early_stopping=True so that beam search stops as soon as num_beams complete candidate translations have been found.
Truncation:

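Added truncation=True so that input longer than the model's maximum length is cut off instead of causing an error. Note that the cut-off part of the text is not translated.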


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load the model and tokenizer for Dutch to English.
# Tokenizer and model are both created from the same checkpoint name, and the
# resulting module-level names `tokenizer` and `model` are used by the example
# call further down in this cell.
model_name = "Helsinki-NLP/opus-mt-nl-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_text(text, tokenizer, model, num_beams=5, repetition_penalty=1.2, length_penalty=1.5, max_length=250):
    """
    Translate one piece of text with a MarianMT model using beam search.

    Args:
        text (str): Text to translate.
        tokenizer (MarianTokenizer): Tokenizer paired with ``model``.
        model (MarianMTModel): Translation model.
        num_beams (int): Number of beams for beam search.
        repetition_penalty (float): Forwarded to ``model.generate``; values
            above 1.0 discourage repeated tokens.
        length_penalty (float): Forwarded to ``model.generate``; with beam
            search, larger values favour longer outputs.
        max_length (int): Upper bound on the generated length, forwarded to
            ``model.generate``. Defaults to 250, the value that used to be
            hard-coded in this function.

    Returns:
        str: Translated text, or "" when ``text`` is an empty or
        whitespace-only string.
    """
    # Nothing to translate: skip the model call instead of decoding whatever
    # the model generates for an input that carries no content.
    if isinstance(text, str) and not text.strip():
        return ""

    # truncation=True makes the tokenizer cut over-long input instead of
    # failing, so text beyond the tokenizer's length limit is not translated.
    inputs = tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True)

    # Generate translation with beam search and penalties
    translated = model.generate(
        inputs,
        max_length=max_length,  # Cap on the generated length
        num_beams=num_beams,  # Beam search width
        repetition_penalty=repetition_penalty,  # Discourage repeated tokens
        length_penalty=length_penalty,  # Length preference during beam scoring
        early_stopping=True  # Stop once num_beams finished candidates exist
    )

    # Only the first returned sequence is decoded (a single text was encoded).
    return tokenizer.decode(translated[0], skip_special_tokens=True)

# Example usage
# NOTE(review): "partijk" in the sample sentence looks like a typo for
# "praktijk" ("in practice"); the recorded output renders it as "in the game".
# Confirm the intended source text before changing the literal.
dutch_text = "Gebruik als trainer je eigen hockeyskills als je redelijk of goed kan hockeyen. Bovendien kan je de groep voor je winnen als je niet alleen mooi kan vertellen, maar ook in de partijk mooi kan laten zien."
translated_text = translate_text(dutch_text, tokenizer, model)

print("Translated Text:", translated_text)


Translated Text: As a trainer, use your own hockey skills if you can play hockey reasonably or well. Moreover, you can win the group if you can not only tell beautifully, but also show beautifully in the game.


In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load the model and tokenizer for Dutch to English.
# NOTE(review): this is the same checkpoint name as in the previous code cell,
# so the module-level `tokenizer` and `model` are simply re-created here.
model_name = "Helsinki-NLP/opus-mt-nl-en"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

def translate_text(text, tokenizer, model, num_beams=8, repetition_penalty=1.5, length_penalty=1.2, max_length=250):
    """
    Translate one piece of text with a MarianMT model using beam search.

    Args:
        text (str): Text to translate.
        tokenizer (MarianTokenizer): Tokenizer paired with ``model``.
        model (MarianMTModel): Translation model.
        num_beams (int): Number of beams for beam search.
        repetition_penalty (float): Forwarded to ``model.generate``; values
            above 1.0 discourage repeated tokens.
        length_penalty (float): Forwarded to ``model.generate``; with beam
            search, larger values favour longer outputs.
        max_length (int): Upper bound on the generated length, forwarded to
            ``model.generate``.

    Returns:
        str: Translated text, or "" when ``text`` is an empty or
        whitespace-only string.
    """
    # Blank input has nothing to translate; return early rather than decoding
    # whatever the model would generate for it.
    if isinstance(text, str) and not text.strip():
        return ""

    # truncation=True makes the tokenizer cut over-long input instead of
    # failing, so text beyond the tokenizer's length limit is not translated.
    inputs = tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True)

    generation_options = {
        "max_length": max_length,  # Cap on the generated length
        "num_beams": num_beams,  # Beam search width
        "repetition_penalty": repetition_penalty,  # Discourage repeated tokens
        "length_penalty": length_penalty,  # Length preference during beam scoring
        "early_stopping": True,  # Stop once num_beams finished candidates exist
    }
    translated = model.generate(inputs, **generation_options)

    # Only the first returned sequence is decoded (a single text was encoded).
    return tokenizer.decode(translated[0], skip_special_tokens=True)

# Example usage (relies on the default beam-search settings of translate_text)
# NOTE(review): "partijk" in the sample sentence looks like a typo for
# "praktijk" ("in practice"); the recorded output renders it as "in the game".
# Confirm the intended source text before changing the literal.
dutch_text = "Gebruik als trainer je eigen hockeyskills als je redelijk of goed kan hockeyen. Bovendien kan je de groep voor je winnen als je niet alleen mooi kan vertellen, maar ook in de partijk mooi kan laten zien."
translated_text = translate_text(dutch_text, tokenizer, model)

print("Translated Text:", translated_text)


Translated Text: As a trainer, use your own hockey skills if you can play hockey reasonably or well. Moreover, you can win the group if you can not only tell beautifully, but also show nicely in the game.


Dutch to English and German

In [None]:
from transformers import MarianMTModel, MarianTokenizer

# Load the model and tokenizer for Dutch to English
# (first hop of the Dutch -> English -> German chain used below).
nl_to_en_model_name = "Helsinki-NLP/opus-mt-nl-en"
nl_to_en_tokenizer = MarianTokenizer.from_pretrained(nl_to_en_model_name)
nl_to_en_model = MarianMTModel.from_pretrained(nl_to_en_model_name)

# Load the model and tokenizer for English to German
# (second hop; it receives the English output of the first model).
en_to_de_model_name = "Helsinki-NLP/opus-mt-en-de"
en_to_de_tokenizer = MarianTokenizer.from_pretrained(en_to_de_model_name)
en_to_de_model = MarianMTModel.from_pretrained(en_to_de_model_name)

def translate_text(text, tokenizer, model, num_beams=8, repetition_penalty=1.5, length_penalty=1.2, max_length=250):
    """
    Translate one piece of text with the given MarianMT tokenizer/model pair.

    The same function serves every hop of a translation chain: pass the
    tokenizer and model that belong to the language pair of that hop.

    Args:
        text (str): Text to translate.
        tokenizer (MarianTokenizer): Tokenizer paired with ``model``.
        model (MarianMTModel): Translation model.
        num_beams (int): Number of beams for beam search.
        repetition_penalty (float): Forwarded to ``model.generate``; values
            above 1.0 discourage repeated tokens.
        length_penalty (float): Forwarded to ``model.generate``; with beam
            search, larger values favour longer outputs.
        max_length (int): Upper bound on the generated length, forwarded to
            ``model.generate``.

    Returns:
        str: Translated text, or "" when ``text`` is an empty or
        whitespace-only string.
    """
    # A blank input (for example a blank result from an earlier hop) has
    # nothing to translate, so the model is not called for it.
    if isinstance(text, str) and not text.strip():
        return ""

    # truncation=True makes the tokenizer cut over-long input instead of
    # failing, so text beyond the tokenizer's length limit is not translated.
    token_ids = tokenizer.encode(text, return_tensors="pt", padding=True, truncation=True)

    # Generate translation with beam search and penalties
    generated = model.generate(
        token_ids,
        max_length=max_length,  # Cap on the generated length
        num_beams=num_beams,  # Beam search width
        repetition_penalty=repetition_penalty,  # Discourage repeated tokens
        length_penalty=length_penalty,  # Length preference during beam scoring
        early_stopping=True  # Stop once num_beams finished candidates exist
    )

    # Only the first returned sequence is decoded (a single text was encoded).
    best_sequence = generated[0]
    return tokenizer.decode(best_sequence, skip_special_tokens=True)

# Example usage: Dutch -> English -> German pivot translation. English is only
# the intermediate language; the text is never translated back to Dutch.
# NOTE(review): "partijk" in the sample sentence looks like a typo for
# "praktijk" ("in practice") — confirm the intended source text.
dutch_text = "Gebruik als trainer je eigen hockeyskills als je redelijk of goed kan hockeyen. Bovendien kan je de groep voor je winnen als je niet alleen mooi kan vertellen, maar ook in de partijk mooi kan laten zien."

# Translate Dutch to English
english_text = translate_text(dutch_text, nl_to_en_tokenizer, nl_to_en_model)
print(f"English Translation: {english_text}")

# Translate English to German; the input is the model-generated English text,
# so any error from the first hop carries over into the German output.
german_text = translate_text(english_text, en_to_de_tokenizer, en_to_de_model)
print(f"German Translation: {german_text}")


tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

English Translation: As a trainer, use your own hockey skills if you can play hockey reasonably or well. Moreover, you can win the group if you can not only tell beautifully, but also show nicely in the game.
German Translation: Als Trainer nutzen Sie Ihre eigenen Hockey-Fähigkeiten, wenn Sie Hockey vernünftig oder gut spielen können. Darüber hinaus können Sie die Gruppe gewinnen, wenn Sie nicht nur schön sagen können, sondern auch schön im Spiel zeigen.
