<a href="https://colab.research.google.com/github/tyson925/brokerChooser/blob/main/BrokerChooser.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install googletrans==4.0.0-rc1

from transformers import pipeline

from googletrans import Translator

import nltk

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

# Fetch NLTK's 'punkt' resource at start-up.
# NOTE(review): the BLEU helpers in this file tokenize with str.split(), so it
# is unclear from here whether 'punkt' is still required — confirm before
# removing.
nltk.download('punkt')

import pandas as pd
# Load the evaluation data. The rest of this file reads the 'english' and
# 'translated_value' columns from this frame and adds new columns to it.
dataset = pd.read_csv('/content/translated_output.csv')

class BlueScoreCalculator:
    """Sentence-level BLEU helpers built on ``nltk.translate.bleu_score``.

    Both methods tokenize on whitespace. Values that are not strings (for
    example the float NaN pandas produces for an empty CSV cell) are treated
    as having no tokens instead of raising ``AttributeError``.
    """

    @staticmethod
    def _tokenize(text):
        """Return the whitespace-separated tokens of *text* ([] for non-strings)."""
        return text.split() if isinstance(text, str) else []

    def calculate_blue_score(self, reference, translation):
        """
        Calculate the unsmoothed BLEU score for the given reference and translation.

        Parameters:
        reference (str): The reference translation.
        translation (str): The candidate translation.

        Returns:
        float: The BLEU score in the range 0-1 (0.0 when either side has no tokens).
        """
        reference_tokens = self._tokenize(reference)
        translation_tokens = self._tokenize(translation)

        # Nothing to compare: BLEU is 0, and skipping the NLTK call avoids its
        # zero-overlap warnings for this case.
        if not reference_tokens or not translation_tokens:
            return 0.0

        return sentence_bleu([reference_tokens], translation_tokens)

    def calculate_bleu(self, reference, candidate):
        """
        Calculate a smoothed BLEU score, expressed as a percentage.

        Intended for row-wise use on a DataFrame.

        Parameters:
        reference (str): The reference translation.
        candidate (str): The candidate translation.

        Returns:
        float: The BLEU score scaled to 0-100 (0.0 when either side has no tokens).
        """
        reference_tokens = self._tokenize(reference)
        candidate_tokens = self._tokenize(candidate)

        if not reference_tokens or not candidate_tokens:
            return 0.0

        # method1 smoothing keeps short sentences from collapsing to zero when
        # a higher-order n-gram has no match.
        smoothing = SmoothingFunction().method1
        bleu = sentence_bleu([reference_tokens], candidate_tokens, smoothing_function=smoothing)
        return bleu * 100  # Convert to percentage


#Define the Translation class for GoogleTranslator
class GoogleTranslator:
    """Wrapper around a googletrans ``Translator`` bound to one language pair."""

    def __init__(self, source_lang, target_lang, instructions=None):
        """
        Store the language pair and create the underlying client.

        Parameters:
        source_lang (str): The source language code.
        target_lang (str): The target language code.
        instructions (str): Stored on the instance; translate() does not read it.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.instructions = instructions
        self.translator = Translator()

    def translate(self, text):
        """
        Translate *text*, print the result, and return it.

        Parameters:
        text (str): The text to be translated.

        Returns:
        str: The ``text`` attribute of the client's translation result.
        """
        result = self.translator.translate(
            text,
            src=self.source_lang,
            dest=self.target_lang,
        )
        output = result.text
        print(f"Translated Text: {output}")
        return output



class LanguageTranslator:
    """Translate text with a Helsinki-NLP OPUS-MT model through transformers pipelines.

    When instructions are supplied they are prepended to the input, separated
    by a marker, and the part of the model output after the marker is returned.
    If the marker does not survive in the output, the caller's original text is
    returned unchanged.
    """

    # Separator placed between the instructions and the text to translate, so
    # the translated text can be cut back out of the model output.
    _MARKER = "[****]"

    def __init__(self, source_lang, target_lang, instructions=None):
        """
        Initialize the LanguageTranslator with source and target languages,
        and optional instructions for the model.

        Parameters:
        source_lang (str): The source language code (e.g., 'en' for English).
        target_lang (str): The target language code (e.g., 'hu' for Hungarian).
        instructions (str): Optional instructions or notes prepended to every input.
        """
        self.source_lang = source_lang
        self.target_lang = target_lang
        self.instructions = instructions
        if target_lang == 'pt':
            # NOTE(review): this model name is English-source regardless of
            # source_lang — confirm that is intended for non-English sources.
            self.model_name = "Helsinki-NLP/opus-mt-tc-big-en-pt"
        else:
            self.model_name = f"Helsinki-NLP/opus-mt-{source_lang}-{target_lang}"

        # Two pipelines over the same model name: translate() uses the first,
        # generate_translate() the second.
        self.translator = pipeline("translation", model=self.model_name)
        self.text2text_generator = pipeline("text2text-generation", model=self.model_name)

    def _extract_translation(self, outputs, key, original_text):
        """Pull the translated text out of pipeline *outputs*; fall back to *original_text*."""
        generated = None
        try:
            generated = outputs[0][key]
            if self.instructions:
                # Keep only what follows the marker, i.e. drop the instructions.
                return generated.split(self._MARKER)[1].strip()
            return generated
        except (IndexError, KeyError, TypeError, AttributeError) as e:
            # Typically IndexError: the model did not reproduce the marker.
            if generated is not None:
                print(f"translation: {generated}")
            print(f"Translation error: {original_text}")
            print(f"{e}")
            # Return the caller's text, not the instruction-prefixed prompt.
            return original_text

    def generate_translate(self, text):
        """
        Translate the input text using the text2text-generation pipeline.

        Parameters:
        text (str): The text to be translated.

        Returns:
        str: The translated text, or the original text if it could not be extracted.
        """
        if self.instructions:
            prompt = f"{self.instructions}{self._MARKER} {text}"
        else:
            prompt = text

        outputs = self.text2text_generator(prompt)
        return self._extract_translation(outputs, 'generated_text', text)

    def translate(self, text):
        """
        Translate the input text using the translation pipeline.

        Parameters:
        text (str): The text to be translated.

        Returns:
        str: The translated text, or the original text if it could not be extracted.
        """
        if self.instructions:
            prompt = f"{self.instructions}{self._MARKER}\n{text}"
        else:
            # Coerce to str so non-string cells still reach the pipeline as text.
            prompt = f"{text}"

        outputs = self.translator(prompt, max_length=512)
        return self._extract_translation(outputs, 'translation_text', text)

def manipulate_token(token, extra_char='*'):
    """
    Return *token* with *extra_char* inserted after every character.

    Parameters:
    token (str): The token to pad.
    extra_char (str): The filler appended after each character of the token.

    Returns:
    str: The padded token, e.g. "ab" -> "a*b*". An empty token stays empty.
    """
    pieces = []
    for char in token:
        pieces.append(char)
        pieces.append(extra_char)
    return ''.join(pieces)

def manipulate_tokens_in_sentence(sentence, extra_char='*'):
    """
    Pad bracketed tokens of a sentence with a filler character.

    The sentence is split on single spaces. Every token that starts with '['
    or ends with ']' is passed through manipulate_token; all other tokens are
    kept as they are.

    Parameters:
    sentence (str): The input sentence.
    extra_char (str): The filler inserted after each character of a matching token.

    Returns:
    str: The tokens re-joined with single spaces.
    """
    rebuilt = []
    for word in sentence.split(' '):
        is_bracketed = word.startswith('[') or word.endswith(']')
        rebuilt.append(manipulate_token(word, extra_char) if is_bracketed else word)
    return ' '.join(rebuilt)

def clean_token(manipulated_token, extra_char='*'):
    """
    Strip the filler character out of a padded token.

    Parameters:
    manipulated_token (str): The padded token to clean.
    extra_char (str): The filler to remove.

    Returns:
    str: The token with every occurrence of extra_char removed.
    """
    # Every occurrence is dropped, wherever it sits in the token.
    return manipulated_token.replace(extra_char, '')

def clean_tokens_in_sentence(sentence, extra_char='*'):
    """
    Remove the filler character from bracketed tokens of a sentence.

    This is the counterpart of manipulate_tokens_in_sentence. The sentence is
    split on single spaces. A token is cleaned with clean_token when it starts
    with '[' or when it ends with ']' once trailing filler characters are
    ignored; all other tokens are kept as they are.

    Parameters:
    sentence (str): The sentence containing padded tokens.
    extra_char (str): The filler that was inserted after each character.

    Returns:
    str: The tokens re-joined with single spaces.
    """
    tokens = sentence.split(' ')

    # Strip the configured filler (not a hard-coded '*') before looking for the
    # closing bracket, so a custom extra_char is recognised as well.
    cleaned_tokens = [
        clean_token(token, extra_char)
        if token.startswith('[') or token.rstrip(extra_char).endswith(']')
        else token
        for token in tokens
    ]

    return ' '.join(cleaned_tokens)


# Driver: translate the 'english' column with several back ends, score the
# Hungarian outputs against 'translated_value', and write everything to CSV.
source_language = "en"  # Source language code
target_language = "hu"  # Language whose outputs are scored with BLEU below
# Spanish, French, German, Japanese, Arabic, Hindi, and Portuguese.
target_languages = ['es','fr','de','jap','ar','hi','pt']  # Target languages codes
instructions = """
Please fulfill the following conditions when translating.
Purpose of the translation: To market our own brand of broker chooser and stock market to be displayed on
our website.
Target audience: men in their 35s
example translations:
 - "15+ years track record" == "15+ évnyi tapasztalat"
 - "Mutual funds are available" == "Befektetési alapok elérhetőek"
 do not translate any text between [] like [param name]
 Translate the following text:
"""

# Pad [placeholder] tokens with '*' before sending text to Google Translate;
# the padding is removed again after translation.
dataset['english_manipulated'] = dataset['english'].apply(manipulate_tokens_in_sentence)

translator_instructions = LanguageTranslator(source_language, target_language, instructions)
# NOTE(review): `translator` is not used anywhere in the visible code but still
# loads the model; kept in case a later cell relies on it — confirm and remove.
translator = LanguageTranslator(source_language, target_language)
dataset[target_language + '_LLM_instructions'] = dataset['english'].apply(translator_instructions.translate)

# Google Translate baseline for the scored language. GoogleTranslator stores
# `instructions` but does not use them when translating.
google_translator = GoogleTranslator(source_language, target_language, instructions)
dataset[target_language + '_google'] = dataset['english_manipulated'].apply(google_translator.translate)
dataset[target_language + '_google_cleaned'] = dataset[target_language + '_google'].apply(clean_tokens_in_sentence)

# Model translations for the remaining languages.
# NOTE(review): these translators are built without instructions although the
# column suffix says '_LLM_instructions'; the name is kept so the CSV schema
# does not change.
for language in target_languages:
    translator_lan_instructions = LanguageTranslator(source_language, language)
    dataset[language + '_LLM_instructions'] = dataset['english'].apply(translator_lan_instructions.generate_translate)

# Create an instance of the BlueScoreCalculator
bleu_calculator = BlueScoreCalculator()

# Score only the model columns that were actually produced above, and store
# each score in its own column so the translations are not overwritten.
premissas = [
    premissa
    for premissa in ('LLM', 'LLM_instructions')
    if f"{target_language}_{premissa}" in dataset.columns
]
for premissa in premissas:
    translation_column = f"{target_language}_{premissa}"
    score_column = f"bleu_score_{premissa}"
    dataset[score_column] = dataset.apply(
        lambda row: bleu_calculator.calculate_bleu(row['translated_value'], row[translation_column]),
        axis=1
    )

    blue_score = dataset[score_column].mean()
    print(f"{target_language}_{premissa} blue score: {blue_score}")

dataset['bleu_score_google'] = dataset.apply(
    lambda row: bleu_calculator.calculate_bleu(row['translated_value'], row[target_language+'_google_cleaned']),
    axis=1
)
print(f"blue score: {dataset['bleu_score_google'].mean()}")

dataset.to_csv('translations.csv', index=False)



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


Translated Text: 15 éves vagy annál idősebb eredmények
Translated Text: 125 500 ember már talált brókert ezen az eszközön keresztül!
Translated Text: A befektetési alapok rendelkezésre állnak
Translated Text: A legjobb határidős brókerek
Translated Text: Az archivált szálakhoz való hozzáférés csak a regisztrált felhasználók számára érhető el.Ennek nélkül csak 1 hónapig tarthatjuk őket.
Translated Text: a [*d*a*t*a*p*o*i*n*t*s*]*+*kritériumokon keresztül
Translated Text: Alternatívák a [*b*r*o*k*e*r*n*a*m*e*]*-re.
Translated Text: Elérhető a [*c*o*u*n*t*r*y*n*a*m*e*]*-ben
Translated Text: A legjobb forex brókerek
Translated Text: A legjobb tőzsdei alkalmazások
Translated Text: Brokerchooser díjak [*y*e*a*r*]*
Translated Text: A bróker nem érhető el a [*c*o*u*n*t*r*y*t*h*e*n*a*m*e*] -ben.
Translated Text: Számítsa ki a részvénykereskedelmi bizottságot a különféle brókereknél.
Translated Text: Nézze meg a legjobb brókereket a [*c*o*u*n*t*r*y*] -ben.
Translated Text: Töltse ki a bróker kér

config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/312M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/826k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.59M [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu


config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/301M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/778k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/802k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.34M [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu


config.json:   0%|          | 0.00/1.33k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/298M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu


config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/274M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/509k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.02M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.64M [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/308M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/801k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/917k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.12M [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu


config.json:   0%|          | 0.00/1.39k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/306M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/293 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/44.0 [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/812k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/1.07M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

Device set to use cpu
Device set to use cpu
Device set to use cpu
Device set to use cpu


Translated Text:                                               english  \
0                              15+ years track record   
1   125,500 people already found a broker via this...   
2                          Mutual funds are available   
3                                Best futures brokers   
4   Access to archived threads is only available t...   
5                       across [dataPoints]+ criteria   
6                        Alternatives to [brokerName]   
7                          Available in [countryName]   
8                                  Best forex brokers   
9                             Best stock trading apps   
10                        BrokerChooser Awards [year]   
11        Broker is not available in [countryTheName]   
12  Calculate stock trade commission at various br...   
13            Check out the best brokers in [country]   
14              Complete Find My Broker questionnaire   
15                                  FX Fee Calculator   
16            