<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM/blob/main/MT/ChatGPT-BatchTranslation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Batch Translation with ChatGPT

This notebook is part of the repository [Adaptive-MT-LLM](https://github.com/ymoslem/Adaptive-MT-LLM).

# Load files

In [None]:
# ✳️ Change target language code
tgt = "es"  # ar, es, fr, rw, zh

# Language names used as labels in the prompts, keyed by language code.
# Add an entry here when using a language code that is not listed.
language_names = {
    "ar": "Arabic",
    "es": "Spanish",
    "fr": "French",
    "rw": "Kinyarwanda",
    "zh": "Chinese",
}

# ✳️ Change language
source_lang = "English"
# Derived from `tgt` so the file names and the prompt label cannot drift apart.
target_lang = language_names[tgt]

# Local file names for the source text, the reference translations and the fuzzy matches
source_file_name = f"tico-19-en{tgt}-dedup.en"
target_file_name = f"tico-19-en{tgt}-dedup.{tgt}"
similar_sentences_file_path = f"similar-10-tico.{tgt}.json"

# File that will receive the generated translations
output_translation_file_name = f"tico-en{tgt}-gpt-3.5-turbo-translations.{tgt}"

In [None]:
# Download sample files
# The three URLs below are built from the target language code `tgt`.

source_url = f"https://raw.githubusercontent.com/ymoslem/Adaptive-MT-LLM/main/data/tico-19/tico-19-en{tgt}-dedup.en"
target_url = f"https://raw.githubusercontent.com/ymoslem/Adaptive-MT-LLM/main/data/tico-19/tico-19-en{tgt}-dedup.{tgt}"
fuzzy_url = f"https://github.com/ymoslem/Adaptive-MT-LLM/raw/main/data/fuzzy_matches/similar-10-tico.{tgt}.json.zip"

# Source sentences, written to `source_file_name`
!wget -q $source_url -O $source_file_name
# Target (reference) sentences, written to `target_file_name`
!wget -q $target_url -O $target_file_name
# Fuzzy matches: the zip archive is written to `similar_sentences_file_path` + ".zip",
# then extracted quietly; -o overwrites existing files so the cell can be re-run.
# NOTE(review): presumably the archive contains a file named `similar_sentences_file_path` - confirm.
!wget -q $fuzzy_url -O $similar_sentences_file_path".zip"
!unzip -qq -o $similar_sentences_file_path".zip"

# Printed unconditionally: the exit status of the shell commands above is not checked.
print("Downloaded!")

In [None]:
!ls -lh

In [None]:
# Load source
# The encoding is given explicitly so that non-ASCII text is read the same way
# on every platform, instead of relying on the locale's default encoding.
with open(source_file_name, "r", encoding="utf-8") as source:
  source_sentences = [sent.strip() for sent in source]

# Sanity check: number of lines and the first sentence
print(len(source_sentences))
print(source_sentences[0])

# Load target (references)
with open(target_file_name, "r", encoding="utf-8") as reference:
  target_sentences = [sent.strip() for sent in reference]

print(len(target_sentences))
print(target_sentences[0])

In [None]:
# Load similar translations

import json

# Parse the fuzzy-match JSON file into `paraphrases`
with open(similar_sentences_file_path, 'rb') as json_file:
  paraphrases = json.load(json_file)

# Show the file name and the number of loaded entries, followed by a blank line
print(f"{similar_sentences_file_path}\n{len(paraphrases)}", end="\n\n")

# Preview one entry (displayed as the cell output)
paraphrases[4]

# Create prompts

In [None]:
!pip3 install tqdm -q

In [None]:
# Create a list of prompts with n SIMILAR examples (fuzzy matches)

from tqdm.notebook import tqdm

def create_prompt(source_lang: str,
                  target_lang: str,
                  source_sentences: list,
                  target_sentences: list,
                  paraphrases=None,
                  n_examples=0) -> list:

  """Create prompts of fuzzy matches and the new source sentence to translate
  as in the paper "Adaptive Machine Translation with Large Language Models"
  https://arxiv.org/abs/2301.13294

  source_lang: source language name used as a label, e.g. "English"
  target_lang: target language name used as a label, e.g. "Arabic"
  source_sentences: list of source sentences
  target_sentences: list of target sentences; it is only paired with
    source_sentences, so iteration stops at the shorter of the two lists
  paraphrases: list of dictionaries for fuzzy matches with the keys
    "origin_src", "origin_tgt", "match_src" and "match_tgt" - default None
  n_examples: number of fuzzy matches for each sentence, e.g. 0, 1, 5, 10, etc. - default 0

  Returns a list of prompts, one per source sentence. Each prompt ends with
  "<source_lang>: <source sentence>" followed by a line "<target_lang>:".
  When fuzzy matches are used, up to n_examples example pairs precede it,
  in reverse order of their position in paraphrases.
  """

  use_examples = paraphrases is not None and n_examples > 0

  # Index the fuzzy matches by sentence once, instead of rescanning the whole
  # list for every source sentence. Entries are appended in list order, so the
  # first n_examples of each bucket are the same ones a linear scan would pick.
  # Assumes the sentence values are strings (hashable).
  examples_by_sentence = {}
  if use_examples:
    for paraphrase in paraphrases:
      origin_src = paraphrase["origin_src"]
      match_src = paraphrase["match_src"]
      # A sentence equal to "origin_src" gets the matched pair as its example...
      examples_by_sentence.setdefault(origin_src, []).append(
          (match_src, paraphrase["match_tgt"]))
      # ...and a sentence equal to "match_src" gets the origin pair. When both
      # sides are identical, only the first case applies.
      if match_src != origin_src:
        examples_by_sentence.setdefault(match_src, []).append(
            (origin_src, paraphrase["origin_tgt"]))

  prompts = []

  for source_sentence, _ in tqdm(zip(source_sentences, target_sentences), total=len(source_sentences)):
    # The part every prompt ends with: the sentence to translate and an open target label
    query = source_lang + ": " + source_sentence + "\n" + \
            target_lang + ":"

    # create one-shot or few-shot prompts
    if use_examples:
      selected = examples_by_sentence.get(source_sentence, [])[:n_examples]
      fuzzy_context = [source_lang + ": " + example_source + "\n" + \
                       target_lang + ": " + example_target
                       for example_source, example_target in selected]
      fuzzy_context.reverse()
      prompt = "\n".join(fuzzy_context) + "\n" + query

    # else, create zero-shot prompts
    else:
      prompt = query

    # strip() also removes the leading newline left when no fuzzy match was found
    prompts.append(prompt.strip())

  return prompts

In [None]:
# 🟢 Set n_examples to 0 for zero-shot, 1 for one-shot, etc.
prompts = create_prompt(source_lang,
                        target_lang,
                        source_sentences,
                        target_sentences,
                        paraphrases,
                        n_examples=1
                        )

In [None]:
# Alternatively, to create zero-shot prompts without the previous function
# prompts = [source_lang + ": " + source_sentence + "\n" + target_lang + ":" \
#           for source_sentence in source_sentences]

In [None]:
print(len(prompts))

In [None]:
print(prompts[0])

# Generation

In [None]:
!pip3 install openai --upgrade -q

In [None]:
# ChatGPT generation function
# model: You can change "gpt-3.5-turbo" to "gpt-4", but for higher costs!

import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential


# ✳️ Add your OpenAI API key
# Preferred: set the OPENAI_API_KEY environment variable so the secret is not
# stored in the notebook. If it is not set, the placeholder below is used and
# must be replaced with a real key.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY", "your-API-key-here")
openai.api_key = OPENAI_API_KEY

# Failed calls are retried with a random exponential wait (min=2, max=60),
# giving up after 6 attempts.
@retry(wait=wait_random_exponential(min=2, max=60), stop=stop_after_attempt(6))
def translate(prompt, max_tokens, temperature=0.3, top_p=1, model="gpt-3.5-turbo"):
  """Send one prompt to the chat completions endpoint and return the response.

  prompt: text sent as the content of a single "user" message
  max_tokens: maximum number of tokens to generate
  temperature: sampling temperature - default 0.3
  top_p: nucleus sampling value - default 1
  model: name of the chat model to call - default "gpt-3.5-turbo"

  Returns the response object as received from the client; one completion
  is requested (n=1).
  """
  response = openai.chat.completions.create(
      model=model,
      temperature=temperature,
      max_tokens=max_tokens,
      messages=[
          {"role": "user",
           "content": prompt}
      ],
      top_p=top_p,
      frequency_penalty=0,
      presence_penalty=0,
      n=1,
      # stop=["\n"],  # left disabled, as in the original cell
  )

  return response

In [None]:
# Test
test_translation = translate(prompt=prompts[0], max_tokens=100)
print(test_translation)

## Batch Processing

In [None]:
# Sending batch requests

from concurrent import futures
from concurrent.futures import ThreadPoolExecutor

# Upper bound on the number of threads used for one batch
num_workers = 128

def batch_translate(prompts, **kwargs):
  """Translate all prompts concurrently and return the responses in prompt order.

  prompts: iterable of prompt strings
  kwargs: extra keyword arguments passed on to translate() for every prompt
  """
  def _send(prompt):
    return translate(prompt=prompt, **kwargs)

  with futures.ThreadPoolExecutor(max_workers=num_workers) as pool:
    pending = pool.map(_send, prompts)

  # The results are collected after the pool has shut down
  return [result for result in pending]

In [None]:
# Divide a long list of source sentences into smaller batches

def divide_chunks(l, n):
  """Yield consecutive slices of l, each holding at most n items."""
  yield from (l[start:start + n] for start in range(0, len(l), n))

In [None]:
# Parameters

# Sampling settings passed to the translation requests
temperature = 0.3
top_p = 1

# ✳️ Change the batch size for longer inputs/outputs
# batch_size is the number of prompts grouped into one batch of requests.
# Note: Trial accounts allow only 3 requests per minute
batch_size = 40

# ✳️ Change the number of target tokens allowed per source word.
# For each batch, max_tokens = (word count of its longest source sentence) * length_multiplier.
# Try 4 for French and Spanish; it can be 5 for some other languages like Arabic.
# You can also use the "tiktoken" library to tokenize the source,
# and then length_multiplier can be based on tokens rather than words.
length_multiplier = 4

In [None]:
# Batch translation

from tqdm.notebook import tqdm
from time import sleep


# Translate
translations = []
# Ceiling division: a final, partially filled batch still counts as one
# iteration, so the progress bar total matches the number of chunks.
total = (len(prompts) + batch_size - 1) // batch_size

for chunk_prompts in tqdm(divide_chunks(prompts, batch_size), total=total):
  # Word count of the source sentence in each prompt: take the second-to-last
  # line and drop its first space-separated item (the language label;
  # assumes the label is a single word).
  length = [len(prompt.split("\n")[-2].split(" ")[1:]) for prompt in chunk_prompts]
  # One token limit per batch, based on its longest source sentence
  max_len = max(length) * length_multiplier

  outputs = batch_translate(prompts = chunk_prompts,
                            max_tokens = max_len,
                            temperature=temperature,
                            top_p = top_p)
  batch_translations = [output.choices[0].message.content.strip() for output in outputs]
  translations += batch_translations

  # Pause between batches
  sleep(3)


# Report stats
print("Translations:", len(translations), end="\n\n")

# `outputs` only exists after at least one batch has run, so the details of
# the last translation are skipped when there was nothing to translate.
if translations:
  print("• Last Translation:")
  print("Prompt Tokens:", outputs[-1].usage.prompt_tokens)
  print("Completion Tokens:", outputs[-1].usage.completion_tokens)
  print("Total Tokens:", outputs[-1].usage.total_tokens, end="\n\n")
  print(prompts[-1], end=" ")
  print(translations[-1], sep="\n")

In [None]:
# Print the first 5 translations
print(*translations[:5], sep="\n")

# Save translations

In [None]:
!pip3 install nltk -q

import nltk
nltk.download("punkt")

In [None]:
# Save translations to a file
# This code also handles over-generation

from nltk import sent_tokenize, word_tokenize
import os

# ✳️ Where to save the translations
# It is better to connect Google Drive, and change 'directory'
directory = ""
output_file_name = output_translation_file_name
output_path = os.path.join(directory, output_file_name)

# The encoding is explicit so that non-ASCII translations are written the same
# way on every platform. The file is only written, so plain "w" is enough.
with open(output_path, "w", encoding="utf-8") as translated_file:
  for source, translation in zip(source_sentences, translations):
    translation = translation.strip()

    # Over-generation, case 1: the output spans several lines - keep the first line
    if "\n" in translation:
      translation = translation.split("\n")[0]
    # Over-generation, case 2: the output has more sentences than the source
    # and more than twice as many words - keep the first sentence
    elif (len(sent_tokenize(translation)) > len(sent_tokenize(source))
          and len(word_tokenize(translation)) > len(word_tokenize(source))*2):
      translation = sent_tokenize(translation)[0]

    # Exactly one line is written per source sentence
    translated_file.write(translation.strip() + "\n")

print("Translation file saved at:", output_path)

## Save to Google Drive

In [None]:
from google.colab import drive
drive.mount("/content/drive")

In [None]:
!mkdir /content/drive/MyDrive/chatGPT-translations/

In [None]:
new_path = f"/content/drive/MyDrive/chatGPT-translations/" + output_translation_file_name

!mv -nv $output_path $new_path