<a href="https://colab.research.google.com/github/ymoslem/Adaptive-MT-LLM-Fine-tuning/blob/main/ChatGPT-Adaptive-MT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Batch Translation with ChatGPT

This notebook is part of the repository [Adaptive-MT-LLM-Fine-tuning](https://github.com/ymoslem/Adaptive-MT-LLM-Fine-tuning).

# Load files

In [None]:
import os

# Root folder that holds the datasets; its "spanish" subfolder becomes the working directory.
# NOTE(review): the path looks like a mounted Google Drive in Colab — confirm Drive is mounted first.
data_path = "/content/drive/MyDrive/data/"
directory = os.path.join(data_path, "spanish")

# Change into the data folder so that later cells can open files by bare file name,
# then show the resulting working directory as the cell output.
os.chdir(directory)
os.getcwd()

'/content/drive/MyDrive/data/spanish'

In [None]:
# Load test dataset

source_test_file = "all-filtered.es.real.test"
target_test_file = "all-filtered.en.real.test"

# Read the source side, one stripped sentence per line
with open(source_test_file, encoding="utf-8") as source:
  source_sentences = [line.strip() for line in source]

# Read the target (reference) side the same way
with open(target_test_file, encoding="utf-8") as target:
  target_sentences = [line.strip() for line in target]

# Show the first sentence pair as a quick sanity check
print(source_sentences[0], target_sentences[0], sep="\n")

Período de validez después de abierto el envase: 10 horas.
Shelf life after first opening the container: 10 hours.


In [None]:
# Load fuzzy matches from the Context Dataset

online_test_file = "all-filtered.esen.ms-multi-12.online.test"

# Every line is split on " ||| "; the four fields are read below as
# score, fuzzy source, new ("online") source, and fuzzy target.
with open(online_test_file, encoding="utf-8") as online:
  lines = [record.strip().split(" ||| ") for record in online]

# Pull each column out of the parsed rows
scores = [float(fields[0].strip()) for fields in lines]
fuzzy_source_sentences = [fields[1].strip() for fields in lines]
online_source_sentences = [fields[2].strip() for fields in lines]
fuzzy_target_prefixes = [fields[3].strip() for fields in lines]

# Show the first record as a quick sanity check
print(fuzzy_source_sentences[0],
      online_source_sentences[0],
      fuzzy_target_prefixes[0],
      sep="\n")

Período de validez después de abierto el envase: 4 semanas
Período de validez después de abierto el envase: 10 horas.
Shelf life after opening the immediate packaging: 4 weeks.


# Create prompts

In [None]:
# Function to create zero-shot and one-shot prompts

def create_prompt(source_lang,
                  target_lang,
                  fuzzy_sources,
                  fuzzy_targets,
                  new_sources,
                  one_shot=True
                  ):
  """Build one translation prompt per new source sentence.

  Every prompt ends with the line "<target_lang>:" so the model continues
  with the translation.

  Args:
    source_lang: Label placed before source sentences, e.g. "Spanish".
    target_lang: Label placed before target sentences, e.g. "English".
    fuzzy_sources: Source sides of the example pairs (used only if one_shot).
    fuzzy_targets: Target sides of the example pairs (used only if one_shot).
    new_sources: Sentences to be translated.
    one_shot: If True, each prompt starts with the example pair at the same
      position; the lists are combined with zip, so the result is as long as
      the shortest list. If False, only new_sources is used.

  Returns:
    A list of prompt strings.
  """
  src_label = source_lang + ": "
  tgt_label = target_lang + ": "
  cue = target_lang + ":"

  # Zero-shot: just the new source line followed by the target cue
  if not one_shot:
    return [src_label + sentence + "\n" + cue for sentence in new_sources]

  # One-shot: example source, example target, new source, target cue
  examples = zip(fuzzy_sources, fuzzy_targets, new_sources)
  return [
      "\n".join([src_label + example_src,
                 tgt_label + example_tgt,
                 src_label + sentence,
                 cue])
      for example_src, example_tgt, sentence in examples
  ]

In [None]:
# Language names used as the line labels inside each prompt (e.g. "Spanish: ...")
source_lang = "Spanish"
target_lang = "English"

In [None]:
# Create prompts
# one_shot=False builds zero-shot prompts (the fuzzy matches are not used);
# set one_shot=True to prepend the fuzzy source/target pair to each prompt.

prompts = create_prompt(source_lang,
                        target_lang,
                        fuzzy_source_sentences,
                        fuzzy_target_prefixes,
                        online_source_sentences,
                        one_shot=False
                        )

# Number of prompts created
print(len(prompts))

10000


In [None]:
# Inspect the first and the last prompt
print(prompts[0], "\n")
print(prompts[-1])

Spanish: Período de validez después de abierto el envase: 10 horas.
English: 

Spanish: El mecanismo implicado en esta posible asociación es aún especulativo pero puede reflejar la mayor frecuencia en mujeres por la disfunción del esfínter de Oddi como lo señalado por Freeman y cols en su estudio 2.
English:


# Generation

In [None]:
!pip3 install openai --upgrade -q

In [1]:
# Get OpenAI API key from Colab Secrets
# The secret is looked up under the name "openai_api_key".

from google.colab import userdata
OPENAI_API_KEY = userdata.get("openai_api_key")

In [None]:
# ChatGPT generation function
# model: You can change "gpt-3.5-turbo" to "gpt-4", but at a higher cost!

import openai
from tenacity import retry, stop_after_attempt, wait_random_exponential


# ✳️ Add your OpenAI API key
# OPENAI_API_KEY is expected to have been defined by an earlier cell
openai.api_key = OPENAI_API_KEY

@retry(wait=wait_random_exponential(min=2, max=60), stop=stop_after_attempt(6))
def translate(prompt, max_tokens, model, temperature=0.3, top_p=1):
  """Request a single chat completion for `prompt` and return the raw response.

  The prompt is sent as one "user" message. The decorator retries the call
  with a random exponential wait (min=2, max=60) and stops after 6 attempts.

  Args:
    prompt: Text sent as the content of the user message.
    max_tokens: Value forwarded to the API as `max_tokens`.
    model: Model name forwarded to the API as `model`.
    temperature: Value forwarded to the API as `temperature`.
    top_p: Value forwarded to the API as `top_p`.

  Returns:
    The object returned by `openai.chat.completions.create`.
  """
  user_message = {"role": "user", "content": prompt}

  request = {
      "model": model,
      "temperature": temperature,
      "max_tokens": max_tokens,
      "messages": [user_message],
      "top_p": top_p,
      "frequency_penalty": 0,
      "presence_penalty": 0,
      "n": 1,
      # "stop": ["\n"],  # left disabled
  }

  return openai.chat.completions.create(**request)

In [None]:
# Test
# Translate only the first prompt and print the returned text

test_translation = translate(prompt=prompts[0], max_tokens=100, model="gpt-3.5-turbo-1106")
print(test_translation.choices[0].message.content.strip())

Shelf life after opening the package: 10 hours.


## Batch Processing

In [None]:
# Sending batch requests

from concurrent import futures
from concurrent.futures import ThreadPoolExecutor

num_workers = 128

def batch_translate(prompts, **kwargs):
  """Translate several prompts concurrently and return the responses in input order.

  Each prompt is passed to `translate` together with the extra keyword
  arguments. The calls run on a thread pool of `num_workers` threads.

  Args:
    prompts: Iterable of prompt strings.
    **kwargs: Keyword arguments forwarded unchanged to `translate`.

  Returns:
    A list with one `translate` result per prompt.
  """
  def _translate_one(prompt):
    return translate(prompt=prompt, **kwargs)

  with futures.ThreadPoolExecutor(max_workers=num_workers) as pool:
    results = pool.map(_translate_one, prompts)

  # Leaving the "with" block waits for all submitted calls to finish
  return list(results)

In [None]:
# Divide a long list of source sentences into smaller batches

def divide_chunks(l, n):
  """Lazily yield consecutive slices of `l`, each holding at most `n` items.

  Args:
    l: Sequence supporting len() and slicing.
    n: Chunk size; the last chunk may be shorter.

  Yields:
    Slices l[0:n], l[n:2n], ... until the sequence is exhausted.
  """
  yield from (l[begin:begin + n] for begin in range(0, len(l), n))

In [None]:
# Parameters

# Sampling settings passed to every translation request
temperature = 0.3
top_p = 1

# ✳️ Change the batch size for longer inputs/outputs
# Each batch is sent as concurrent requests, followed by a pause.
# Note: Trial accounts allow only 3 requests per minute
batch_size = 20

# ✳️ Change the number of target tokens allowed per source word.
# The token limit of a batch is the word count of its longest source sentence
# multiplied by this value.
# Try 4 for French and Spanish; it can be 5 for some other languages like Arabic.
# You can also use the "tiktoken" library to tokenize the source,
# and then length_multiplier can be based on tokens rather than words.
length_multiplier = 4

In [None]:
# Model name

model = "gpt-3.5-turbo"

# Other models
# model = "gpt-3.5-turbo-1106"
# model = "gpt-4"
# model = "gpt-4-1106-preview"  # GPT-4 TurboNew

In [None]:
# Batch translation

from tqdm.notebook import tqdm
from time import sleep
import json

start = 2380 # change to 0
# NOTE(review): with start > 0 only prompts[start:] are translated, so
# `translations` holds fewer items than `prompts` and does not line up
# index-by-index with the full list of source sentences.

# Translate
translations = []
outputs = []  # defined up front so the stats section works even if no batch runs
remaining_prompts = prompts[start:]
# Ceiling division, so the progress bar also counts a final partial batch
total = (len(remaining_prompts) + batch_size - 1) // batch_size


# Append mode keeps whatever earlier runs already wrote to the file.
# UTF-8 is set explicitly because the JSON is written with ensure_ascii=False.
with open("temp_output.json", "a", encoding="utf-8") as output_file:

  for chunk_prompts in tqdm(divide_chunks(remaining_prompts, batch_size), total=total):
    # Word count of the sentence to translate: it is the second-to-last prompt
    # line, and the leading language label is skipped.
    length = [len(prompt.split("\n")[-2].split(" ")[1:]) for prompt in chunk_prompts]
    # One token limit per batch, derived from its longest source sentence;
    # kept at 1 or more so an empty source never requests zero tokens.
    max_len = max(1, max(length) * length_multiplier)

    outputs = batch_translate(prompts = chunk_prompts,
                              max_tokens = max_len,
                              model = model,
                              temperature=temperature,
                              top_p = top_p)
    # The message content may be None; treat that as an empty translation
    # instead of failing on .strip().
    batch_translations = [(output.choices[0].message.content or "").strip() for output in outputs]
    translations += batch_translations

    # Write raw translations as JSON lines (without handling over-generation)
    output_translations = "\n".join(
        json.dumps({"translation": translation}, ensure_ascii=False)
        for translation in batch_translations
    )
    output_file.write(output_translations + "\n")
    # Flush after every batch so an interrupted run keeps what it finished
    output_file.flush()

    # Pause between batches
    sleep(10)


# Report stats
print("Translations:", len(translations), end="\n\n")
if outputs:
  print("• Last Translation:")
  print("Prompt Tokens:", outputs[-1].usage.prompt_tokens)
  print("Completion Tokens:", outputs[-1].usage.completion_tokens)
  print("Total Tokens:", outputs[-1].usage.total_tokens, end="\n\n")
  print(prompts[-1], end=" ")
  print(translations[-1], sep="\n")

In [None]:
# Number of translations collected by the batch translation cell
len(translations)

In [None]:
# Print the first 5 translations, one per line
print(*translations[:5], sep="\n")

# Save translations

In [None]:
# Name of the output file for the final translations.
# Keep the suffix in line with the model and the one_shot setting that were used.
translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-ChatGPT-gpt-3.5-turbo-zero-shot.en"
# translations_file_name = "all-filtered.esen.ms-multi-12.online.test.translated-ChatGPT-gpt-3.5-turbo-one-shot.en"

In [None]:
!pip3 install nltk -q

import nltk
# Tokenizer data needed by sent_tokenize / word_tokenize.
# The pip install above is unpinned, and recent NLTK releases look for the
# "punkt_tab" resource instead of "punkt", so both are requested here.
# NOTE(review): on an older NLTK the second call should only report that the
# package is unknown — confirm it does not raise.
nltk.download("punkt")
nltk.download("punkt_tab")

In [None]:
# Save translations to a file
# This code also handles over-generation

from nltk import sent_tokenize, word_tokenize
import os

# ✳️ Where to save the translations
# It is better to connect Google Drive, and change 'directory'
directory = ""
output_file_name = translations_file_name
output_path = os.path.join(directory, output_file_name)


def _trim_overgeneration(source, translation):
  """Return `translation` cut back to its first part when it looks over-generated.

  Args:
    source: Source sentence, used only to compare sentence and word counts.
    translation: Raw model output.

  Returns:
    The stripped translation, reduced to its first line if it spans several
    lines, or to its first sentence if it has more sentences than the source
    and more than twice as many word tokens.
  """
  translation = translation.strip()

  # Multi-line output: keep the first line only
  if "\n" in translation:
    return translation.split("\n")[0].strip()

  more_sentences = len(sent_tokenize(translation)) > len(sent_tokenize(source))
  much_longer = len(word_tokenize(translation)) > len(word_tokenize(source)) * 2
  # Both conditions must hold before anything is cut
  if more_sentences and much_longer:
    return sent_tokenize(translation)[0].strip()

  return translation


# zip() pairs items by position and stops at the shorter list, so a length
# mismatch (for example after a run resumed from a non-zero offset) would
# otherwise pass silently.
if len(translations) != len(source_sentences):
  print("Warning:", len(translations), "translations for",
        len(source_sentences), "source sentences; pairs may be misaligned.")

# UTF-8 is set explicitly so non-ASCII output does not depend on the platform default
with open(output_path, "w", encoding="utf-8") as translated_file:
  for source, translation in zip(source_sentences, translations):
    translated_file.write(_trim_overgeneration(source, translation) + "\n")

print("Translation file saved at:", output_path)