In [1]:
!git clone https://github.com/taidopurason/llm-mt
!pip install ./llm-mt

Cloning into 'llm-mt'...
remote: Enumerating objects: 2830, done.[K
remote: Counting objects: 100% (311/311), done.[K
remote: Compressing objects: 100% (276/276), done.[K
remote: Total 2830 (delta 37), reused 305 (delta 35), pack-reused 2519[K
Receiving objects: 100% (2830/2830), 2.14 MiB | 10.34 MiB/s, done.
Resolving deltas: 100% (151/151), done.
Processing ./llm-mt
  Preparing metadata (setup.py) ... [?25l- \ done
Collecting openai (from llm-mt-eval==0.1.0)
  Downloading openai-0.27.10-py3-none-any.whl (76 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken (from llm-mt-eval==0.1.0)
  Downloading tiktoken-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.7/1.7 MB[0m [31m46.4 MB/s[0m eta [36m0:00:00[0m
Collecting sacrebleu==2.3.1 (from llm-mt-eval==0.1.0)
  Downloading 

In [2]:
import logging

# Configure root logging for the notebook at INFO level.
# basicConfig() is a no-op when the root logger already has handlers, so
# force=True is used to remove *and close* any pre-existing root handlers
# before installing the handler with the format below.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(name)s | %(message)s",
    datefmt="%Y-%m-%d %H:%M:%S",
    level=logging.INFO,
    force=True,
)

In [3]:
import os

import openai

# Resolve the OpenAI API key without hardcoding it in the notebook.
try:
    # For Kaggle notebooks: the key is read from the Kaggle user secret
    # named "tartunlp_openai".
    from kaggle_secrets import UserSecretsClient
except ImportError:
    # For everyone else: the key is taken from the OPENAI_API_KEY environment
    # variable (raises KeyError if it is not set).
    openai.api_key = os.environ["OPENAI_API_KEY"]
else:
    user_secrets = UserSecretsClient()
    openai.api_key = user_secrets.get_secret("tartunlp_openai")

In [4]:
from llm_mt_eval.utils import read_json
from llm_mt_eval.translate import translate_document_files
from pathlib import Path
from typing import List


# Language code -> English language name; the names are substituted into the
# translation prompt as the source/target language.
language_name_map = dict(
    et="Estonian",
    en="English",
    de="German",
    ru="Russian",
)

# Language pair ("<src>-<tgt>") -> file names of the documents to translate for
# that pair. The order of each list is kept exactly as originally specified.
lang_pair_doc_names = {
    "et-en": [
        "doc-1-1.txt", "doc-1-5.txt", "doc-1-7.txt", "doc-2-13.txt", "doc-2-14.txt",
        "doc-2-16.txt", "doc-2-19.txt", "doc-2-21.txt", "doc-2-22.txt", "doc-2-28.txt",
        "doc-3-36.txt", "doc-3-37.txt", "doc-3-39.txt", "doc-3-41.txt", "doc-3-43.txt",
        "doc-3-44.txt", "doc-3-46.txt", "doc-3-47.txt", "doc-3-49.txt", "doc-3-51.txt",
        "doc-3-52.txt", "doc-3-54.txt", "doc-3-58.txt", "doc-3-59.txt", "doc-3-62.txt",
        "doc-3-64.txt", "doc-3-71.txt", "doc-3-73.txt", "doc-3-74.txt", "doc-3-75.txt",
        "doc-3-76.txt", "doc-3-77.txt", "doc-3-78.txt", "doc-3-79.txt", "doc-4-81.txt",
        "doc-4-85.txt", "doc-4-86.txt", "doc-4-89.txt", "doc-4-92.txt", "doc-4-93.txt",
        "doc-4-94.txt", "doc-4-95.txt", "doc-5-105.txt", "doc-5-108.txt", "doc-5-110.txt",
    ],
    "en-et": [
        "doc-1-1.txt", "doc-1-13.txt", "doc-1-15.txt", "doc-1-17.txt", "doc-1-18.txt",
        "doc-1-20.txt", "doc-1-21.txt", "doc-1-28.txt", "doc-1-3.txt", "doc-1-30.txt",
        "doc-1-33.txt", "doc-1-34.txt", "doc-1-36.txt", "doc-1-38.txt", "doc-1-39.txt",
        "doc-1-4.txt", "doc-1-41.txt", "doc-1-42.txt", "doc-1-43.txt", "doc-1-44.txt",
        "doc-1-45.txt", "doc-1-48.txt", "doc-1-49.txt", "doc-1-51.txt", "doc-1-53.txt",
        "doc-1-6.txt", "doc-2-55.txt", "doc-2-56.txt", "doc-2-57.txt", "doc-2-60.txt",
        "doc-2-61.txt", "doc-2-62.txt", "doc-2-66.txt", "doc-2-67.txt", "doc-2-70.txt",
        "doc-3-71.txt", "doc-3-75.txt", "doc-3-76.txt", "doc-3-78.txt", "doc-3-80.txt",
        "doc-3-81.txt", "doc-4-85.txt", "doc-4-86.txt", "doc-4-87.txt", "doc-4-89.txt",
    ],
}
    
# Prompt sent for every request; the {src_lang}/{tgt_lang}/{sentence}
# placeholders are left unformatted here and passed through as-is.
PROMPT_TEMPLATE = "Translate the following {src_lang} text into {tgt_lang}:\n{sentence}"

# Rates used for the cost estimate below. The division by 1000 suggests these
# are prices per 1,000 tokens -- presumably USD; confirm against current pricing.
PROMPT_RATE = 0.0015
COMPLETION_RATE = 0.002

DATA_ROOT = "llm-mt/data/mtee-news"

# Translate the selected documents of each language pair, then estimate the
# API cost from the token usage recorded in the saved responses.
for lang_pair in ("et-en",):
    src, tgt = lang_pair.split("-")
    src_dir = f"{DATA_ROOT}/{src}_src"
    ref_dir = f"{DATA_ROOT}/{lang_pair}_ref"
    src_lang, tgt_lang = language_name_map[src], language_name_map[tgt]
    doc_names = lang_pair_doc_names[lang_pair]

    print(src_dir, ref_dir, src_lang, tgt_lang)
    print("translated doc names", doc_names)

    # Every output artefact of this run shares the same file-name prefix.
    out_prefix = f"mtee-news.p2.{lang_pair}"
    response_path = f"{out_prefix}.response.json"

    translate_document_files(
        src_lang=src_lang,
        tgt_lang=tgt_lang,
        src_dir=src_dir,
        ref_dir=ref_dir,
        doc_names=doc_names,
        prompt=PROMPT_TEMPLATE,
        response_out_path=response_path,
        hyp_out_dir=f"{out_prefix}_hyps",
        concat_hyp_out_path=f"{out_prefix}.hyp",
        sent_delimiter="\n",
        metrics=("bleu", "chrf", "chrf++"),
    )

    # Read the responses back from disk and total the token usage they report.
    responses = read_json(response_path)
    usage_records = [response["usage"] for response in responses]
    prompt_tokens = sum(usage["prompt_tokens"] for usage in usage_records)
    completion_tokens = sum(usage["completion_tokens"] for usage in usage_records)
    cost = (prompt_tokens * PROMPT_RATE + completion_tokens * COMPLETION_RATE) / 1000
    print("Cost:", cost)

llm-mt/data/mtee-news/et_src llm-mt/data/mtee-news/et-en_ref Estonian English
translated doc names ['doc-1-1.txt', 'doc-1-5.txt', 'doc-1-7.txt', 'doc-2-13.txt', 'doc-2-14.txt', 'doc-2-16.txt', 'doc-2-19.txt', 'doc-2-21.txt', 'doc-2-22.txt', 'doc-2-28.txt', 'doc-3-36.txt', 'doc-3-37.txt', 'doc-3-39.txt', 'doc-3-41.txt', 'doc-3-43.txt', 'doc-3-44.txt', 'doc-3-46.txt', 'doc-3-47.txt', 'doc-3-49.txt', 'doc-3-51.txt', 'doc-3-52.txt', 'doc-3-54.txt', 'doc-3-58.txt', 'doc-3-59.txt', 'doc-3-62.txt', 'doc-3-64.txt', 'doc-3-71.txt', 'doc-3-73.txt', 'doc-3-74.txt', 'doc-3-75.txt', 'doc-3-76.txt', 'doc-3-77.txt', 'doc-3-78.txt', 'doc-3-79.txt', 'doc-4-81.txt', 'doc-4-85.txt', 'doc-4-86.txt', 'doc-4-89.txt', 'doc-4-92.txt', 'doc-4-93.txt', 'doc-4-94.txt', 'doc-4-95.txt', 'doc-5-105.txt', 'doc-5-108.txt', 'doc-5-110.txt']


  0%|          | 0/45 [00:00<?, ?it/s]2023-08-31 10:16:15 | INFO | llm_mt_eval.translate | Translating doc-1-1.txt (llm-mt/data/mtee-news/et_src/doc-1-1.txt)
2023-08-31 10:16:17 | INFO | llm_mt_eval.translate | translating a request with 1090 tokens
  2%|▏         | 1/45 [00:20<15:00, 20.46s/it]2023-08-31 10:16:35 | INFO | llm_mt_eval.translate | Translating doc-1-5.txt (llm-mt/data/mtee-news/et_src/doc-1-5.txt)
2023-08-31 10:16:35 | INFO | llm_mt_eval.translate | translating a request with 627 tokens
  4%|▍         | 2/45 [00:30<10:28, 14.62s/it]2023-08-31 10:16:46 | INFO | llm_mt_eval.translate | Translating doc-1-7.txt (llm-mt/data/mtee-news/et_src/doc-1-7.txt)
2023-08-31 10:16:46 | INFO | llm_mt_eval.translate | translating a request with 375 tokens
  7%|▋         | 3/45 [00:38<08:03, 11.52s/it]2023-08-31 10:16:53 | INFO | llm_mt_eval.translate | Translating doc-2-13.txt (llm-mt/data/mtee-news/et_src/doc-2-13.txt)
2023-08-31 10:16:53 | INFO | llm_mt_eval.translate | translating a r

Cost: 0.06851600000000001
