In [103]:
!pip install transformers datasets openai python-dotenv together

Collecting together
  Downloading together-1.2.1-py3-none-any.whl.metadata (11 kB)
Collecting eval-type-backport<0.3.0,>=0.1.3 (from together)
  Downloading eval_type_backport-0.2.0-py3-none-any.whl.metadata (2.2 kB)
Collecting pillow<11.0.0,>=10.3.0 (from together)
  Downloading pillow-10.3.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting tabulate<0.10.0,>=0.9.0 (from together)
  Downloading tabulate-0.9.0-py3-none-any.whl.metadata (34 kB)
Collecting typer<0.13,>=0.9 (from together)
  Downloading typer-0.12.3-py3-none-any.whl.metadata (15 kB)
Collecting shellingham>=1.3.0 (from typer<0.13,>=0.9->together)
  Downloading shellingham-1.5.4-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting rich>=10.11.0 (from typer<0.13,>=0.9->together)
  Downloading rich-13.7.1-py3-none-any.whl.metadata (18 kB)
Collecting markdown-it-py>=2.2.0 (from rich>=10.11.0->typer<0.13,>=0.9->together)
  Downloading markdown_it_py-3.0.0-py3-none-any.whl.metadata (6.9 kB)
Collecting mdurl~=0.1 (f

In [104]:
from itertools import permutations, combinations
from datasets import load_dataset
from dotenv import load_dotenv
import pandas as pd
import random
import json
import os
import re

from openai import OpenAI

# Load environment variables from ../.env before the API key is read below.
load_dotenv("../.env")

# Module-level OpenAI client used by run_batch and get_batch_results.
# os.environ[...] raises KeyError when OPENAI_API_KEY is not set.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

In [100]:
# UTIL FUNCTIONS

# Chat template for a single training example: a system turn, a user turn and an
# assistant turn. The {system_prompt}, {user_message} and {model_answer} placeholders
# are filled in with str.format by example_to_llama3.
LLAMA3_PROMPT = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>

{system_prompt}
<|eot_id|><|start_header_id|>user<|end_header_id|>

{user_message}<|eot_id|><|start_header_id|>assistant<|end_header_id|>

{model_answer}<|eot_id|>"""

def remove_indexes(lst, indexes):
    """Return a new list holding the items of ``lst`` whose position is not in ``indexes``.

    ``lst`` itself is left untouched; positions in ``indexes`` that do not exist
    in ``lst`` are simply ignored.
    """
    dropped_positions = set(indexes)  # set membership keeps each lookup O(1)
    kept = []
    for position, element in enumerate(lst):
        if position in dropped_positions:
            continue
        kept.append(element)
    return kept

def get_lang_directions(languages, base_languages, order_matters=True, skip_base_pairs=False):
    """Generate the language directions between each base language and the other languages.

    Args:
        languages: All language codes to consider.
        base_languages: Language codes of which one must take part in every direction.
        order_matters: If True, both ``(base, lang)`` and ``(lang, base)`` are produced.
            If False, one alphabetically sorted pair is produced per language pair.
        skip_base_pairs: If True, two base languages are never paired with each other.

    Returns:
        A list of 2-tuples of language codes, in first-seen order and without duplicates.
    """
    languages = list(languages)
    base_languages = list(base_languages)
    base_set = set(base_languages)

    directions = []
    # Tracks what was already emitted: without it, two base languages would produce
    # (a, b) / (b, a) once while visiting a and again while visiting b.
    seen = set()

    def _add(pair):
        if pair not in seen:
            seen.add(pair)
            directions.append(pair)

    for base in base_languages:
        for lang in languages:
            if lang == base:
                continue
            if skip_base_pairs and lang in base_set:
                continue

            if order_matters:
                _add((base, lang))
                _add((lang, base))
            else:
                _add(tuple(sorted([base, lang])))

    return directions

def create_batch_requests(df, prompt, model, output_filename):
    """Write one chat-completion batch request per row of ``df`` to a .jsonl file.

    Every request uses ``prompt`` as the system message. The user message holds the
    row's source and target sentences, each labelled with the language name looked up
    in LANGUAGE_NAMES, followed by a "Cleaned:" line. The row's index is embedded in
    ``custom_id`` as ``batch-clean-<index>``.
    """
    with open(output_filename, "w") as jsonl_file:
        for row_id, row in df.iterrows():
            source_lang_name = LANGUAGE_NAMES[row["source_lang"]]
            target_lang_name = LANGUAGE_NAMES[row["target_lang"]]

            system_message = {"role": "system", "content": prompt}
            user_message = {
                "role": "user",
                "content": f'{source_lang_name}: {row["source_sentence"]}\n{target_lang_name}: {row["target_sentence"]}\nCleaned:\n',
            }
            request = {
                "custom_id": f"batch-clean-{row_id}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [system_message, user_message],
                    "max_tokens": 256,
                },
            }

            # One JSON document per line (.jsonl)
            jsonl_file.write(json.dumps(request) + "\n")

def run_batch(filename):
    """Upload a batch request file and start a batch job for it.

    Args:
        filename: Path of the .jsonl request file to upload.

    Returns:
        The object returned by ``client.batches.create``.
    """
    # Open inside a context manager so the handle is closed once the upload is done.
    with open(filename, "rb") as request_file:
        batch_input_file = client.files.create(
            file=request_file,
            purpose="batch"
        )

    return client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={
          "description": "batch clean job"
        }
    )

def get_batch_results(batch_id):
    """Download the output of a batch job and parse the cleaned sentence pairs.

    A response is expected to contain two consecutive "<label>: <sentence>" lines.
    Responses that do not match are reported with a warning and stored as two
    empty strings.

    Args:
        batch_id: Id of the batch whose output file should be read.

    Returns:
        A DataFrame with the columns "source_sentence_cleaned" and
        "target_sentence_cleaned", indexed by the integer taken from each
        request's ``custom_id`` ("batch-clean-<index>").

    Raises:
        RuntimeError: If the batch has no output file.
    """
    file_id = client.batches.retrieve(batch_id).output_file_id
    if file_id is None:
        raise RuntimeError(f"Batch {batch_id} has no output file yet; wait for it to complete")
    results_str = client.files.content(file_id).content.decode("utf-8")

    # Groups: (1) first label, (2) first sentence, (3) second label, (4) second sentence
    re_pattern = r'(\w+):\s*([^\n]+)\n(\w+):\s*(.+)'

    response_df = pd.DataFrame(columns=["source_sentence_cleaned", "target_sentence_cleaned"])
    # Split on "\n" only (str.splitlines would also break on other separators that may
    # occur inside a JSON string) and skip blank lines instead of assuming that exactly
    # the last element is empty.
    for line in results_str.split('\n'):
        if not line.strip():
            continue
        o = json.loads(line)
        i = int(o["custom_id"].replace('batch-clean-', ''))

        response_str = o["response"]["body"]["choices"][0]["message"]["content"]
        match = re.search(re_pattern, response_str)
        if match is None:
            print(f"WARNING: Unparsable response: \"{response_str}\", response will be saved as a empty string")
            source_sentence = ""
            target_sentence = ""
        else:
            source_sentence = match.group(2)
            target_sentence = match.group(4)
        response_df.loc[i] = [source_sentence, target_sentence]
    return response_df

def example_to_llama3(example, system_prompt):
    """Render one example as a single LLAMA3_PROMPT-formatted training string.

    The user turn carries the original sentence pair and the assistant turn carries
    the cleaned pair; every line is labelled with the language name from LANGUAGE_NAMES.
    """
    src_lang = LANGUAGE_NAMES[example["source_lang"]]
    tgt_lang = LANGUAGE_NAMES[example["target_lang"]]

    # NOTE(review): the target line has no space after the colon while the source line
    # has one - confirm this asymmetry is intended before changing the data format.
    template_fields = {
        "system_prompt": system_prompt,
        "user_message": f'{src_lang}: {example["source_sentence"]}\n{tgt_lang}:{example["target_sentence"]}',
        "model_answer": f'{src_lang}: {example["source_sentence_cleaned"]}\n{tgt_lang}:{example["target_sentence_cleaned"]}',
    }
    return LLAMA3_PROMPT.format(**template_fields)

In [7]:
# LOAD DATASETS

# All language codes that take part in the language directions built below.
LANGUAGES = [
    'ban_Latn',
    'ace_Latn',
    'bjn_Latn',
    'bug_Latn',
    'min_Latn',
    'sun_Latn',
    'jav_Latn',
    'ind_Latn',
    'eng_Latn'
]
# Codes passed as `base_languages` to get_lang_directions.
BASE_LANGUAGES = [
    "eng_Latn",
    "ind_Latn"
]
# Language code -> language name used to label sentences in prompts and examples.
LANGUAGE_NAMES = {
    'ban_Latn': 'Balinese',
    'ace_Latn': 'Acehnese',
    'bjn_Latn': 'Banjar',
    'bug_Latn': 'Buginese',
    'min_Latn': 'Minangkabau',
    'sun_Latn': 'Sundanese',
    'jav_Latn': 'Javanese',
    'ind_Latn': 'Indonesian',
    'eng_Latn': 'English'
}
# One sorted pair per (base language, non-base language) combination.
DIRECTIONS = get_lang_directions(LANGUAGES, BASE_LANGUAGES, order_matters=False, skip_base_pairs=True)
datasets = []
remove_indices = []

for i, (lang1, lang2) in enumerate(DIRECTIONS):
    # The config may be named with either ordering of the two codes, so try both.
    for config_name in (f"{lang1}-{lang2}", f"{lang2}-{lang1}"):
        try:
            dataset = load_dataset("allenai/nllb", config_name, ignore_verifications=True, trust_remote_code=True)["train"]
        except ValueError:
            continue
        datasets.append(dataset)
        break
    else:
        # Neither ordering could be loaded: remember the position so it can be dropped.
        remove_indices.append(i)
        print(f"WARNING: Removed {lang1}-{lang2} direction because it cannot be found in the dataset")

# Keep DIRECTIONS aligned one-to-one with `datasets`.
DIRECTIONS = remove_indexes(DIRECTIONS, remove_indices)




Downloading builder script:   0%|          | 0.00/9.49k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.05M [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/38.6k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/81.9k [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/57.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/324936 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/413M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2443442 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/141M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/766894 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/213M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1240098 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/47.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/301972 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/2.16G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/11779642 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/3.04G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/16948924 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/17 [00:00<?, ?it/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/152M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/732976 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/291M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1454976 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/317M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1455384 [00:00<?, ? examples/s]



Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/227M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1466998 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/637M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/3029624 [00:00<?, ? examples/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data:   0%|          | 0.00/346M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1929462 [00:00<?, ? examples/s]

In [8]:
# Create the output directory (and any missing parents) from Python instead of the shell,
# so the cell does not depend on a `mkdir -p` command being available.
os.makedirs("../data/cleaner", exist_ok=True)

In [13]:
# FILTERING

# Number of examples to keep for every language direction.
NUM_SAMPLES_PER_DIRECTION = 100
# Minimum value required for both `source_sentence_lid` and `target_sentence_lid`.
LID_THRESHOLD = 0.9
# Minimum value required for `laser_score`.
LASER_SCORE_THRESHOLD = 1.07

# Cache of the filtered examples; delete this file to force the filtering to run again.
DF_STORE = "../data/cleaner/examples_df.pkl"

# Decide on the cache file itself: a leftover `df` kernel variable says nothing about
# whether the pickle exists.
if os.path.exists(DF_STORE):
    df = pd.read_pickle(DF_STORE)
else:
    # Collect plain rows and build the DataFrame once instead of growing it row by row.
    records = []
    for direction, d in zip(DIRECTIONS, datasets):
        n = 0
        for ex in d:
            # `>=` so that exactly NUM_SAMPLES_PER_DIRECTION examples are kept
            if n >= NUM_SAMPLES_PER_DIRECTION:
                break

            if (
                ex["laser_score"] >= LASER_SCORE_THRESHOLD
                and ex["source_sentence_lid"] >= LID_THRESHOLD
                and ex["target_sentence_lid"] >= LID_THRESHOLD
            ):
                records.append([
                    direction[0],
                    direction[1],
                    ex["translation"][direction[0]],
                    ex["translation"][direction[1]],
                ])
                n += 1

    df = pd.DataFrame(records, columns=["source_lang", "target_lang", "source_sentence", "target_sentence"])
    df.to_pickle(DF_STORE)

In [15]:
# CREATE EXAMPLES FOR FINETUNED GPT

# File the batch requests for the fine-tuned model are written to.
FT_GPT_EXAMPLE_FILEPATH = "../data/cleaner/ft-examples.jsonl"
# System prompt sent with every request to the fine-tuned model.
FT_PROMPT = "Clean the data by identifying and fixing problems in parallel sentences. The problems include misalignment, repetition, incomplete translations, and inconsistent formatting."


create_batch_requests(
    df,
    prompt=FT_PROMPT,  # the instruction text, not the output file path
    model="ft:gpt-3.5-turbo-0125:personal::9T5FmQKr",
    output_filename=FT_GPT_EXAMPLE_FILEPATH
)
ft_batch = run_batch(FT_GPT_EXAMPLE_FILEPATH)

In [106]:
ft_results = get_batch_results(ft_batch.id)
# Inner join: with the default left join, rows that have no batch response get NaN in
# the cleaned columns, and NaN != "" is True, so they would pass the filter below.
ft_df = df.join(ft_results, how="inner")
# Drop rows whose response could not be parsed (stored as empty strings).
ft_df = ft_df[(ft_df["source_sentence_cleaned"] != "") & (ft_df["target_sentence_cleaned"] != "")]



In [108]:
# System prompt placed in the system turn of every Llama 3 training example.
LLAMA3_SYSTEM_PROMPT = "Clean the data by identifying and fixing problems in parallel sentences. The problems include misalignment, repetition, incomplete translations, and inconsistent formatting. Provide the cleaned output without any repetition."

# Write one {"text": ...} JSON object per line, one line per row of ft_df.
with open("../data/cleaner/together_cleaner_examples.jsonl", "w") as examples_file:
    for _, example in ft_df.iterrows():
        record = {"text": example_to_llama3(example, LLAMA3_SYSTEM_PROMPT)}
        examples_file.write(json.dumps(record) + "\n")



In [17]:
# CREATE EXAMPLES FOR FEW SHOT GPT 4o

# File the GPT-4o batch requests are written to.
GPT4_EXAMPLE_FILEPATH = "../data/cleaner/gpt4-examples.jsonl"
# Few-shot system prompt: the cleaning instruction followed by five worked examples,
# each given as the original sentence pair and its "Cleaned:" version.
GPT4_PROMPT = """
Clean the data by identifying and fixing problems in parallel sentences. The problems include misalignment, repetition, incomplete translations, and inconsistent formatting. Provide the cleaned output without any repetition.

Examples:

1. 
Balinese: Anake akeh punika tumuli pada mabebaosan, sapuniki: 'Singke Anake ene pianak Yusupe?'
English: They said among themselves, 'Could this, at last, be the Son of Joseph?'
Cleaned:
Balinese: Anake akeh punika tumuli pada mabebaosan, sapuniki: 'Singke Anake ene pianak Yusupe?'
English: They said among themselves, 'Is this not Joseph's son?'

2.
Balinese: 15 Paurukan Ida Sang Panembahan sane tuturang tiang ring parasemeton, kadi asapuniki: iraga sane kantun urip ring rahina pangrauh Ida Sang Panembahan, tan pacang ngriinin anake sane sampun padem.
English: 15) According to the Lord's word, we tell you that we who are still alive, who are left until the coming of the Lord, will certainly not precede those who have fallen asleep.
Cleaned:
Balinese: 15 Paurukan Ida Sang Panembahan sane tuturang tiang ring parasemeton, kadi asapuniki: iraga sane kantun urip ring rahina pangrauh Ida Sang Panembahan, tan pacang ngriinin anake sane sampun padem.
English: 15 According to the Lord's word, we tell you that we who are still alive, who are left until the coming of the Lord, will certainly not precede those who have fallen asleep.

3.
Balinese: 4.Sane ngiring Ida wantah parautusan Idane kewanten.'
English: Only the APOSTLES were there.
Cleaned:
Balinese: Sane ngiring Ida wantah parautusan Idane kewanten.
English: Only the apostles were there.

4.
Balinese: Bali: Sakadi sane sinurat ring Cakepan Sucine: 'Tusing ada anak ane patut,'
English: As it is written: 'There is none righteous, no, not one (Rom.'
Cleaned:
Balinese: Sakadi sane sinurat ring Cakepan Sucine: 'Tusing ada anak ane patut,'
English: As it is written: 'There is none righteous, no, not one.'

5.
Balinese: 31Uning semeton napi ke punika rahina soma ribek?
English: 31 And what do you know what the Day of Decision is?
Cleaned:
Balinese: 31 Uning semeton napi ke punika rahina soma ribek?
English: 31 And what do you know what the Day of Decision is?
"""

# Write the few-shot GPT-4o requests for every row of df, then submit the file as a batch job.
create_batch_requests(df=df, prompt=GPT4_PROMPT, model="gpt-4o", output_filename=GPT4_EXAMPLE_FILEPATH)
gpt4_batch = run_batch(filename=GPT4_EXAMPLE_FILEPATH)

In [82]:
# Fetch the parsed GPT-4o responses and attach them to the original rows by index.
gpt4_results = get_batch_results(batch_id=gpt4_batch.id)
gpt4_df = df.join(gpt4_results, how="left")


Indonesian: Apalagi visi menuju Bali Era Baru yang sedang proses untuk diwujudkannya dengan menata secara fundamental dan komprehensif pembangunan di Bali, utamanya di bidang adat dan budaya yang ditransformasikan ke dalam pengakuan dan penguatan desa adat sangat sejalan dengan prinsip Tri Sakti yang pernah disampaikan oleh Ir. Soekarno.", response will be saved as a empty string
Minangkabau:", response will be saved as a empty string
