In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from tqdm import tqdm
import pandas as pd
import os
import json
import subprocess

FLORES_DIR = "../data/flores-200/"

def get_lang_directions(languages, base_languages, order_matters=True, skip_base_pairs=False):
    """Helper function to generate the language directions."""

    directions = []
    
    for base in base_languages:
        if skip_base_pairs:
            other_languages = [lang for lang in languages if lang not in base_languages]
        else:
            other_languages = [lang for lang in languages if lang != base]

        for lang in other_languages:

            if order_matters:
                directions.append((base, lang))
                directions.append((lang, base))
            else:
                pair = tuple(sorted([base, lang]))
                if pair not in directions:
                    directions.append(pair)
    
    return directions

def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

def translate(model, tokenizer, sentences, target_language):
    translated_sentences = []
    
    for c in tqdm(chunks(sentences, BATCH_SIZE), total=int(len(sentences)/BATCH_SIZE)):
        # print(c)
        inputs = tokenizer(c, return_tensors="pt", padding=True).to("cuda")
        # print(inputs)
    
        translated_tokens = model.generate(
            inputs["input_ids"], forced_bos_token_id=tokenizer.convert_tokens_to_ids(target_language), max_length=256
        )
        translated_sentences.extend(tokenizer.batch_decode(translated_tokens, skip_special_tokens=True))

    return translated_sentences

def load_flores(src, tgt):
    """Loads the flores devtest dataset from the flores-200 directory"""
    with open(os.path.join(FLORES_DIR, f"{src}{tgt}", f"test.{src}-{tgt}.json")) as f:
        return json.load(f)

def get_flores_tgt_file(src, tgt):
    return os.path.join(FLORES_DIR, f"{src}{tgt}", f"test.{src}-{tgt}.{tgt}")

In [3]:
BATCH_SIZE=16

MODELS = ["facebook/nllb-200-distilled-600M", "facebook/nllb-200-1.3B", "facebook/nllb-200-3.3B"]
OUTPUT_DIR = "../data/benchmarks/"
METRICS = ["bleu", "chrf"]
CHRF_plus = True
SACREBLEU_TOKENIZER = "flores200"

USE_CACHED = True

LANGUAGE_PAIRS_NLLB_MAP = {"ban": "ban_Latn", "min": "min_Latn", "en": "eng_Latn", "id": "ind_Latn"}
TGT_LANGUAGES = ["ban", "min"]
BASE_LANGUAGES = ["en", "id"]
METRIC_MAPPING = {"BLEU": "bleu", "chrF2++": "chrf"}
DIRECTIONS = get_lang_directions(TGT_LANGUAGES, BASE_LANGUAGES)

In [8]:

dfs = {
    m: pd.DataFrame(columns=['-'.join(d) for d in DIRECTIONS])
    for m in METRICS
}

for m in dfs.keys():
    for model_name in MODELS:
        dfs[m].loc[model_name] = ""

for model_name in MODELS:
    MODEL_RESULTS_DIR = os.path.join(OUTPUT_DIR, model_name.replace("facebook/", ""))
    
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
    model.to("cuda")

    for src, tgt in DIRECTIONS:
        DIRECTION_RESULTS_DIR = os.path.join(MODEL_RESULTS_DIR, f"{src}{tgt}")
        os.makedirs(DIRECTION_RESULTS_DIR, exist_ok=True)

        sentence_output_path = os.path.join(DIRECTION_RESULTS_DIR, f"test-{src}-{tgt}")
        bleu_output_path = sentence_output_path + ".metrics.json"

        if not (os.path.isfile(bleu_output_path) and USE_CACHED):
            # run if there isn't a file
        
            translations = load_flores(src, tgt)
            src_sentences = [t["translation"][src] for t in translations]
            
            tokenizer = AutoTokenizer.from_pretrained(
                model_name, src_lang=LANGUAGE_PAIRS_NLLB_MAP[src]
            )
    
            translated_sentences = translate(model, tokenizer, src_sentences, LANGUAGE_PAIRS_NLLB_MAP[tgt])
    
            # Dump to test-{src}-{tgt}
            
            with open(sentence_output_path, "w") as f:
                f.write('\n'.join(translated_sentences))
    
            # calculate bleu score by running the command
            command = f"sacrebleu -tok {SACREBLEU_TOKENIZER} -w 2 {get_flores_tgt_file(src, tgt)} -m {' '.join(METRICS)}"
            if CHRF_plus:
                command += ' --chrf-word-order 2'
            command += f" < {sentence_output_path} > {bleu_output_path}"

            process = subprocess.run(command, shell=True, check=True, text=True)

        with open(bleu_output_path, 'r') as f:
            metrics = json.load(f)

        for m in metrics:
            dfs[METRIC_MAPPING[m["name"]]].loc[model_name][src+'-'+tgt] = float(m["score"])
    


    del model

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dfs[METRIC_MAPPING[m["name"]]].loc[model_name][src+'-'+tgt] = float(m["score"])


tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.55k [00:00<?, ?B/s]

64it [02:23,  2.25s/it]                        
64it [01:50,  1.73s/it]                        
64it [02:04,  1.95s/it]                        
64it [01:52,  1.76s/it]                        
64it [02:28,  2.31s/it]                        
64it [01:44,  1.63s/it]                        
64it [02:13,  2.08s/it]                        
64it [01:47,  1.68s/it]                        


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [10]:
# FOR NLLB-54 MOE model

NLLB_PREDICTIONS_DIR = "../../nllb-flores/flores_translations/"

for m in dfs.keys():
    dfs[m].loc["nllb-moe-54b"] = ""

for src, tgt in DIRECTIONS:
    DIRECTION_RESULTS_DIR = os.path.join(MODEL_RESULTS_DIR, f"{src}{tgt}")
    os.makedirs(DIRECTION_RESULTS_DIR, exist_ok=True)

    src_path = os.path.join(NLLB_PREDICTIONS_DIR, f"flores200-{LANGUAGE_PAIRS_NLLB_MAP[src]}-{LANGUAGE_PAIRS_NLLB_MAP[tgt]}-devtest.hyp")
    sentence_output_path = os.path.join(DIRECTION_RESULTS_DIR, f"test-{src}-{tgt}")
    bleu_output_path = sentence_output_path + ".metrics.json"

    if not (os.path.isfile(bleu_output_path) and USE_CACHED):
        command = f"sacrebleu -tok {SACREBLEU_TOKENIZER} -w 2 {get_flores_tgt_file(src, tgt)} -m {' '.join(METRICS)}"
        if CHRF_plus:
            command += ' --chrf-word-order 2'
        command += f" < {src_path} > {bleu_output_path}"

        process = subprocess.run(command, shell=True, check=True, text=True)

    with open(bleu_output_path, 'r') as f:
        metrics = json.load(f)

    for m in metrics:
        dfs[METRIC_MAPPING[m["name"]]].loc["nllb-moe-54b"][src+'-'+tgt] = float(m["score"])

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  dfs[METRIC_MAPPING[m["name"]]].loc["nllb-moe-54b"][src+'-'+tgt] = float(m["score"])


In [16]:
for m, df in dfs.items():
    dfs[m].to_csv(os.path.join(OUTPUT_DIR, f"nllb.{m}.csv"))

In [13]:
columns = ["ban-en","en-ban","ban-id","id-ban", "min-en","en-min","min-id","id-min"]

In [14]:
dfs['bleu'][columns]

Unnamed: 0,ban-en,en-ban,ban-id,id-ban,min-en,en-min,min-id,id-min
facebook/nllb-200-distilled-600M,33.96,16.86,30.12,15.15,35.05,19.72,31.92,17.72
facebook/nllb-200-1.3B,37.24,17.73,32.42,16.21,38.59,22.79,34.68,20.89
facebook/nllb-200-3.3B,38.57,17.09,33.35,14.85,40.61,24.71,35.2,22.44
nllb-moe-54b,38.57,17.09,33.35,14.85,40.61,24.71,35.2,22.44


In [15]:
dfs['chrf'][columns]

Unnamed: 0,ban-en,en-ban,ban-id,id-ban,min-en,en-min,min-id,id-min
facebook/nllb-200-distilled-600M,54.4,42.7,53.83,40.71,55.41,46.18,56.06,44.03
facebook/nllb-200-1.3B,57.2,43.21,55.48,41.5,58.16,47.96,57.94,46.39
facebook/nllb-200-3.3B,58.01,42.24,56.34,39.75,59.84,49.27,58.32,47.74
nllb-moe-54b,58.01,42.24,56.34,39.75,59.84,49.27,58.32,47.74


In [9]:
import gzip

with gzip.open("../.cache/preprocess/seed/ban-en.ban.gz", "rt") as src, gzip.open("../.cache/preprocess/seed/ban-en.en.gz", "rt") as tgt:
    src_sentences = src.read().split('\n')
    tgt_sentences = tgt.read().split('\n')

In [34]:
tgt_translated = translate(tgt_sentences, "ind_Latn") # translate the english to balinese

388it [15:55,  2.46s/it]                         


In [35]:
with gzip.open("../.cache/preprocess/seed/ban-id.ban.gz", "wt") as src, gzip.open("../.cache/preprocess/seed/ban-id.id.gz", "wt") as tgt:
    for i in range(len(src_sentences)):
        src.write(src_sentences[i] + '\n')
        tgt.write(tgt_translated[i] + '\n')

In [13]:
src_sentences[:10]

['Lilian Diana Gish (14 Oktober 1893 - 27 Februari 1993) inggih punika aktris, sutradara, miwah penulis skenario Amerika.',
 'Gish inggih punika bintang film sane kasub ring warsa 1912 kanti 1920-an, sane manut khusus kaasosiasinin antuk film-film sutradara D. W. Griffith.',
 'Ia taler ngalaksanayang pagaen televise sane sedeng gede saking awal 1950-an kanti 1980-an, lan muputang gae mplalian nglawan Bette Davis ring film 1987 The Whales of August.',
 'Makudang-kudang generasi kapertama Gish inggih punika menteri Dunkard.',
 'Biang ipunne ngamukakang Majestic Candy Kitchen, lan luh-luh punika ngawantu ngadol popcorn lan permen ka pelanggan Majestic Theater sue, sane matongos ring sampingne.',
 'Lilian sane mayusa pitulas warsa ngalaksanayang pamargin ka Shawnee, Oklahoma, ring dije nyamane muani James, Alfred Grant Grish lan kurenanne, Maude, manongos.',
 'Ajine padem ring Norman, Oklahoma, ring warsa 1912, nanging ia sampun mawali ka Ohio makudang-kudang sasih sadurungne.',
 'Dugas Li

In [12]:
tgt_sentences[:10]

['Lillian Diana Gish (October 14, 1893 – February 27, 1993) was an American actress, director and screenwriter.',
 'Gish was a prominent film star from 1912 into the 1920s, being particularly associated with the films of director D. W. Griffith.',
 'She also did considerable television work from the early 1950s into the 1980s, and closed her career playing opposite Bette Davis in the 1987 film The Whales of August.',
 'The first several generations of Gishes were Dunkard ministers.',
 'Their mother opened the Majestic Candy Kitchen, and the girls helped sell popcorn and candy to patrons of the old Majestic Theater, located next door.',
 "The seventeen-year-old Lillian traveled to Shawnee, Oklahoma, where James's brother Alfred Grant Gish and his wife, Maude, lived.",
 'Her father died in Norman, Oklahoma, in 1912, but she had returned to Ohio a few months before this.',
 'When Lillian and Dorothy were old enough they joined the theatre, often traveling separately in different productio

In [50]:
with open("flores-eval/enban/test.en-ban.en", "r") as f:
    src_sentences = [s for s in f.read().split('\n') if s != '']

In [53]:
with open("openai/enban/nllb200-response.ban", "w") as f:
    f.write('\n'.join(tgt_sentences))