# NLLB-200 Training Data Statistics Estimation

There are no official figures for how much data per language was used to train the NLLB-200 model. This notebook therefore attempts to estimate how the training data is distributed across languages.

Training data consists of the following portions:

- Primary bitext
    - Public data
    - Seed data
- Mined bitext
- Backtranslated bitext

For more information on NLLB-200 training data, see https://github.com/facebookresearch/fairseq/tree/nllb/examples/nllb/data.

In [66]:
import json
import os
import re
from collections import defaultdict
from urllib.request import urlretrieve

import pandas as pd

## Primary Bitext

### Public Data

In [3]:
# Root directory of the public bitext datasets.
PUBL_PATH = "/Users/hirak/nllb_train_data/primary/public_data"

# File names that are skipped when collecting data files.
EXCLUDED_FILES = [
    "cached_lm_test.en",
    "test.fm.prob",
    "get_zero_shot_pairs.py",
    "zeroshotcorpstats",
    "README",
    "train.tsv",
    ".DS_Store",
]

# ISO 639-1 (two-letter) -> ISO 639-3 (three-letter) language codes.
ISO_MAP = {
    "bn": "ben",
    "en": "eng",
    "gu": "guj",
    "hi": "hin",
    "kn": "kan",
    "ml": "mal",
    "mr": "mar",
    "or": "ory",
    "pa": "pan",
    "ta": "tam",
    "te": "tel",
    "ur": "urd",
    "fr": "fra",
}

# MISC_MAP = {"tir_ET": "tir"}

In [10]:
# Collect at most one file per language for every dataset directory under
# PUBL_PATH, so that a language is not counted twice within one dataset.

# Filter instead of list.remove(): remove() raises ValueError when the
# .DS_Store entry does not exist.
datasets_publ = sorted(
    name for name in os.listdir(PUBL_PATH) if name != ".DS_Store"
)

file_paths_publ = []

for dataset in datasets_publ:
    dataset_path = os.path.join(PUBL_PATH, dataset)
    dataset_langs = set()  # languages already collected for this dataset
    for parent_dir, subdirs, dir_files in os.walk(dataset_path):
        # os.walk yields entries in arbitrary order; sorting makes the choice
        # of the "first" file per language reproducible across runs.
        subdirs.sort()
        for file in sorted(dir_files):
            if file in EXCLUDED_FILES:
                continue
            # The language code is the last extension, matching how the
            # sentence-collection cell derives it (path.split(".")[-1]).
            file_lang = os.path.splitext(file)[1].lstrip(".")
            if not file_lang:
                continue  # no extension, so no language code to read
            file_lang = ISO_MAP.get(file_lang, file_lang)
            if file_lang not in dataset_langs:  # Avoid duplicate data
                dataset_langs.add(file_lang)
                file_paths_publ.append(os.path.join(parent_dir, file))

/Users/hirak/nllb_train_data/primary/public_data/aau
/Users/hirak/nllb_train_data/primary/public_data/akuapem
/Users/hirak/nllb_train_data/primary/public_data/bianet
/Users/hirak/nllb_train_data/primary/public_data/cmu_hatian
/Users/hirak/nllb_train_data/primary/public_data/ffr
/Users/hirak/nllb_train_data/primary/public_data/french_ewe
/Users/hirak/nllb_train_data/primary/public_data/french_fongbe
/Users/hirak/nllb_train_data/primary/public_data/giossa
/Users/hirak/nllb_train_data/primary/public_data/hornmt
/Users/hirak/nllb_train_data/primary/public_data/indic_nlp
/Users/hirak/nllb_train_data/primary/public_data/kinya_smt
/Users/hirak/nllb_train_data/primary/public_data/lingala_songs
/Users/hirak/nllb_train_data/primary/public_data/mburisano
/Users/hirak/nllb_train_data/primary/public_data/menyo20k
/Users/hirak/nllb_train_data/primary/public_data/minangnlp
/Users/hirak/nllb_train_data/primary/public_data/mukiibi
/Users/hirak/nllb_train_data/primary/public_data/nynorsk_memories
/Users

In [4]:
# Gather the path of every file under PUBL_PATH whose name is not listed in
# EXCLUDED_FILES.
file_paths_publ = []

for parent_dir, _, dir_file_names in os.walk(PUBL_PATH):
    # os.path.join keeps path building consistent with the rest of the notebook
    # and avoids hand-assembled separators.
    file_paths_publ.extend(
        os.path.join(parent_dir, name)
        for name in dir_file_names
        if name not in EXCLUDED_FILES
    )

In [5]:
# Map each language code to the set of distinct lines found in its files.
lang_sents_publ = defaultdict(set)

for path in file_paths_publ:
    # The language code is the text after the last dot in the path;
    # two-letter codes are normalised through ISO_MAP.
    lang = path.split(".")[-1]
    lang = ISO_MAP.get(lang, lang)
    # Explicit UTF-8: the default encoding of open() depends on the locale.
    with open(path, encoding="utf-8") as file:
        # Stream the file instead of materialising it with readlines(), and
        # drop the line terminator so that a final line without a trailing
        # newline still deduplicates against the same sentence elsewhere.
        lang_sents_publ[lang].update(line.rstrip("\n") for line in file)

In [None]:
# Number of distinct sentences per language. The values of lang_sents_publ are
# already sets, so their length is the distinct count; no extra copy is needed.
lang_sizes_publ = {lang: len(sents) for lang, sents in lang_sents_publ.items()}

In [24]:
# Tabulate the per-language sentence counts, ordered by language code.
df = (
    pd.DataFrame(lang_sizes_publ.items(), columns=["lang", "num_sents_publ"])
    .sort_values("lang")
)

In [25]:
# Save the table to lang_train_size.csv without the index column.
df.to_csv("lang_train_size.csv", index=False)

### Seed

In [2]:
# Load the per-language table from lang_train_size.csv.
df = pd.read_csv("lang_train_size.csv")

In [3]:
# Root directory that is walked to find the seed data files.
SEED_PATH = "/Users/hirak/nllb_train_data/primary/seed"

In [4]:
# Gather the path of every data file under SEED_PATH.
file_paths_seed = []

for parent_dir, _, dir_file_names in os.walk(SEED_PATH):
    file_paths_seed.extend(
        os.path.join(parent_dir, name)
        for name in dir_file_names
        # Skip hidden files such as .DS_Store: every collected file name is
        # later used as a language code and read as text.
        if not name.startswith(".")
    )

In [7]:
# Count the lines of every seed file, keyed by the file name (the last path
# component), which serves as the language code.
lang_sizes_seed = defaultdict(int)

for path in file_paths_seed:
    lang = path.split("/")[-1]
    # Explicit UTF-8: the default encoding of open() depends on the locale.
    with open(path, encoding="utf-8") as file:
        # Count lines while streaming rather than loading the whole file.
        lang_sizes_seed[lang] += sum(1 for _line in file)

# eng_Latn is set by hand rather than counted.
# NOTE(review): 6193 is presumably the seed set's sentence count — confirm.
lang_sizes_seed["eng_Latn"] = 6193

In [9]:
# Overwrite the count of every seed language with the same fixed value.
lang_sizes_seed.update(dict.fromkeys(lang_sizes_seed, 6193))

In [10]:
# Display the per-language seed counts.
lang_sizes_seed

defaultdict(int,
            {'eng_Latn': 6193,
             'tzm_Tfng': 6193,
             'fur_Latn': 6193,
             'ltg_Latn': 6193,
             'mag_Deva': 6193,
             'lij_Latn': 6193,
             'pbt_Arab': 6193,
             'knc_Latn': 6193,
             'taq_Latn': 6193,
             'srd_Latn': 6193,
             'lim_Latn': 6193,
             'dzo_Tibt': 6193,
             'bho_Deva': 6193,
             'mri_Latn': 6193,
             'kas_Deva': 6193,
             'bug_Latn': 6193,
             'bjn_Arab': 6193,
             'vec_Latn': 6193,
             'dik_Latn': 6193,
             'ace_Arab': 6193,
             'grn_Latn': 6193,
             'nus_Latn': 6193,
             'szl_Latn': 6193,
             'ary_Arab': 6193,
             'ace_Latn': 6193,
             'scn_Latn': 6193,
             'bjn_Latn': 6193,
             'arz_Arab': 6193,
             'kas_Arab': 6193,
             'hne_Deva': 6193,
             'mni_Beng': 6193,
             'knc_Arab

In [11]:
# Add the seed sentence count for each language; languages without seed data get 0.
# Look the value up with .get() instead of passing the defaultdict to map():
# mapping through a defaultdict uses its default factory, which inserts every
# unmatched language code into lang_sizes_seed as a side effect.
# NOTE(review): the seed keys look like "eng_Latn"; confirm the codes in
# df["lang"] use the same format, otherwise nothing will match.
df["num_sents_seed"] = df["lang"].map(lambda lang: lang_sizes_seed.get(lang, 0))

In [13]:
# Write the updated table back to lang_train_size.csv without the index column.
df.to_csv("lang_train_size.csv", index=False)

### GERL

In [14]:
# Location of the GERL JSON file.
GERL_PATH = "/Users/hirak/nllb_train_data/primary/gerl.json"

# Use a context manager so the file handle is closed after parsing, and read
# the JSON as UTF-8 rather than the locale-dependent default.
with open(GERL_PATH, encoding="utf-8") as gerl_file:
    gerl = json.load(gerl_file)

In [26]:
# Take the "data" entry of the third top-level element of the loaded JSON.
# NOTE(review): index 2 is hard-coded — presumably the element that holds the
# sentence pairs; confirm against the file's structure.
data = gerl[2]["data"]

In [31]:
# Count the pairs whose "ee_sentence" and "eng_sentence" fields are both truthy.
num_sents_gerl = sum(
    1 for pair in data if pair["ee_sentence"] and pair["eng_sentence"]
)

In [32]:
# Display the number of usable GERL sentence pairs.
num_sents_gerl

540

## Mined Bitext

### Retrieving Metadata URLs

In [50]:
# README of the mined data, which lists the metadata download URLs.
README_PATH = "/Users/hirak/nllb_train_data/mined/README.md"

# Metadata URL for one language pair, e.g. .../ace_Latn-ban_Latn.meta.v1.xz.
# The dots are escaped so they match a literal "." rather than any character.
URL_PATTERN = r"https://dl\.fbaipublicfiles\.com/nllb/data/\w{3}_\w{4}-\w{3}_\w{4}\.meta\.v1\.xz"

In [51]:
# Read the README and pull out every metadata URL that matches URL_PATTERN.
# Explicit UTF-8: the default encoding of open() depends on the locale.
with open(README_PATH, encoding="utf-8") as f:
    readme = f.read()

urls = re.findall(URL_PATTERN, readme)

In [53]:
# Persist the extracted URLs, one per line, to metadata_urls.txt.
with open("metadata_urls.txt", "w") as url_file:
    url_file.writelines(url + "\n" for url in urls)

### Downloading Metadata from URLs

In [62]:
# Load the metadata URLs from metadata_urls.txt, one per line.
with open("metadata_urls.txt") as f:
    # Skip blank lines so that no empty string ends up among the URLs.
    urls = [line.strip() for line in f if line.strip()]

In [68]:
# Trial run: download only the first three metadata files into the current
# working directory, naming each file after the last component of its URL.
test_urls = urls[:3]

for url in test_urls:
    filename = url.rsplit("/", 1)[-1]
    urlretrieve(url, filename=filename)

### Unpacking Metadata

In [70]:
! unxz code/ace_Latn-ban_Latn.meta.v1.xz

unxz: code/ace_Latn-ban_Latn.meta.v1.xz: No such file or directory


In [34]:
# Load the unpacked metadata file as a space-delimited table.
# NOTE(review): this reads from the parent directory, while the download cell
# saves files into the current working directory — confirm the intended path.
df_mined = pd.read_csv("../ace_Latn-ban_Latn.meta.v1", delimiter=" ")