## **Entrenamiento del tokenizador basado en sub-palabras**

### Librerías a usar

In [1]:
import sentencepiece as spm
import os

### Creamos archivo combinado: spanish + aymara --> combined.txt

In [2]:
# Cleaned Spanish sources and the merged Spanish corpus they are written to.
bible_spanish_file_clean = "data/clean/bible/spanish.txt"
book_spanish_file_clean = "data/clean/book/spanish.txt"
combined_file = "data/clean/spanish.txt"

# Concatenate the sources in a fixed order (bible first, then book).
with open(combined_file, "w", encoding="utf-8") as out:
    for source_path in (bible_spanish_file_clean, book_spanish_file_clean):
        with open(source_path, "r", encoding="utf-8") as source:
            # Stream line by line instead of read(): memory stays flat and every
            # line is guaranteed to end with a newline, so the last line of one
            # source is never glued to the first line of the next one (which
            # would shift all following lines of this parallel corpus by one).
            for line in source:
                out.write(line if line.endswith("\n") else line + "\n")

print(f"Archivo {combined_file} guardado exitosamente ...")

Archivo data/clean/spanish.txt guardado exitosamente ...


In [3]:
# Cleaned Aymara sources and the merged Aymara corpus they are written to.
bible_aymara_file_clean = "data/clean/bible/aymara.txt"
book_aymara_file_clean = "data/clean/book/aymara.txt"
combined_file = "data/clean/aymara.txt"

# Concatenate the sources in a fixed order (bible first, then book).
with open(combined_file, "w", encoding="utf-8") as out:
    for source_path in (bible_aymara_file_clean, book_aymara_file_clean):
        with open(source_path, "r", encoding="utf-8") as source:
            # Stream line by line instead of read(): memory stays flat and every
            # line is guaranteed to end with a newline, so the last line of one
            # source is never glued to the first line of the next one (which
            # would shift all following lines of this parallel corpus by one).
            for line in source:
                out.write(line if line.endswith("\n") else line + "\n")

print(f"Archivo {combined_file} guardado exitosamente ...")

Archivo data/clean/aymara.txt guardado exitosamente ...


In [4]:
# Per-language corpora and the mixed file that holds both of them.
spanish_file_clean = "data/clean/spanish.txt"
aymara_file_clean = "data/clean/aymara.txt"
combined_file = "data/clean/combined.txt"

# Write all Spanish lines followed by all Aymara lines into a single file.
with open(combined_file, "w", encoding="utf-8") as out:
    for source_path in (spanish_file_clean, aymara_file_clean):
        with open(source_path, "r", encoding="utf-8") as source:
            # Stream line by line instead of read(): memory stays flat and a
            # missing final newline in the Spanish file can no longer fuse its
            # last sentence with the first Aymara sentence.
            for line in source:
                out.write(line if line.endswith("\n") else line + "\n")

print(f"Archivo {combined_file} guardado exitosamente ...")

Archivo data/clean/combined.txt guardado exitosamente ...


### Entrenamiento del tokenizador SentencePiece (BPE)

In [5]:
# Create the tokenizer output folder; nothing happens if it already exists.
os.makedirs(name="tokenizer", exist_ok=True)

In [6]:
# Path of the merged Spanish + Aymara corpus, passed to the tokenizer trainer as its input.
combined_file = "data/clean/combined.txt"

In [7]:
# Training options for the sub-word tokenizer, collected in one place.
trainer_options = dict(
    input=combined_file,
    model_prefix="tokenizer/SentencePiece",
    vocab_size=16000,
    model_type="bpe",
    character_coverage=1.0,  # 100% of the characters in the text must be recognized by the tokenizer
    shuffle_input_sentence=True,
    normalization_rule_name="nmt_nfkc",
    # Ids requested for the special tokens.
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3,
)

spm.SentencePieceTrainer.Train(**trainer_options)

## **Verificación del tokenizador SentencePiece entrenado**

### Librerías a usar

In [8]:
import sentencepiece as spm

### Tokens especiales

In [9]:
# Create an empty processor, then load the trained tokenizer into it.
# load() stays as the last expression so the cell displays its return value.
sp = spm.SentencePieceProcessor()
sp.load(model_file="tokenizer/SentencePiece.model")

True

In [10]:
# Print the id that the loaded model reports for each special token.
for special_token in ("pad_id", "bos_id", "eos_id", "unk_id"):
    print(f"{special_token} --> {getattr(sp, special_token)()}")

pad_id --> 0
bos_id --> 1
eos_id --> 2
unk_id --> 3


### Probando el tokenizador

In [11]:
# Sample sentence encoded twice: once as integer ids, once as sub-word pieces.
text = "¡ay de jerusalén, ciudad rebelde, contaminada y opresora!"
for label, piece_type in (("Tokens (ids):", int), ("Tokens (Subwords):", str)):
    print(label, sp.encode(text, out_type=piece_type))

Tokens (ids): [231, 46, 25, 571, 15958, 551, 7931, 15958, 5794, 10296, 29, 7316, 703, 15981]
Tokens (Subwords): ['▁¡', 'ay', '▁de', '▁jerusalén', ',', '▁ciudad', '▁rebelde', ',', '▁contam', 'inada', '▁y', '▁opres', 'ora', '!']


In [12]:
# BOS (1), EOS (2) and PAD (0) tokens are ignored when decoding
sp.decode([1, *sp.encode(text, out_type=int), 2, 0, 0])

'¡ay de jerusalén, ciudad rebelde, contaminada y opresora!'

## **Tokenización del corpus**

### Librerías a utilizar

In [13]:
import sentencepiece as spm
import os
import numpy as np

### Cargamos tokenizador entrenado

In [15]:
# Build the processor directly from the trained model file.
sp = spm.SentencePieceProcessor(model_file="tokenizer/SentencePiece.model")
print("Tokenizador cargado exitosamente...")

Tokenizador cargado exitosamente...


### Tokenización de archivos: spanish.txt y aymara.txt

In [16]:
def tokenize_file(input_path, output_path, processor):
    """Encode a text file line by line and save the token ids as text.

    Every input line is stripped, encoded with ``processor`` and wrapped as
    ``BOS + ids + EOS``. Exactly one output line (space-separated integers) is
    written per input line, so the ids file stays line-aligned with its source;
    a blank input line becomes a line holding only the BOS and EOS ids.

    Args:
        input_path: Path of the UTF-8 text file to tokenize.
        output_path: Path of the ids file to write (overwritten if it exists).
        processor: Loaded SentencePiece processor used for encoding.
    """
    # The special-token ids do not change between lines; look them up once.
    bos, eos = processor.bos_id(), processor.eos_id()

    with open(input_path, "r", encoding="utf-8") as f_input, \
         open(output_path, "w", encoding="utf-8") as f_output:
        for line in f_input:
            ids = [bos, *processor.encode(line.strip(), out_type=int), eos]
            f_output.write(" ".join(map(str, ids)) + "\n")  # save the tokenized line


input_spanish = "data/clean/spanish.txt"
input_aymara = "data/clean/aymara.txt"

folder = "data/tokenized"
os.makedirs(folder, exist_ok=True)

output_spanish = f"{folder}/spanish.ids"
output_aymara = f"{folder}/aymara.ids"

# Tokenize spanish.txt and aymara.txt with the same tokenizer and the same routine.
for input_path, output_path in ((input_spanish, output_spanish), (input_aymara, output_aymara)):
    tokenize_file(input_path, output_path, sp)

## **Análisis de las longitudes de las secuencias**

In [17]:
spanish_tokenized = "data/tokenized/spanish.ids"
aymara_tokenized = "data/tokenized/aymara.ids"

# For every (spanish, aymara) pair keep the token count of its longer side.
# Lines are read in lockstep, so line i of one file is paired with line i of the other.
with open(spanish_tokenized, "r", encoding="utf-8") as f_spanish, open(aymara_tokenized, "r", encoding="utf-8") as f_aymara:
    pair_lengths = [
        max(len(spanish_line.split()), len(aymara_line.split()))
        for spanish_line, aymara_line in zip(f_spanish, f_aymara)
    ]


In [18]:
pair_lengths = np.array(pair_lengths)

# Compute all requested percentiles in a single call, then report them one per line.
percentiles = [90, 95, 98, 99, 100]
for perc, value in zip(percentiles, np.percentile(pair_lengths, percentiles)):
    print(f"percentile-{perc}%: {value:.0f} tokens")

percentile-90%: 51 tokens
percentile-95%: 58 tokens
percentile-98%: 66 tokens
percentile-99%: 72 tokens
percentile-100%: 153 tokens


### Resultados de las longitudes
* percentile-90%: 51 tokens
* percentile-95%: 58 tokens
* percentile-98%: 66 tokens
* percentile-99%: 72 tokens
* percentile-100%: 153 tokens

Establecemos una longitud máxima de 80 tokens, ya que este valor cubre más del 99 % de los pares del corpus y es computacionalmente eficiente para el entrenamiento del Transformer.

## **Creamos los splits: train / valid / test**

### Librerías a usar

In [19]:
import random
import os

### Cálculo de los splits

In [21]:
SEED = 36
random.seed(SEED)  # fixed seed so the shuffle below, and therefore the splits, are reproducible

MAX_LEN = 80 # Maximum length of each side of a (spanish, aymara) pair --> covers more than 99% of the corpus pairs

spanish_tokenized = "data/tokenized/spanish.ids"
aymara_tokenized  = "data/tokenized/aymara.ids"
spanish_clean     = "data/clean/spanish.txt"
aymara_clean      = "data/clean/aymara.txt"

pairs = []  # (es_ids, ay_ids, es_txt, ay_txt)

# The four files are read in lockstep: line i of each one belongs to the same pair.
with open(spanish_tokenized, encoding="utf-8") as f_es_ids, \
     open(aymara_tokenized, encoding="utf-8") as f_ay_ids, \
     open(spanish_clean, encoding="utf-8") as f_es_txt, \
     open(aymara_clean, encoding="utf-8") as f_ay_txt:

    for es_ids, ay_ids, es_txt, ay_txt in zip(f_es_ids, f_ay_ids, f_es_txt, f_ay_txt):
        es_ids = es_ids.strip()
        ay_ids = ay_ids.strip()
        es_txt = es_txt.strip()
        ay_txt = ay_txt.strip()

        # Skip pairs with an empty side. The text has to be checked as well:
        # the ids files are written with BOS/EOS around every line, so an ids
        # line is never empty even when its sentence is, and checking only the
        # ids would let empty sentences through.
        if not (es_ids and ay_ids and es_txt and ay_txt):
            continue

        # Keep the pair only when both sides fit in MAX_LEN tokens.
        if len(es_ids.split()) <= MAX_LEN and len(ay_ids.split()) <= MAX_LEN:
            pairs.append((es_ids, ay_ids, es_txt, ay_txt))

print(f"Pares tras filtrado por longitud máxima de tokens --> {len(pairs)}")

random.shuffle(pairs)  # in-place shuffle; later cells see `pairs` in this shuffled order

# 90% train / 5% valid / 5% test, cut from the shuffled list.
n = len(pairs)
train_end = int(0.90 * n)
valid_end = int(0.95 * n)

train = pairs[:train_end]
valid = pairs[train_end:valid_end]
test  = pairs[valid_end:]

Pares tras filtrado por longitud máxima de tokens --> 163733


### Dataset para Zenodo

In [None]:
output_folder_zenodo = "data/zenodo"
os.makedirs(output_folder_zenodo, exist_ok=True) # create the folder if it does not exist

# Export only the text fields of every pair in `pairs`, one sentence per line;
# both files follow the same pair order, so they stay line-aligned.
with open(f"{output_folder_zenodo}/spanish.txt", "w", encoding="utf-8") as f_es_txt, \
     open(f"{output_folder_zenodo}/aymara.txt",  "w", encoding="utf-8") as f_ay_txt:

    f_es_txt.writelines(pair[2] + "\n" for pair in pairs)
    f_ay_txt.writelines(pair[3] + "\n" for pair in pairs)

### Guardamos los splits: train / valid / test

In [23]:
# Output folders for the two views of every split: token ids and raw text.
output_folder_ids = "data/splits/ids"
output_folder_texts = "data/splits/texts"

for split_folder in (output_folder_ids, output_folder_texts):
    os.makedirs(split_folder, exist_ok=True) # create the folder if it does not exist

In [24]:
# training set
with open(f"{output_folder_ids}/train.spanish", "w", encoding="utf-8") as f_es_ids, \
     open(f"{output_folder_ids}/train.aymara",  "w", encoding="utf-8") as f_ay_ids, \
     open(f"{output_folder_texts}/train.spanish", "w", encoding="utf-8") as f_es_txt, \
     open(f"{output_folder_texts}/train.aymara",  "w", encoding="utf-8") as f_ay_txt:

    for es_ids, ay_ids, es_txt, ay_txt in train:
        f_es_ids.write(es_ids + "\n")
        f_ay_ids.write(ay_ids + "\n")
        f_es_txt.write(es_txt + "\n")
        f_ay_txt.write(ay_txt + "\n")


In [25]:
# validation set
with open(f"{output_folder_ids}/valid.spanish", "w", encoding="utf-8") as f_es_ids, \
     open(f"{output_folder_ids}/valid.aymara",  "w", encoding="utf-8") as f_ay_ids, \
     open(f"{output_folder_texts}/valid.spanish", "w", encoding="utf-8") as f_es_txt, \
     open(f"{output_folder_texts}/valid.aymara",  "w", encoding="utf-8") as f_ay_txt:

    for es_ids, ay_ids, es_txt, ay_txt in valid:
        f_es_ids.write(es_ids + "\n")
        f_ay_ids.write(ay_ids + "\n")
        f_es_txt.write(es_txt + "\n")
        f_ay_txt.write(ay_txt + "\n")

In [26]:
# test set
with open(f"{output_folder_ids}/test.spanish", "w", encoding="utf-8") as f_es_ids, \
     open(f"{output_folder_ids}/test.aymara",  "w", encoding="utf-8") as f_ay_ids, \
     open(f"{output_folder_texts}/test.spanish", "w", encoding="utf-8") as f_es_txt, \
     open(f"{output_folder_texts}/test.aymara",  "w", encoding="utf-8") as f_ay_txt:

    for es_ids, ay_ids, es_txt, ay_txt in test:
        f_es_ids.write(es_ids + "\n")
        f_ay_ids.write(ay_ids + "\n")
        f_es_txt.write(es_txt + "\n")
        f_ay_txt.write(ay_txt + "\n")