In [69]:
from pathlib import Path

# Maps each language (the stem of its .txt file) to the list of names read
# from that file, one name per line.
all_names = {}

# Path.glob yields entries in arbitrary, filesystem-dependent order. Sorting by
# stem makes the language order in `all_names` reproducible across machines
# and across Restart & Run All.
for lang_file in sorted(Path("names/langs/").glob("*.txt"), key=lambda p: p.stem):
    lang = lang_file.stem
    # read_text opens, decodes and closes the file in one call.
    lang_names = lang_file.read_text(encoding="utf-8").splitlines()
    all_names[lang] = lang_names

In [70]:
# Strip accents from the Unicode names and reduce them to plain ASCII characters.
import string
import unicodedata

# Characters allowed to survive the ASCII conversion: a-z, A-Z and a few
# punctuation marks.
all_letters = string.ascii_letters + " .,;'"

def unicode_to_ascii(name):
    """Return `name` with accents stripped and disallowed characters removed.

    The string is NFD-normalized so that accented characters are decomposed
    into a base character followed by combining marks. Combining marks
    (Unicode category "Mn") are dropped, and so is every character that is
    not in `all_letters`.
    """
    decomposed = unicodedata.normalize("NFD", name)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == "Mn":
            continue  # combining accent mark
        if ch not in all_letters:
            continue  # outside the allowed alphabet
        kept.append(ch)
    return "".join(kept)

In [71]:
# Pass every name of every language through unicode_to_ascii, keeping the same
# language keys; `all_names` is rebound to the converted mapping.
all_names = {
    language: list(map(unicode_to_ascii, names))
    for language, names in all_names.items()
}

In [77]:
# Peek at the first five entries of the English name list.
english_preview = all_names["English"][:5]
print(english_preview)

['Abbas', 'Abbey', 'Abbott', 'Abdi', 'Abel']


In [73]:
# The vocabulary is the set of distinct characters appearing in any name of
# any language.
vocab = {
    ch
    for names in all_names.values()
    for name in names
    for ch in name
}
vocab_size = len(vocab)
print(f"vocabulary size: {vocab_size}")

vocabulary size: 55


In [85]:
# Number the vocabulary characters in sorted order. `itoc` maps index ->
# character and `ctoi` is its inverse (character -> index).
itoc = dict(enumerate(sorted(vocab)))
ctoi = {c: i for i, c in itoc.items()}

In [88]:
import torch

In [95]:
def name1hot(name):
    """One-hot encode `name`, one character per row.

    Returns a tensor of shape (len(name), 1, vocab_size). The middle
    dimension is a batch dimension of size 1, kept because PyTorch expects
    batched input. For each character, the slot at index ctoi[char] is set
    to 1.0 and all other slots stay 0.

    Reads the module-level `ctoi` and `vocab_size`. A character missing from
    `ctoi` raises KeyError.
    """
    encoded = torch.zeros(len(name), 1, vocab_size)  # (len_name, 1, vocab_size)
    for pos, ch in enumerate(name):
        encoded[pos, 0, ctoi[ch]] = 1.0
    return encoded

In [97]:
# Check the encoded shape of a sample name.
name1hot("Jones").size()

torch.Size([5, 1, 55])