In [3]:
import os
# Directory the sample text files are read from.
data_dir = 'data'

# Reserved ids of the special tokens; every Vocab maps them identically.
PAD = 0  # "<pad>": filler appended by encode() up to max_len
BOS = 1  # "<bos>": prepended to every encoded sequence
EOS = 2  # "<eos>": appended to every encoded sequence
UNK = 3  # "<unk>": stands in for tokens missing from the vocabulary


class Vocab:
	"""Whitespace-token vocabulary mapping tokens to integer ids and back.

	Ids 0-3 are reserved for the special tokens (PAD, BOS, EOS, UNK); regular
	tokens receive consecutive ids after them, assigned in sorted token order.

	Attributes:
		freq: dict token -> number of occurrences seen by build().
		stoi: dict token -> id.
		itos: dict id -> token.
	"""

	def __init__(self, tokens=None, min_freq=1):
		"""Create the vocabulary, optionally building it right away.

		Args:
			tokens: iterable of whitespace-separable strings (or one string),
				or None to start with only the special tokens.
			min_freq: minimum occurrence count for a token to receive an id.
		"""
		self.freq = {}
		# special tokens
		self.stoi = {"<pad>": PAD, "<bos>": BOS, "<eos>": EOS, "<unk>": UNK}
		self.itos = {v: k for k, v in self.stoi.items()}
		# `is not None` rather than truthiness: an empty list is a valid
		# (no-op) input, and array-like inputs have no unambiguous truth value.
		if tokens is not None:
			self.build(tokens, min_freq)

	def __len__(self):
		"""Return the number of entries, special tokens included."""
		return len(self.stoi)

	def build(self, token_seqs, min_freq=1):
		"""Count tokens in `token_seqs` and assign ids to the frequent ones.

		Counts accumulate in `self.freq` across calls; tokens that already
		have an id keep it.

		Args:
			token_seqs: iterable of strings, each split on whitespace. A single
				string is treated as one sequence.
			min_freq: tokens seen fewer than this many times get no id.
		"""
		if isinstance(token_seqs, str):
			# Iterating a bare string would yield single characters and
			# silently build a vocabulary of letters.
			token_seqs = [token_seqs]
		for seq in token_seqs:
			for tok in seq.split():
				self.freq[tok] = self.freq.get(tok, 0) + 1
		idx = max(self.itos) + 1
		# Sorted order keeps id assignment deterministic for the same counts.
		for tok, count in sorted(self.freq.items()):
			if count >= min_freq and tok not in self.stoi:
				self.stoi[tok] = idx
				self.itos[idx] = tok
				idx += 1

	def encode(self, text, max_len=None):
		"""Convert `text` to ids framed by BOS and EOS.

		Args:
			text: string, split on whitespace; unknown tokens map to UNK.
			max_len: if given, the result has exactly this length: shorter
				sequences are right-padded with PAD, longer ones have their
				content truncated so that the closing EOS is kept.

		Returns:
			List of int ids.

		Raises:
			ValueError: if `max_len` is negative.
		"""
		ids = [self.stoi.get(tok, UNK) for tok in text.split()]
		if max_len is None:
			return [BOS] + ids + [EOS]
		if max_len < 0:
			raise ValueError("max_len must be non-negative, got %r" % (max_len,))
		if max_len >= 2:
			# Truncate the content, not the framed sequence: slicing after
			# adding EOS would cut the end marker off every long sentence.
			ids = [BOS] + ids[:max_len - 2] + [EOS]
		else:
			# No room for both markers; keep whatever prefix fits.
			ids = ([BOS] + ids + [EOS])[:max_len]
		return ids + [PAD] * (max_len - len(ids))

	def decode(self, ids, skip_special=False):
		"""Convert ids back to a space-joined token string.

		Args:
			ids: iterable of ids; ids without an entry decode to "<unk>".
			skip_special: if True, PAD, BOS and EOS ids are omitted from the
				output. Defaults to False, which renders every id.

		Returns:
			The decoded string.
		"""
		if skip_special:
			ids = [i for i in ids if i not in (PAD, BOS, EOS)]
		return " ".join(self.itos.get(i, "<unk>") for i in ids)

In [4]:
src_file = os.path.join(data_dir, 'sample.en')
tgt_file = os.path.join(data_dir, 'sample.de')


def _read_lines(path):
	"""Return the lines of the UTF-8 text file at `path`.

	Leading and trailing whitespace of the whole file is stripped before
	splitting, so a trailing newline does not produce an empty last line.
	"""
	# `with` closes the handle deterministically; a bare open(...).read()
	# leaves closing to the garbage collector.
	with open(path, 'r', encoding='utf-8') as fh:
		return fh.read().strip().splitlines()


src_lines = _read_lines(src_file)
tgt_lines = _read_lines(tgt_file)
# NOTE(review): presumably line i of src_lines pairs with line i of tgt_lines;
# nothing here checks that the two files have the same number of lines.

# build vocabs
src_vocab = Vocab(src_lines)
tgt_vocab = Vocab(tgt_lines)

In [5]:
# Show the lines that were read from src_file (sample.en).
src_lines

['I am a student',
 'You are a teacher',
 'This is a small dataset',
 'Hello world',
 'Machine learning is fun',
 'I like apples',
 'She loves reading books',
 'He plays football',
 'We study together',
 'The weather is nice',
 'This is a test',
 'Transformers are powerful',
 'Sequence to sequence models',
 'I read a book',
 'Open the door',
 'Close the window',
 'Cows are in the field',
 'Birds can fly',
 'I write code',
 'You write code too']