In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Run on the GPU when PyTorch reports a usable CUDA device; otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
# Report which device was selected (cuda when available, otherwise cpu).
print(f"Using device: {device}")

In [None]:
from pathlib import Path

# Load the whole training corpus into memory as a single string.
# The encoding is pinned to UTF-8 so the decoded characters (and therefore the
# vocabulary built from them) do not depend on the platform's locale default.
text = Path('tiny-shakespeare.txt').read_text(encoding='utf-8')

In [None]:
# Sanity check: show the first 1000 characters of the corpus.
print(text[:1000])

In [None]:
class CharTokenizer:
  """Character-level tokenizer: every distinct character is one token.

  A character's token id is its position in the ``vocabulary`` sequence
  handed to the constructor.
  """

  def __init__(self, vocabulary):
    """Build the char -> id and id -> char lookup tables.

    Args:
      vocabulary: Iterable of characters. The position of each character
        becomes its token id.
    """
    # Materialize once: the vocabulary is enumerated twice below, and a
    # one-shot iterator (e.g. a generator) would be exhausted after the first
    # pass, leaving the reverse table empty.
    vocabulary = list(vocabulary)
    self.token_id_for_char = {char: token_id for token_id, char in enumerate(vocabulary)}
    self.char_for_token_id = {token_id: char for token_id, char in enumerate(vocabulary)}

  @staticmethod
  def train_from_text(text):
    """Create a tokenizer whose vocabulary is the sorted set of characters in ``text``.

    Args:
      text: String to collect the vocabulary from.

    Returns:
      A ``CharTokenizer`` with token ids assigned in sorted character order.
    """
    return CharTokenizer(sorted(set(text)))

  def encode(self, text):
    """Convert a string into a 1-D ``torch.long`` tensor of token ids.

    Args:
      text: String to encode.

    Returns:
      Tensor holding one token id per character of ``text``.

    Raises:
      KeyError: If ``text`` contains a character that is not in the vocabulary.
    """
    token_ids = [self.token_id_for_char[char] for char in text]
    return torch.tensor(token_ids, dtype=torch.long)

  def decode(self, token_ids):
    """Convert token ids back into a string.

    Args:
      token_ids: A 1-D tensor/array of token ids (anything exposing
        ``.tolist()``), a plain iterable of ints, or a single id given as an
        int or 0-d tensor.

    Returns:
      The string formed by concatenating the character for each token id.

    Raises:
      KeyError: If a token id is not in the vocabulary.
    """
    if hasattr(token_ids, 'tolist'):
      token_ids = token_ids.tolist()
    if isinstance(token_ids, int):
      # A 0-d tensor's tolist() yields a bare int; treat it as a single token.
      token_ids = [token_ids]
    return ''.join(self.char_for_token_id[token_id] for token_id in token_ids)

  def vocabulary_size(self):
    """Return the number of distinct characters in the vocabulary."""
    return len(self.token_id_for_char)

In [None]:
# Build the tokenizer from the corpus: the vocabulary is the sorted set of
# distinct characters in `text`, so token ids follow sorted character order.
tokenizer = CharTokenizer.train_from_text(text)

In [None]:
# Encode a sample string; the result is a 1-D torch.long tensor of token ids.
# Raises KeyError if a character does not occur in the training text.
print(tokenizer.encode("Hello world"))

In [None]:
# Round-trip check: decoding the encoded ids should print the original string.
print(tokenizer.decode(tokenizer.encode("Hello world")))

In [None]:
# Number of distinct characters the tokenizer knows (shown as the cell output).
tokenizer.vocabulary_size()

In [None]:
import pprint
# Shared pretty-printer for inspecting the lookup tables; depth=4 caps how many
# nesting levels are expanded before deeper levels are elided.
pp = pprint.PrettyPrinter(depth=4)

In [None]:
# Show the token id -> character lookup table.
pp.pprint(tokenizer.char_for_token_id)

In [None]:
# Show the character -> token id lookup table.
pp.pprint(tokenizer.token_id_for_char)