In [1]:
import torch

# Check if CUDA is available
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

# Will still need to manually do `.to(device)` on nn.Module objects and some datasets
torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Using device = cuda:0


In [2]:
import string
import unicodedata

# We can use "_" to represent an out-of-vocabulary character, that is, any character we are not handling in our model
allowed_characters = string.ascii_letters + " .,;'" + "_"
n_letters = len(allowed_characters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in allowed_characters
    )

In [3]:
# Index of a character in all characters list
def letter_to_index(letter):
    if letter not in allowed_characters:
        return allowed_characters.find('_')
    else:
        return allowed_characters.find(letter)

# Character-level Encoding
# One-hotting each letter
def line_to_Tensor(line):
    tensor = torch.zeros(len(line), 1, n_letters)   # The 1 is because PyTorch expects datasets to be in batches, so each letter here is one batch
    for idx, letter in enumerate(line):
        tensor[idx][0][letter_to_index(letter)]=1
    return tensor

In [9]:
line = 'Work-life balance is for the average.'
line_tensor = line_to_Tensor(line)
line_tensor.sum()

tensor(37., device='cuda:0')

In [18]:
DATA_DIR = 'data/names'

In [27]:
import torch
from torch.utils.data import Dataset
import time
import os

class NameDataset(Dataset):
    def __init__(self, data_dir):
        super().__init__()
        self.data_dir = data_dir
        self.load_time = time.localtime     # For profiling and logs
        
        self.data = []  # All names
        self.data_tensors = []  # Tensor representations of all names (in order)
        self.labels = []    # All corresponding labels
        self.label_tensors = []     # Tensor representations of labels
        self.label_to_index = {}  # Unique labels   # Previous implementation was set(), which runs O(n) instead of the current O(1)

        all_files = os.listdir(data_dir)  # paths of the label folder
        for label_idx, filename in enumerate(all_files):
            current_label = os.path.splitext(filename)[0]
            self.label_to_index[current_label] = label_idx
            filepath = os.path.join(data_dir, filename)
            with open(filepath, 'r', encoding='utf-8') as f:
                lines = f.read().strip().split('\n')
            for name in lines:
                self.data.append(name)
                self.labels.append(current_label)
                self.data_tensors.append(line_to_Tensor(name))
                self.label_tensors.append(torch.tensor([label_idx], dtype=torch.long))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        data_tensor = self.data_tensors[idx]
        data_item = self.data[idx]
        label_tensor = self.label_tensors[idx]
        label_name = self.labels[idx]

        return data_tensor, label_tensor, data_item, label_name

In [28]:
name_dataset = NameDataset(DATA_DIR)
name_dataset[0]

(tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0.]],
 
         [[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0.]],
 
         [[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
           0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
  

In [31]:
torch.argmax(line_to_Tensor('A'))

tensor(26, device='cuda:0')

In [None]:
# Split into training and validation sets