In [3]:
import datasets
import collections
import tqdm

In [2]:
# Dataset coordinates handed to datasets.load_dataset below.
dataset_group, dataset_name, dataset_split = 'wikitext', 'wikitext-103-raw-v1', 'train'
# Slash-separated identifier "<group>/<name>/<split>".
dataset_full_name = f'{dataset_group}/{dataset_name}/{dataset_split}'

dataset = datasets.load_dataset(
    dataset_group,
    name=dataset_name,
    split=dataset_split,
)

Found cached dataset wikitext (/home/tom/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [6]:
def get_chars_and_counts(dataset):
    """Count how often each character occurs across a dataset's text.

    Args:
        dataset: An iterable of records (e.g. a HF dataset), each of which
            supports ``record['text']`` and yields an iterable of characters
            (normally a ``str``).

    Returns:
        A ``collections.Counter`` mapping ``char -> count``. ``Counter`` is a
        ``dict`` subclass, so it can be used wherever the plain mapping was;
        unlike a ``defaultdict``, reading a character that was never seen
        returns 0 without inserting a new key. Keys appear in first-seen order.
    """
    char_counts = collections.Counter()
    for record in tqdm.tqdm(dataset):
        # Counter.update tallies every element of the iterable in one call,
        # replacing a per-character Python-level loop.
        char_counts.update(record['text'])

    return char_counts

In [7]:
# Tally character frequencies with one pass over every record of the loaded split.
char_counts = get_chars_and_counts(dataset)

100%|██████████████████████████████| 1801350/1801350 [00:30<00:00, 59648.54it/s]


In [10]:
# (char, count) pairs ordered from most to least frequent; the sort is stable,
# so characters with equal counts keep their order from char_counts.
chars_by_count = sorted(
    char_counts.items(),
    key=lambda pair: pair[1],
    reverse=True,
)

In [11]:
# Show the ten most frequent characters with their counts.
chars_by_count[:10]

[(' ', 102591407),
 ('e', 49091225),
 ('t', 34095601),
 ('a', 33724387),
 ('n', 29289422),
 ('i', 28992651),
 ('o', 28818255),
 ('r', 26331592),
 ('s', 25200925),
 ('h', 19044643)]

In [22]:
# Special tokens. They are placed at the front of chars_to_keep, so after
# enumeration '[PAD]' receives id 0 and '[UNK]' receives id 1.
special_chars = ['[PAD]', '[UNK]']

In [23]:
# Total vocabulary budget, special tokens included.
vocab_size = 1024
num_nonspecial_chars = vocab_size - len(special_chars)

# Special tokens lead (with a placeholder count of 0), followed by the most
# frequent real characters up to the remaining budget.
chars_to_keep = [(special, 0) for special in special_chars]
chars_to_keep.extend(chars_by_count[:num_nonspecial_chars])

In [24]:
# Two-way lookup tables: a token's id is its position in chars_to_keep.
id_to_char = {idx: symbol for idx, (symbol, _count) in enumerate(chars_to_keep)}
char_to_id = {symbol: idx for idx, symbol in id_to_char.items()}

In [25]:
def tokenise(data, max_seq_len=512):
    """Map each character of `data` to its id and pad to a fixed length.

    Characters missing from `char_to_id` are encoded as the '[UNK]' id, and
    the result is right-padded with the '[PAD]' id up to `max_seq_len`.

    Args:
        data: Iterable of characters (typically a str).
        max_seq_len: Maximum number of characters accepted, and the length of
            the returned list when the input fits.

    Returns:
        A list of integer token ids.

    Raises:
        ValueError: If `data` contains more than `max_seq_len` characters.
    """
    token_ids = []

    for n_seen, char in enumerate(data, start=1):
        # Fall back to the unknown-token entry for out-of-vocabulary characters.
        key = char if char in char_to_id else '[UNK]'
        token_ids.append(char_to_id[key])

        if n_seen > max_seq_len:
            raise ValueError('Sequence to tokenise exceeds length limit.')

    # Right-pad; a non-positive repeat count yields no padding.
    padding = [char_to_id['[PAD]']] * (max_seq_len - len(token_ids))
    return token_ids + padding

In [34]:
# Sample text used to exercise tokenise/detokenise.
lipsum = """Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed lectus nulla, pulvinar sed auctor nec, facilisis eu odio. Duis euismod pellentesque turpis, vitae ullamcorper tortor rutrum quis. Duis sed odio ut augue convallis convallis. Morbi at elit ut mi imperdiet vehicula. Suspendisse in sem eget est dapibus pellentesque. In ut condimentum purus. Vivamus vulputate est massa, id pretium quam pharetra eget. Duis porta ipsum vitae nibh tempus, eu ultricies nunc molestie. Nulla facilisi. Donec eu erat vitae leo laoreet mollis a quis metus. In eu libero porta magna vehicula venenatis. Praesent fermentum quam libero, ac volutpat dui tincidunt ac. Pellentesque vitae risus viverra, rhoncus augue ut, pellentesque dui."""
# Keep only the first 508 characters so that, with the default max_seq_len of
# 512, the last 4 positions are filled with padding.
tokenised = tokenise(lipsum[:508])

In [35]:
def detokenise(data):
    """Turn a sequence of token ids back into text.

    Each id is looked up in `id_to_char` and the results are concatenated.
    Special tokens are not stripped: they appear as their literal names
    (e.g. '[PAD]') in the returned string. An id missing from `id_to_char`
    raises KeyError.
    """
    pieces = (id_to_char[token_id] for token_id in data)
    return ''.join(pieces)

In [36]:
# Round-trip check: padding positions decode to literal '[PAD]' markers.
detokenise(tokenised)

'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed lectus nulla, pulvinar sed auctor nec, facilisis eu odio. Duis euismod pellentesque turpis, vitae ullamcorper tortor rutrum quis. Duis sed odio ut augue convallis convallis. Morbi at elit ut mi imperdiet vehicula. Suspendisse in sem eget est dapibus pellentesque. In ut condimentum purus. Vivamus vulputate est massa, id pretium quam pharetra eget. Duis porta ipsum vitae nibh tempus, eu ultricies nunc molestie. Nulla facilisi. Donec eu erat vita[PAD][PAD][PAD][PAD]'

In [38]:
# For comparison with the decoded output: the first 512 characters of the source text.
lipsum[:512]

'Lorem ipsum dolor sit amet, consectetur adipiscing elit. Sed lectus nulla, pulvinar sed auctor nec, facilisis eu odio. Duis euismod pellentesque turpis, vitae ullamcorper tortor rutrum quis. Duis sed odio ut augue convallis convallis. Morbi at elit ut mi imperdiet vehicula. Suspendisse in sem eget est dapibus pellentesque. In ut condimentum purus. Vivamus vulputate est massa, id pretium quam pharetra eget. Duis porta ipsum vitae nibh tempus, eu ultricies nunc molestie. Nulla facilisi. Donec eu erat vitae le'