In [1]:
import datasets
import matplotlib.pyplot as plt
import tqdm
import collections
import numpy as np

In [2]:
dataset_group = 'wikitext'
dataset_name = 'wikitext-103-raw-v1'
dataset_split = 'train'
dataset_full_name = '/'.join([dataset_group, dataset_name, dataset_split])

dataset = datasets.load_dataset(dataset_group, name=dataset_name, split=dataset_split)

Found cached dataset wikitext (/home/tom/.cache/huggingface/datasets/wikitext/wikitext-103-raw-v1/1.0.0/a241db52902eaf2c6aa732210bead40c090019a499ceb13bcbfa3f8ab646a126)


In [3]:
batch_size = 3

In [4]:
foo = [['abc', 'def'], ['ghi', 'jkl']]

In [5]:
while foo:
    print(foo.pop())
    
if not foo:
    print('Finished')

['ghi', 'jkl']
['abc', 'def']
Finished


In [6]:
sequences = [dataset[i]['text'] for i in range(batch_size)]

In [7]:
class DatasetIterator:
    
    def __init__(self, dataset, filter_fn):
        self.dataset = dataset
        self.filter_fn = filter_fn
        self.seq_idx = 0
        
    def _return_next_record(self):
        datum = self.dataset[self.seq_idx]
        self.seq_idx += 1
        return datum
        
    def get_next(self):
        datum = self._return_next_record()
        while not self.filter_fn(datum):
            datum = self._return_next_record()
        return datum
    
    def reset(self):
        self.seq_idx = 0

In [8]:
ds_iterator = DatasetIterator(dataset, lambda x: len(x['text']))

In [9]:
for i in range(4):
    print(ds_iterator.get_next())

{'text': ' = Valkyria Chronicles III = \n'}
{'text': ' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n'}
{'text': " The game began development in 2010 , carrying over a large portion of the work done on Valkyria Chronicles II . While it retained the standard features of the series , it also underwent multiple adjustments , such as making t

In [10]:
foo = ds_iterator.get_next()

In [11]:
foo['text']

' = = Gameplay = = \n'

In [12]:
text = foo['text']
len(text)

19

In [13]:
text[::128]

' '

In [14]:
class Batcher:
    
    def __init__(self, data_source, batch_size, seq_len):
        self.data_source = data_source
        self.batch_size = batch_size
        self.seq_len = seq_len
        
    def setup(self):
        self._per_element_data = [self.data_source.get_next() for _ in range(self.batch_size)]

In [15]:
batcher = Batcher(ds_iterator, 4, 128)

In [16]:
batcher.setup()

In [17]:
batcher._per_element_data

[{'text': " As with previous Valkyira Chronicles games , Valkyria Chronicles III is a tactical role @-@ playing game where players take control of a military unit and take part in missions against enemy forces . Stories are told through comic book @-@ like panels with animated character portraits , with characters speaking partially through voiced speech bubbles and partially through unvoiced text . The player progresses through a series of linear missions , gradually unlocked as maps that can be freely scanned through and replayed as they are unlocked . The route to each story location on the map varies depending on an individual player 's approach : when one option is selected , the other is sealed off to the player . Outside missions , the player characters rest in a camp , where units can be customized and character growth occurs . Alongside the main story missions are character @-@ specific sub missions relating to different squad members . After the game 's completion , additiona

In [18]:
import char_tokeniser

In [19]:
tokeniser = char_tokeniser.CharacterTokeniser(vocab_size=100)

In [20]:
tokeniser.train(dataset)

100%|██████████████████████████████████████████████████████████████████████████████████████| 1801350/1801350 [00:29<00:00, 60113.39it/s]


In [21]:
text

' = = Gameplay = = \n'

In [22]:
tokens = tokeniser.tokenise(text)

In [23]:
batch_size = 4

In [24]:
seq_len = len(tokens)

In [25]:
import math

In [26]:
split_count = math.ceil(seq_len / batch_size)
print(split_count)

5


In [27]:
for i in range(split_count):
    print(tokens[i*batch_size:(i+1)*batch_size])

[2, 33, 2, 33]
[2, 53, 5, 16]
[3, 19, 12, 5]
[21, 2, 33, 2]
[33, 2, 36]


In [28]:
splits = [tokens[i*batch_size:(i+1)*batch_size] for i in range(split_count)]
splits

[[2, 33, 2, 33], [2, 53, 5, 16], [3, 19, 12, 5], [21, 2, 33, 2], [33, 2, 36]]

In [29]:
def pad(seq, size):
    num_to_pad = size - len(seq)
    seq += [0,] * num_to_pad
    return seq

In [30]:
pad(splits[-1], size=batch_size)

[33, 2, 36, 0]

In [31]:
splits[-1] = pad(splits[-1], size=batch_size)

In [32]:
splits

[[2, 33, 2, 33],
 [2, 53, 5, 16],
 [3, 19, 12, 5],
 [21, 2, 33, 2],
 [33, 2, 36, 0]]

In [33]:
from collections import deque

In [34]:
data_deque = deque(splits)

In [35]:
for i in range(split_count + 1):
    try:
        print(data_deque.popleft())
    except IndexError:
        print('Queue exhausted!')

[2, 33, 2, 33]
[2, 53, 5, 16]
[3, 19, 12, 5]
[21, 2, 33, 2]
[33, 2, 36, 0]
Queue exhausted!


In [36]:
batch_size = 3

In [37]:
dir(dataset)

['_TF_DATASET_REFS',
 '__class__',
 '__del__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_build_local_temp_path',
 '_check_index_is_initialized',
 '_data',
 '_estimate_nbytes',
 '_fingerprint',
 '_format_columns',
 '_format_kwargs',
 '_format_type',
 '_get_cache_file_path',
 '_get_output_signature',
 '_getitem',
 '_indexes',
 '_indices',
 '_info',
 '_map_single',
 '_new_dataset_with_indices',
 '_output_all_columns',
 '_push_parquet_shards_to_hub',
 '_save_to_disk_single',
 '_select_contiguous',
 '_select_with_indices_mapping',
 '_split',
 'add_column',
 'add_elasticsearch_index',
 'add_faiss_index',
 'add_fai

In [38]:
ds_iter = dataset.iter(batch_size=1)

In [39]:
text = next(ds_iter)

In [40]:
text

{'text': ['']}

In [41]:
foo = tokeniser.tokenise_and_batch(text, 5)

Sequence length in tokens: 1
Subsequence length: 5
Number of splits: 1


In [42]:
type(foo)

list

In [43]:
class DataSource:
    
    def __init__(self, data_iterator, tokeniser, seq_len):
        """Initialise the DataSource."""
        self.data_iterator = data_iterator
        self.tokeniser = tokeniser
        self.seq_len = seq_len
        
    def _get_from_iterator_and_tokenise(self):
        """Gets a new sequence from the data iterator and tokenise."""
        tokenised_next_record = None
        while tokenised_next_record is None:
            next_record = next(self.data_iterator)
            try:
                tokenised_next_record = self.tokeniser.tokenise_and_batch(
                    next_record['text'][0], self.seq_len)
            except ValueError:
                continue
        return tokenised_next_record

In [44]:
datasource = DataSource(
    dataset.iter(batch_size=1),
    tokeniser,
    seq_len=4
)

In [49]:
tok_batches = datasource._get_from_iterator_and_tokenise()

{'text': ['']}
{'text': [' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : 戦場のヴァルキュリア3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n']}
Sequence length in tokens: 706
Subsequence length: 4
Number of splits: 177
[[2, 30, 3, 6], [66, 98, 2, 6], [8, 2, 70, 5], [12, 26, 21, 9], [7, 5, 2, 56], [2, 73, 2, 69], [6, 9, 3, 14], [8, 9, 13, 3], [13, 2, 34, 11], [9, 8, 6, 7], [14, 12, 3, 10], [2, 55, 2, 60],

In [51]:
''.join([tokeniser.detokenise_to_string(b) for b in tok_batches])

' Senjō no Valkyria 3 : Unrecorded Chronicles ( Japanese : [UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK][UNK]3 , lit . Valkyria of the Battlefield 3 ) , commonly referred to as Valkyria Chronicles III outside Japan , is a tactical role @-@ playing video game developed by Sega and Media.Vision for the PlayStation Portable . Released in January 2011 in Japan , it is the third game in the Valkyria series . Employing the same fusion of tactical and real @-@ time gameplay as its predecessors , the story runs parallel to the first game and follows the " Nameless " , a penal military unit serving the nation of Gallia during the Second Europan War who perform secret black operations and are pitted against the Imperial unit " Calamaty Raven " . \n[PAD][PAD]'

In [46]:
text = next(ds_iter)['text']
print(text)

[' = Valkyria Chronicles III = \n']


In [47]:
text[0]

' = Valkyria Chronicles III = \n'

In [48]:
tokeniser.tokenise_and_batch(text['text'], seq_len=10)

TypeError: list indices must be integers or slices, not str

In [None]:
dir(tokeniser)

In [None]:
tokeniser.is_trained

In [None]:
tokeniser._char_to_tok.keys()