In [4]:
import data_utils
import numpy as np
import math

In [5]:
def filter_dataset(dataset, top_k):
    """Drop every token id that is not below ``top_k`` from each sequence.

    Args:
        dataset: iterable of sequences of token ids.
        top_k: exclusive upper bound; only ids ``x`` with ``x < top_k`` are kept.

    Returns:
        A 1-D NumPy object array holding one Python list per input sequence,
        in the original order.
    """
    filtered = [[x for x in sequence if x < top_k] for sequence in dataset]
    # Fill an object array element by element. Calling np.array() on lists of
    # different lengths raises ValueError on NumPy >= 1.24, and on lists that
    # happen to share a length it would return a 2-D numeric array instead.
    results = np.empty(len(filtered), dtype=object)
    for i, kept in enumerate(filtered):
        results[i] = kept
    return results


def vectorize_sequences(sequences, dimension):
    """Multi-hot encode index sequences into a float32 matrix.

    Row ``r`` of the returned ``(len(sequences), dimension)`` array holds 1.0
    in every column listed in the ``r``-th sequence and 0.0 everywhere else.
    """
    encoded = np.zeros((len(sequences), dimension), dtype=np.float32)
    row = 0
    for token_ids in sequences:
        encoded[row, token_ids] = 1.
        row += 1
    return encoded


def to_one_hot(labels, num_classes):
    """One-hot encode integer class labels.

    Returns a ``(len(labels), num_classes)`` array (NumPy's default float
    dtype) whose row ``r`` is 1.0 at column ``labels[r]`` and 0.0 elsewhere.
    """
    one_hot = np.zeros((len(labels), num_classes))
    for row, class_index in zip(range(len(labels)), labels):
        one_hot[row, class_index] = 1.
    return one_hot

In [6]:
# Small example values used to try out the batch index arithmetic.
min_index = 0
max_index = 10_001
batch_size = 1_000
num_batches = 10

In [7]:
# For 11 consecutive steps, print the candidate end index of the batch and
# whether that index is still strictly below max_index.
for i in range(11):
    n = (i % num_batches) + 1
    d = min_index + batch_size * (n + 1)
    print(d, d < max_index, sep='\n')

2000
True
3000
True
4000
True
5000
True
6000
True
7000
True
8000
True
9000
True
10000
True
11000
False
2000
True


In [33]:
def data_generator(x, y, x_dimension, y_dimension,
                   batch_size, min_index=None, max_index=None, shuffle=False,
                   verbose=True):
    """Endlessly yield ``(inputs, targets)`` batches from a slice of ``x``/``y``.

    Each batch is ``(vectorize_sequences(batch_x, x_dimension),
    to_one_hot(batch_y, y_dimension))``. One pass over
    ``[min_index, max_index)`` yields ``ceil(range / batch_size)`` batches;
    the last one is shorter when the range is not a multiple of
    ``batch_size``. After a pass the generator starts over.

    Args:
        x: indexable collection of index sequences.
        y: indexable collection of integer class labels, aligned with ``x``.
        x_dimension: width of the multi-hot input rows.
        y_dimension: number of classes, i.e. width of the one-hot target rows.
        batch_size: maximum number of samples per batch (must be >= 1).
        min_index: first index of the range (defaults to 0).
        max_index: exclusive end of the range (defaults to ``len(x)``).
        shuffle: when true, draw a new random sample order after every full
            pass (the first pass uses the stored order). ``x`` and ``y`` are
            never modified; they must support integer-array indexing
            (e.g. NumPy arrays). Uses NumPy's global random state.
        verbose: print the index range of every batch. Kept on by default to
            preserve the existing output; pass ``False`` when training.

    Raises:
        ValueError: on the first ``next()`` if ``batch_size < 1`` or the
            index range is empty.
    """
    # Compare against None explicitly so that an explicit 0 is respected.
    if min_index is None:
        min_index = 0
    if max_index is None:
        max_index = len(x)
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer')
    if max_index <= min_index:
        raise ValueError('max_index must be greater than min_index')

    span = max_index - min_index
    # Ceiling division: a trailing partial batch counts as one batch.
    num_batches = (span + batch_size - 1) // batch_size
    # None means "use the stored order"; otherwise a permutation of the
    # absolute indexes in [min_index, max_index).
    order = None

    while True:
        for n in range(num_batches):
            start = min_index + n * batch_size
            stop = min(start + batch_size, max_index)
            if verbose:
                template = ('then from {} to {}' if stop < max_index
                            else 'else from {} to {}')
                print(template.format(start, stop))
            if order is None:
                batch_x, batch_y = x[start:stop], y[start:stop]
            else:
                picked = order[start - min_index:stop - min_index]
                batch_x, batch_y = x[picked], y[picked]
            yield (vectorize_sequences(batch_x, x_dimension),
                   to_one_hot(batch_y, y_dimension))
        # Reshuffle once per pass, not once per batch, so that every sample
        # is seen exactly once in each pass.
        if shuffle:
            order = min_index + np.random.permutation(span)

In [20]:
# np.load on an .npz archive returns a lazy NpzFile that keeps the file open.
# Each array is read into memory when it is indexed, so the archive can be
# closed as soon as the four arrays have been pulled out.
with np.load('../../data/dataset/atti_dataset.npz') as dataset:
    x_train = dataset['x_train']
    y_train = dataset['y_train']

    x_test = dataset['x_test']
    y_test = dataset['y_test']

In [21]:
# Token-id cutoff for filtering; also passed as the input width of the generators.
dimensions = 10_000

In [22]:
# Run both splits through data_utils.filter_dataset with the same cutoff.
# NOTE(review): presumably the same filtering as the notebook's own
# filter_dataset -- confirm against data_utils.
x_train, x_test = (
    data_utils.filter_dataset(split, dimensions)
    for split in (x_train, x_test)
)

In [23]:
batch_size = 128
# Index separating the training range (first 80% of x_train, rounded up)
# from the validation range.
train_val_split = math.ceil(0.8 * len(x_train))
# One class per distinct label value found in y_train.
num_classes = len(set(y_train))

In [24]:
# Display the batch size currently in effect.
batch_size

128

In [34]:
# Training batches are drawn from rows [0, train_val_split) of x_train and
# validation batches from rows [train_val_split, len(x_train)).
train_generator = data_generator(
    x_train, y_train, dimensions, num_classes, batch_size,
    min_index=0, max_index=train_val_split, shuffle=True,
)
val_generator = data_generator(
    x_train, y_train, dimensions, num_classes, batch_size,
    min_index=train_val_split, max_index=len(x_train), shuffle=True,
)

In [26]:
# Display the index at which the training range ends and validation begins.
train_val_split

115478

In [27]:
# Fractional number of batches in the training range; a non-integer result
# means the last batch of a pass is only partially filled.
train_val_split / batch_size

902.171875

In [28]:
# Number of batches in one pass over the training range (despite the name,
# this is steps per pass, not a number of epochs). Ceiling division avoids
# counting an extra batch when the range is an exact multiple of batch_size.
train_epochs = (train_val_split + batch_size - 1) // batch_size

# Pull slightly more than one full pass to watch the generator wrap around.
for i in range(train_epochs + 10):
    next(train_generator)

then from 0 to 128
then from 128 to 256
then from 256 to 384
then from 384 to 512
then from 512 to 640
then from 640 to 768
then from 768 to 896
then from 896 to 1024
then from 1024 to 1152
then from 1152 to 1280
then from 1280 to 1408
then from 1408 to 1536
then from 1536 to 1664
then from 1664 to 1792
then from 1792 to 1920
then from 1920 to 2048
then from 2048 to 2176
then from 2176 to 2304
then from 2304 to 2432
then from 2432 to 2560
then from 2560 to 2688
then from 2688 to 2816
then from 2816 to 2944
then from 2944 to 3072
then from 3072 to 3200
then from 3200 to 3328
then from 3328 to 3456
then from 3456 to 3584
then from 3584 to 3712
then from 3712 to 3840
then from 3840 to 3968
then from 3968 to 4096
then from 4096 to 4224
then from 4224 to 4352
then from 4352 to 4480
then from 4480 to 4608
then from 4608 to 4736
then from 4736 to 4864
then from 4864 to 4992
then from 4992 to 5120
then from 5120 to 5248
then from 5248 to 5376
then from 5376 to 5504
then from 5504 to 5632
then 

KeyboardInterrupt: 

In [39]:
# Number of batches in one pass over the validation range. Ceiling division
# avoids counting an extra batch when the range is an exact multiple of
# batch_size.
val_epochs = (len(x_train) - train_val_split + batch_size - 1) // batch_size
print(val_epochs)
# Samples covered by that many full batches vs. samples actually available.
print(val_epochs * batch_size, len(x_train) - train_val_split)

226
28928 28869


In [40]:
# Draw val_epochs batches from the validation generator; x and y are left
# holding the last batch that was produced.
for i in range(val_epochs):
    x,y = next(val_generator)

then from 115478 to 115606
then from 115606 to 115734
then from 115734 to 115862
then from 115862 to 115990
then from 115990 to 116118
then from 116118 to 116246
then from 116246 to 116374
then from 116374 to 116502
then from 116502 to 116630
then from 116630 to 116758
then from 116758 to 116886
then from 116886 to 117014
then from 117014 to 117142
then from 117142 to 117270
then from 117270 to 117398
then from 117398 to 117526
then from 117526 to 117654
then from 117654 to 117782
then from 117782 to 117910
then from 117910 to 118038
then from 118038 to 118166
then from 118166 to 118294
then from 118294 to 118422
then from 118422 to 118550
then from 118550 to 118678
then from 118678 to 118806
then from 118806 to 118934
then from 118934 to 119062
then from 119062 to 119190
then from 119190 to 119318
then from 119318 to 119446
then from 119446 to 119574
then from 119574 to 119702
then from 119702 to 119830
then from 119830 to 119958
then from 119958 to 120086
then from 120086 to 120214
t

In [30]:
# Total number of sequences in x_train (training and validation ranges together).
len(x_train)

144347