## Data, model, and training

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Colab/password/

/content/drive/MyDrive/Colab/password


In [None]:
!ls -ltr data/passwd_db*

-rw------- 1 root root  234670546 Jul  1 00:21 data/passwd_db_val
-rw------- 1 root root 8918059640 Jul  1 00:21 data/passwd_db_train
-rw------- 1 root root  234703662 Jul  1 00:21 data/passwd_db_test
-rw------- 1 root root        102 Jul  3 15:53 data/passwd_db_min
-rw------- 1 root root          0 Jul  3 16:05 data/passwd_db_min.tfrecords


In [None]:
import numpy as np
import tensorflow as tf
import os

import distutils

# Build vocab

In [None]:
# Build the vocabulary from the test split only: it is far smaller than the
# training split, which keeps memory usage manageable.
# The `with` block closes the file handle as soon as the contents are read.
# Note: if the file ends with a newline, the last list element is an empty string.
with tf.io.gfile.GFile('data/passwd_db_test') as passwd_file:
  passwds = passwd_file.read().split("\n")

In [None]:
# Sorted list of every distinct character that occurs in any password.
vocab = sorted(set(''.join(passwds)))

In [None]:
len(vocab)

95

In [None]:
# NOTE: despite its name, `max_len` holds the longest password *string*
# (max with key=len), not its length; use len(max_len) for the length.
max_len = max(passwds, key=len)

In [None]:
max_len

'zimin0894zimin0894ver1zimin0894zimin0ziminziminver'

In [None]:
len(max_len)

50

In [None]:
# Character-level tokenizer: every character is its own token, nothing is
# filtered out and case is preserved (passwords are case-sensitive).
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    char_level=True,
    lower=False,
    filters='',
)
tokenizer.fit_on_texts(passwds)

In [None]:
# +1 because index 0 is not assigned to any character; it is the value that
# pad_sequences uses for padding.
vocab_size = len(tokenizer.index_word) + 1
# One less than the longest password: each training example is an
# (input, target) pair shifted by one character, so both have length - 1.
seq_len = len(max_len) - 1

In [None]:
print(f"Vocab size: {vocab_size}")
print(f"Seq len: {seq_len}")

Vocab size: 96
Seq len: 49


In [None]:
in_ten = tokenizer.texts_to_sequences(["pass", "testadf"])
tf.keras.preprocessing.sequence.pad_sequences(in_ten, padding='post')

array([[30,  1, 12, 12,  0,  0,  0],
       [19,  3, 12, 19,  1, 21, 29]], dtype=int32)

# Creating TFRecords

In [None]:
!head -10 data/passwd_db_train > data/passwd_db_min

In [None]:
def _int64_feature(value):
  """Returns a `tf.train.Feature` holding an int64_list.

  Args:
    value: Either a sequence of integers (list, tuple, 1-D array) or a single
      bool / enum / int / uint scalar.

  Returns:
    A `tf.train.Feature` whose `int64_list` contains the given value(s).
  """
  # Int64List expects an iterable of integers, so promote a lone integer-like
  # scalar to a one-element list. Sequences are passed through untouched.
  if isinstance(value, (int, np.integer, np.bool_)):
    value = [value]
  return tf.train.Feature(int64_list=tf.train.Int64List(value=value))

In [None]:
def parse_single_text_data(in_vec, out_vec):
  """Pack one (input, target) pair of integer vectors into a `tf.train.Example`.

  Args:
    in_vec: Integer sequence stored under the 'in_vec' feature key.
    out_vec: Integer sequence stored under the 'out_vec' feature key.

  Returns:
    A `tf.train.Example` with the two int64 features.
  """
  features = tf.train.Features(feature={
      'in_vec': _int64_feature(in_vec),
      'out_vec': _int64_feature(out_vec),
  })
  return tf.train.Example(features=features)

def write_text_to_tfr(filename):
  """Tokenize a text file line by line and write it to `<filename>.tfrecords`.

  Each line is converted to token ids with the notebook-level `tokenizer` and
  turned into a next-character pair: the input is every token but the last,
  the target is every token but the first. Both are padded with zeros at the
  end (or truncated by `pad_sequences`) to `seq_len` and stored as the
  'in_vec' / 'out_vec' int64 features of one `tf.train.Example`.

  Args:
    filename: Path of the text file to read, one entry per line.

  Returns:
    The number of records written (one per input line).
  """
  count = 0
  print(filename)
  # Open the input before creating the writer so that a missing or unreadable
  # source file does not leave an empty .tfrecords file behind. The context
  # managers guarantee that both files are closed (and the writer flushed)
  # even if tokenizing or writing a line raises.
  with open(filename) as f, tf.io.TFRecordWriter(filename + ".tfrecords") as writer:
    for line in f:
      token_ids = tokenizer.texts_to_sequences([line])[0]
      # Input and target are the same sequence shifted by one position.
      in_vec, out_vec = tf.keras.preprocessing.sequence.pad_sequences(
          [token_ids[:-1], token_ids[1:]], padding='post', maxlen=seq_len)
      example = parse_single_text_data(in_vec, out_vec)
      writer.write(example.SerializeToString())
      count += 1
  print(f"Wrote {count} elements to TFRecord")
  return count

In [None]:
write_text_to_tfr('data/passwd_db_min')

data/passwd_db_min
Wrote 10 elements to TFRecord


10

In [None]:
write_text_to_tfr('data/passwd_db_test')

data/passwd_db_test
Wrote 21499998 elements to TFRecord


21499998

In [None]:
write_text_to_tfr('data/passwd_db_val')

data/passwd_db_val
Wrote 21499998 elements to TFRecord


21499998

In [None]:
write_text_to_tfr('data/passwd_db_train')

data/passwd_db_train
Wrote 816999918 elements to TFRecord


816999918

In [None]:
!ls -ltr data/passwd_db*

-rw------- 1 root root    234670546 Jul  1 00:21 data/passwd_db_val
-rw------- 1 root root   8918059640 Jul  1 00:21 data/passwd_db_train
-rw------- 1 root root    234703662 Jul  1 00:21 data/passwd_db_test
-rw------- 1 root root          102 Jul  4 14:54 data/passwd_db_min
-rw------- 1 root root         1500 Jul  4 15:27 data/passwd_db_min.tfrecords
-rw------- 1 root root   3224999700 Jul  4 16:04 data/passwd_db_test.tfrecords
-rw------- 1 root root   3224999700 Jul  4 16:33 data/passwd_db_val.tfrecords
-rw------- 1 root root 122549987700 Jul  5 10:40 data/passwd_db_train.tfrecords


In [None]:
raw_dataset = tf.data.TFRecordDataset('data/passwd_db_min.tfrecords')
for raw_record in raw_dataset.take(10):
  print(raw_record)

tf.Tensor(b'\n\x83\x01\n@\n\x07out_vec\x125\x1a3\n1\x06\x01\x08\x04\x11\x04\x11\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n?\n\x06in_vec\x125\x1a3\n1#\x06\x01\x08\x04\x11\x04\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', shape=(), dtype=string)
tf.Tensor(b'\n\x83\x01\n?\n\x06in_vec\x125\x1a3\n1\x0c\x03$\x10\x12\x0b\x0f\x12\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\n@\n\x07out_vec\x125\x1a3\n1\x03$\x10\x12\x0b\x0f\x12\n\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00', shape=(), dtype=string)
tf.Tensor(b'\n\

In [None]:
# Schema used to decode the serialized examples: both features are
# variable-length int64 sequences.
feature_description = {
    key: tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True)
    for key in ('in_vec', 'out_vec')
}

def _parse_function(example_proto):
  """Decode one serialized `tf.train.Example` into an (in_vec, out_vec) pair."""
  parsed = tf.io.parse_single_example(example_proto, feature_description)
  return parsed['in_vec'], parsed['out_vec']

In [None]:
parsed_dataset = raw_dataset.map(_parse_function)
parsed_dataset

<MapDataset element_spec=(TensorSpec(shape=(None,), dtype=tf.int64, name=None), TensorSpec(shape=(None,), dtype=tf.int64, name=None))>

In [None]:
for parsed_record in parsed_dataset.take(2):
  print(repr(parsed_record))

(<tf.Tensor: shape=(49,), dtype=int64, numpy=
array([35,  6,  1,  8,  4, 17,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])>, <tf.Tensor: shape=(49,), dtype=int64, numpy=
array([ 6,  1,  8,  4, 17,  4, 17,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])>)
(<tf.Tensor: shape=(49,), dtype=int64, numpy=
array([12,  3, 36, 16, 18, 11, 15, 18,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])>, <tf.Tensor: shape=(49,), dtype=int64, numpy=
array([ 3, 36, 16, 18, 11, 15, 18, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        

In [None]:
# NOTE(review): `feature_description` and `_parse_function` are also defined
# in an earlier cell of this notebook; this cell redefines them identically.
feature_description = dict(
    in_vec=tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
    out_vec=tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
)

def _parse_function(example_proto):
  """Parse a serialized `tf.train.Example` and return (in_vec, out_vec)."""
  decoded = tf.io.parse_single_example(example_proto, feature_description)
  in_vec = decoded['in_vec']
  out_vec = decoded['out_vec']
  return in_vec, out_vec

def get_dataset(filename, batch_size=32):
    """Build a shuffled, batched `tf.data` pipeline from a TFRecord file.

    Args:
        filename: Path (or list of paths) of the TFRecord file(s) to read.
        batch_size: Number of (in_vec, out_vec) pairs per batch.

    Returns:
        A `tf.data.Dataset` yielding `(in_vec, out_vec)` batches.
    """
    dataset = tf.data.TFRecordDataset(filename)
    # Parse records in parallel; output order stays deterministic by default.
    dataset = dataset.map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(2048)
    dataset = dataset.batch(batch_size)
    # Prefetch must come last so that whole batches (not single elements) are
    # prepared in the background while the model consumes the previous one.
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

In [None]:
t_ds = get_dataset('data/passwd_db_min.tfrecords', batch_size=2)

In [None]:
embedding_dim = vocab_size
rnn_units = 256 # was 1024

class MyModel(tf.keras.Model):
  """Character-level model: Embedding -> GRU -> Dense producing per-step logits."""

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the layers.

    Args:
      vocab_size: Number of token ids; also the size of the output logits.
      embedding_dim: Dimension of the token embedding vectors.
      rnn_units: Number of units in the GRU layer.
    """
    # `super().__init__()` already binds `self`; passing it again would hand
    # the instance to `tf.keras.Model.__init__` as a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token-id sequences.

    Args:
      inputs: Batch of integer token ids.
      states: Optional initial GRU state; when None the GRU's own initial
        state is used.
      return_state: If True, also return the final GRU state.
      training: Forwarded to the layers.

    Returns:
      The logits for every time step, or `(logits, states)` when
      `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
# Instantiate the model with the hyperparameters defined above.
training_model =  MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

# The model outputs raw logits, hence from_logits=True.
# NOTE(review): the Embedding layer is created without mask_zero, so the
# zero-padded positions appear to contribute to the loss as well — confirm
# this is intended.
training_model.compile(
    optimizer='adam',
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True))

In [None]:
sample_test_input, sample_test_target = next(iter(t_ds))

In [None]:
sample_test_input

<tf.Tensor: shape=(2, 49), dtype=int64, numpy=
array([[28,  3,  9, 22,  1,  8, 12,  6, 19,  7,  2,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [29,  3,  8, 28, 12, 25,  1,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]])>

In [None]:
sample_test_target

<tf.Tensor: shape=(2, 49), dtype=int64, numpy=
array([[ 3,  9, 22,  1,  8, 12,  6, 19,  7,  2,  4,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0],
       [ 3,  8, 28, 12, 25,  1, 24,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0]])>

In [None]:
sample_test_preds = training_model(sample_test_input)

In [None]:
sample_test_preds.shape

TensorShape([2, 49, 96])

In [None]:
training_model.fit(
    t_ds,
    epochs=1
)



<keras.callbacks.History at 0x7fcb00b200d0>

# Compress and save TFRecords

In [3]:
!which 7z

/usr/bin/7z


In [5]:
!ls data/*.tfrecords

data/passwd_db_min.tfrecords   data/passwd_db_train.tfrecords
data/passwd_db_test.tfrecords  data/passwd_db_val.tfrecords


In [6]:
!du -sh data/*.tfrecords

1.5K	data/passwd_db_min.tfrecords
3.1G	data/passwd_db_test.tfrecords
115G	data/passwd_db_train.tfrecords
3.1G	data/passwd_db_val.tfrecords


In [7]:
%cd data

/content/drive/MyDrive/Colab/password/data


In [8]:
!7za a -t7z passwd_tfrecords.7z passwd_db_test.tfrecords passwd_db_val.tfrecords passwd_db_train.tfrecords


7-Zip (a) [64] 16.02 : Copyright (c) 1999-2016 Igor Pavlov : 2016-05-21
p7zip Version 16.02 (locale=en_US.UTF-8,Utf16=on,HugeFiles=on,64 bits,8 CPUs Intel(R) Xeon(R) CPU @ 2.20GHz (406F0),ASM,AES-NI)

Scanning the drive:
  0M Scan           3 files, 128999987100 bytes (121 GiB)

Creating archive: passwd_tfrecords.7z

Items to compress: 3

  0%      0% + passwd_db_test.tfrecords                                 1% + passwd_db_test.tfrecords                                 2% + passwd_db_test.tfrecords                                 2% 1 + passwd_db_test.tfrecords                                   2% 1 + passwd_db_train.tfrecords                                  