# Password

In [2]:
import numpy as np
import tensorflow as tf
import os

import distutils

In [3]:
tf.__version__

'2.8.0'

## Add required libraries
nltk is required for calculating BLEU Score

In [4]:
!pip install nltk

Defaulting to user installation because normal site-packages is not writeable
[0m

## Set memory growth
This prevents TensorFlow from allocating all of the GPU memory at once; memory is allocated as needed instead.

In [5]:
# Turn on memory growth for every GPU TensorFlow can see, then report how many
# physical and logical GPUs are available.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # Currently, memory growth needs to be the same across GPUs
    for gpu in gpus:
      tf.config.experimental.set_memory_growth(gpu, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as e:
    # Memory growth must be set before GPUs have been initialized
    print(e)

1 Physical GPUs, 1 Logical GPUs


## Peek at raw data
816 M records for training,
21 M records for validation,
21 M records for Testing

In [6]:
!ls -ltr passwd_db*

-rw------- 1 ubuntu ubuntu   8918059640 Jul  1 00:15 passwd_db_train
-rw------- 1 ubuntu ubuntu    234670546 Jul  1 00:20 passwd_db_val
-rw------- 1 ubuntu ubuntu    234703662 Jul  1 00:20 passwd_db_test
-rw-r--r-- 1 ubuntu ubuntu          102 Jul  2 23:10 passwd_db_min
-rw------- 1 ubuntu ubuntu   3224999700 Jul  4 15:36 passwd_db_test.tfrecords
-rw------- 1 ubuntu ubuntu   3224999700 Jul  4 16:04 passwd_db_val.tfrecords
-rw------- 1 ubuntu ubuntu 122549987700 Jul  4 16:33 passwd_db_train.tfrecords


In [6]:
!wc -l passwd_db_train

816999918 passwd_db_train


In [7]:
!wc -l passwd_db_val

21499998 passwd_db_val


In [8]:
!wc -l passwd_db_test

21499998 passwd_db_test


In [7]:
# Smoke test: push a small random batch through a GRU layer to surface any
# GPU/runtime problems before the real model is built.
inputs = tf.random.normal([32, 10, 8])
gru = tf.keras.layers.GRU(4)
output = gru(inputs)
print(output.shape)

(32, 4)


## Build vocab
Build the vocabulary from the test dataset only, rather than from the full raw data. This is to save RAM.

In [8]:
# Read the test split (one password per line). The context manager makes sure
# the file handle is closed once the contents are in memory.
with tf.io.gfile.GFile('passwd_db_test') as passwd_file:
  passwds = passwd_file.read().split("\n")

# Every distinct character that occurs in any password, in sorted order.
vocab = sorted(set(''.join(passwds)))

In [9]:
len(vocab)

95

In [10]:
# NOTE: despite the name, `max_len` is the longest password *string*, not a
# number; later cells call len(max_len) to obtain the maximum length.
max_len = max(passwds, key=len)

In [11]:
max_len

'zimin0894zimin0894ver1zimin0894zimin0ziminziminver'

In [12]:
len(max_len)

50

In [13]:
# Character-level tokenizer fitted on the passwords read above. `filters=''`
# and `lower=False` are passed so that no character is stripped and case is kept.
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True, lower=False)
tokenizer.fit_on_texts(passwds)

In [14]:
# One slot more than the number of characters the tokenizer knows: index 0 is
# used as the padding id.
vocab_size = len(tokenizer.index_word) + 1
# One less than the longest password. Presumably inputs and targets are the
# password shifted by one character - TODO confirm against the TFRecord writer.
seq_len = len(max_len) - 1

In [15]:
print(f"Vocab size: {vocab_size}")
print(f"Seq len: {seq_len}")

Vocab size: 96
Seq len: 49


In [16]:
in_ten = tokenizer.texts_to_sequences(["pass", "testadf"])
tf.keras.preprocessing.sequence.pad_sequences(in_ten, padding='post')

array([[30,  1, 12, 12,  0,  0,  0],
       [19,  3, 12, 19,  1, 21, 29]], dtype=int32)

## Loading Dataset
Loading the entire raw dataset would require up to ~800 GB of RAM, so the raw data has been converted to TFRecord files.

In [17]:
# Schema used to parse one serialized tf.train.Example: two variable-length
# int64 sequences.
# NOTE(review): presumably `in_vec` holds the input token ids and `out_vec` the
# target token ids - confirm against the script that wrote the TFRecords.
feature_description = {
      'in_vec' : tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
      'out_vec':tf.io.FixedLenSequenceFeature([], tf.int64, allow_missing=True),
    }

def _parse_function(example_proto):
  """Decode one serialized `tf.train.Example` into an `(in_vec, out_vec)` pair.

  The record is parsed with the module-level `feature_description` schema.
  """
  parsed = tf.io.parse_single_example(example_proto, feature_description)
  in_vec, out_vec = parsed['in_vec'], parsed['out_vec']
  return in_vec, out_vec

def get_dataset(filename, batch_size=32, shuffle_buffer=2048):
    """Build a shuffled, batched `tf.data` pipeline over a TFRecord file.

    Args:
      filename: TFRecord file to read (passed straight to `TFRecordDataset`).
      batch_size: Number of examples per batch; a trailing partial batch is
        dropped.
      shuffle_buffer: Size of the shuffle buffer, in examples.

    Returns:
      A `tf.data.Dataset` yielding `(in_vec, out_vec)` batches.
    """
    dataset = tf.data.TFRecordDataset(filename)
    # Decode records in parallel; element order is still deterministic.
    dataset = dataset.map(_parse_function, num_parallel_calls=tf.data.AUTOTUNE)
    dataset = dataset.shuffle(shuffle_buffer)
    dataset = dataset.batch(batch_size, drop_remainder=True)
    # Prefetch as the last step so that complete batches (not single examples)
    # are prepared while the previous batch is being consumed.
    dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)
    return dataset

In [18]:
!ls -ltr *.tfrecords

-rw------- 1 ubuntu ubuntu   3224999700 Jul  4 15:36 passwd_db_test.tfrecords
-rw------- 1 ubuntu ubuntu   3224999700 Jul  4 16:04 passwd_db_val.tfrecords
-rw------- 1 ubuntu ubuntu 122549987700 Jul  4 16:33 passwd_db_train.tfrecords


In [19]:
# One pipeline per split, all with the same batch size.
# NOTE(review): get_dataset() shuffles every split, including test and
# validation - confirm that this is intended.
test_dataset = get_dataset('passwd_db_test.tfrecords', batch_size=8192)
val_dataset = get_dataset('passwd_db_val.tfrecords', batch_size=8192)
train_dataset = get_dataset('passwd_db_train.tfrecords', batch_size=8192)

## Creating Model

In [40]:
# The embedding width is set equal to the vocabulary size.
embedding_dim = vocab_size
# Number of units in the GRU layer.
rnn_units = 256 # was 1024

class MyModel(tf.keras.Model):
  """Character-level sequence model: Embedding -> GRU -> Dense.

  The final Dense layer has `vocab_size` units and no activation, so the model
  returns one vector of unnormalised scores (logits) per input position.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    """Create the three layers.

    Args:
      vocab_size: Number of token ids (size of the embedding table and of the
        output layer).
      embedding_dim: Width of the embedding vectors.
      rnn_units: Number of GRU units.
    """
    # `self` is already bound by super(); passing it again would hand the
    # instance to keras.Model.__init__ as a stray positional argument.
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Run the model on a batch of token ids.

    Args:
      inputs: Batch of token-id sequences.
      states: Initial GRU state; when None the layer's default initial state
        is used.
      return_state: When True, also return the final GRU state.
      training: Forwarded to every layer.

    Returns:
      The per-position outputs of the Dense layer, or a tuple
      `(outputs, states)` when `return_state` is True.
    """
    x = self.embedding(inputs, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    return x

In [41]:
# Instantiate the network that will be trained, using the hyper-parameters
# defined above.
training_model = MyModel(vocab_size=vocab_size, embedding_dim=embedding_dim, rnn_units=rnn_units)

# The loss is configured with from_logits=True, i.e. it expects the raw output
# of the final Dense layer.
training_model.compile(
    loss=tf.losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
)

## Verify dataset and model

In [22]:
sample_test_input, sample_test_target = next(iter(test_dataset))

In [23]:
sample_test_input.shape

TensorShape([8192, 49])

In [24]:
sample_test_target.shape

TensorShape([8192, 49])

In [25]:
sample_test_preds = training_model(sample_test_input)

In [26]:
sample_test_preds.shape

TensorShape([8192, 49, 96])

## BLEU Score before training

In [27]:
from nltk.translate.bleu_score import corpus_bleu

def bleu_score(y_true, y_pred):
  """Average a per-sequence `corpus_bleu` score over a batch.

  Each row of `y_true` is paired with the row of `y_pred` at the same position;
  every token id in a row is mapped to a one-character string with `chr` before
  scoring. The result is the mean of the per-row scores.

  NOTE(review): `corpus_bleu` is given two flat lists of single characters,
  whereas its documented signature takes a list of reference *lists* and a
  list of hypotheses. Padding ids are scored like any other token. Confirm that
  the resulting number is the metric that is actually wanted.
  """
  per_sequence = [
      corpus_bleu([chr(tok) for tok in target], [chr(tok) for tok in guess], weights=[0.25])
      for target, guess in zip(y_true, y_pred)
  ]
  return np.average(per_sequence)

# try model without training
def calculate_bleu_score(model, dataset, till_batch):
  """Average `bleu_score` of greedy predictions over the first batches of a dataset.

  Args:
    model: Callable mapping an input batch to per-position scores whose last
      axis indexes the vocabulary.
    dataset: Iterable of `(input_batch, target_batch)` pairs.
    till_batch: Maximum number of batches to evaluate.

  Returns:
    The mean of the per-batch scores, or NaN when no batch was evaluated.
  """
  scores = []

  for batch_i, (input_test_batch, target_test_batch) in enumerate(dataset, 1):
    # Stop before touching batch `till_batch + 1`, so exactly `till_batch`
    # batches are scored (checking after the work evaluated one batch too many).
    if batch_i > till_batch:
      break
    # Greedy decoding for the whole batch in one call instead of a Python loop
    # over individual examples.
    test_batch_predictions = tf.argmax(model(input_test_batch), axis=-1).numpy().astype(np.int32)
    scores.append(bleu_score(target_test_batch.numpy(), test_batch_predictions))
    if batch_i % 100 == 0:
      print(f"Till {batch_i} the avg bleu score is {np.average(scores)}")

  # np.average([]) would emit a RuntimeWarning; make the empty case explicit.
  final_score = np.average(scores) if scores else np.nan
  print(f"Final avg bleu score is {final_score}")
  return final_score

In [28]:
tf.random.categorical(sample_test_preds[0], num_samples=1)

<tf.Tensor: shape=(49, 1), dtype=int64, numpy=
array([[ 3],
       [51],
       [ 6],
       [65],
       [82],
       [91],
       [65],
       [37],
       [ 1],
       [42],
       [81],
       [32],
       [72],
       [52],
       [12],
       [68],
       [57],
       [ 9],
       [75],
       [85],
       [70],
       [15],
       [61],
       [67],
       [19],
       [61],
       [ 3],
       [83],
       [22],
       [41],
       [77],
       [30],
       [29],
       [88],
       [ 0],
       [27],
       [72],
       [49],
       [30],
       [46],
       [14],
       [68],
       [18],
       [74],
       [43],
       [85],
       [57],
       [49],
       [ 9]])>

In [29]:
tf.argmax(sample_test_preds[0], axis=1)

<tf.Tensor: shape=(49,), dtype=int64, numpy=
array([20, 48, 39, 56, 41, 17, 41, 81, 81, 54,  1, 46, 73, 76, 76, 76, 76,
       76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76,
       76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76, 76])>

In [30]:
np.array([tf.argmax(i, axis=1).numpy() for i in sample_test_preds], dtype=np.int32)

array([[20, 48, 39, ..., 76, 76, 76],
       [81, 51,  8, ..., 76, 76, 76],
       [53, 33, 70, ..., 76, 76, 76],
       ...,
       [15, 47, 47, ..., 76, 76, 76],
       [17, 76,  8, ..., 76, 76, 76],
       [28, 57, 45, ..., 76, 76, 76]], dtype=int32)

In [31]:
np.array([tf.squeeze(tf.random.categorical(i, num_samples=1), axis=-1).numpy() for i in sample_test_preds], dtype=np.int32)

array([[38, 50, 60, ..., 84, 48, 72],
       [93, 42,  4, ..., 14,  2, 69],
       [50, 48, 46, ..., 45,  0, 31],
       ...,
       [55, 19, 78, ...,  5, 37, 63],
       [67, 24, 29, ..., 24, 17, 22],
       [37,  7, 65, ..., 16, 13, 26]], dtype=int32)

In [32]:
sample_test_target.numpy()

array([[23, 25,  7, ...,  0,  0,  0],
       [43, 22, 20, ...,  0,  0,  0],
       [ 1, 22,  6, ...,  0,  0,  0],
       ...,
       [19, 24, 17, ...,  0,  0,  0],
       [ 7, 22,  7, ...,  0,  0,  0],
       [ 2, 10, 15, ...,  0,  0,  0]])

In [33]:
bleu_score(sample_test_target.numpy(), np.array([tf.argmax(i, axis=1).numpy() for i in sample_test_preds], dtype=np.int32))

0.02486440619999932

In [34]:
bleu_score(sample_test_target.numpy(), np.array([tf.squeeze(tf.random.categorical(i, num_samples=1), axis=-1).numpy() for i in sample_test_preds], dtype=np.int32))

0.1592550773478552

## Training

In [42]:
# Stop training when `val_loss` has not improved for 3 epochs in a row and
# restore the best weights seen.
earlystopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    mode = 'min',
    verbose = 1,
    patience = 3,
    restore_best_weights = True
)

In [43]:
# Train for up to 10 epochs with validation on `val_dataset`; the early-stopping
# callback may end the run sooner.
training_model.fit(
    train_dataset,
    epochs=10,
    validation_data=val_dataset,
    callbacks=[earlystopping_cb]
)
# Persist the trained weights; they are loaded into a fresh model for inference
# further down.
training_model.save_weights('gpu_password_gru', overwrite=True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## BLEU Score after training

In [44]:
sample_test_preds = training_model(sample_test_input)

In [45]:
bleu_score(sample_test_target.numpy(), np.array([tf.argmax(i, axis=1).numpy() for i in sample_test_preds], dtype=np.int32))

0.9614950184899456

In [46]:
bleu_score(sample_test_target.numpy(), np.array([tf.squeeze(tf.random.categorical(i, num_samples=1), axis=-1).numpy() for i in sample_test_preds], dtype=np.int32))

0.9567861176470129

## Inference

In [47]:
prediction_model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [48]:
prediction_model.load_weights('gpu_password_gru')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f635aa55c40>

In [49]:
class OneStep(tf.keras.Model):
  """Wraps a trained character model so it can be sampled one step at a time."""

  def __init__(self, model, tokenizer, temperature=1.0):
    """Store the wrapped model, its tokenizer and the sampling temperature.

    Args:
      model: Model called as `model(inputs=..., states=..., return_state=True)`
        and returning `(logits, states)`.
      tokenizer: Tokenizer providing `texts_to_sequences` and
        `sequences_to_texts`.
      temperature: Divisor applied to the logits before sampling.
    """
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.tokenizer = tokenizer

  def generate_one_step(self, input_chars, states=None):
    """Sample the next character for the given input text(s).

    This method calls `.numpy()` and the Python tokenizer, so it has to run
    eagerly.

    Args:
      input_chars: List of input strings.
      states: Model state returned by the previous call, or None to start
        from the model's default initial state.

    Returns:
      A tuple `(predicted_chars, states)`: the list of strings produced by
      `tokenizer.sequences_to_texts` for the sampled ids, and the new state.
    """
    # An empty first string is replaced by a single space so the tokenizer
    # yields a non-empty sequence. Presumably the empty string shows up when the
    # previous step sampled an id without a character (e.g. padding) - confirm.
    # A list is used here to keep the type consistent with normal input.
    if input_chars[0] == '':
      input_chars = [' ']

    # Convert strings to token ids and pad them to a rectangular batch.
    input_ids = self.tokenizer.texts_to_sequences(input_chars)
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding='post')

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.tokenizer.sequences_to_texts([predicted_ids.numpy()])

    # Return the characters and model state.
    return predicted_chars, states

In [50]:
one_step_model = OneStep(prediction_model, tokenizer)

In [51]:
import time

# Time a short generation run: start from 'w' and sample two more characters,
# carrying the model state from one step to the next.
start = time.time()
states = None
next_char = ['w']
result = [next_char]

for n in range(2):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

# Concatenate the per-step pieces into one string tensor.
result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

wes 

________________________________________________________________________________

Run time: 0.11068558692932129


In [52]:
def pwds_from_chars(model, start_char, len):
  """Generate one string by sampling `len - 1` steps after `start_char`.

  Args:
    model: Object exposing `generate_one_step(chars, states=...)` that returns
      `(next_chars, states)`.
    start_char: First character of the generated string.
    len: Total number of generation slots, the start character included.
      (The parameter name shadows the builtin inside this function.)

  Returns:
    The generated text as a Python `str`.
  """
  current, carry = [start_char], None
  pieces = [current]

  for _ in range(len - 1):
    current, carry = model.generate_one_step(current, states=carry)
    pieces.append(current)

  joined = tf.strings.join(pieces)
  return joined[0].numpy().decode('utf-8')

In [53]:
# Keep sampling 3-character strings that start with 'w', collecting each new
# one, and stop as soon as a string is drawn that was already collected.
all_pos = []
while True:
  pred = pwds_from_chars(one_step_model,'w', 3)
  if pred not in all_pos:
    all_pos.append(pred)
  else:
    break

print(all_pos)

['wus', 'wdV', 'wil', 'wyl', 'wow', 'wel', 'wep', 'www', 'wes', 'wnd', 'wav', 'wg2', 'w9V', 'wqe', 'wha', 'wac', 'wob', 'wei', 'wos', 'way', 'wut', 'wic', 'wen', 'ww3', 'wab', 'wly', 'whi', 'wal', 'wor', 'wDk', 'wax', 'wma', 'wae', 'wwj', 'w94', 'wez', 'wun']


In [55]:
# Count how many sampled candidates (same first character and length as the
# target) it takes until the model produces `given_pass` exactly.
# NOTE(review): there is no cap on the number of attempts, so the loop only
# ends once the target has actually been generated.
given_pass = 'pass'
attempts = 1

while True:
  pred = pwds_from_chars(one_step_model, given_pass[0], len(given_pass))
  print(f"{attempts} - {pred}")
  if pred == given_pass:
    break
  attempts += 1

print(f"Model took {attempts} attempts to find password - {given_pass}")

1 - pra5
2 - paul
3 - pera
4 - puze
5 - popu
6 - phan
7 - pney
8 - pass
Model took 8 attempts to find password - pass


In [58]:
import random

def gen_passwords_random(num, pass_len=5, start_char=None, alphabet=None):
  """Generate `num` passwords made of uniformly random characters (baseline).

  Args:
    num: Number of passwords to generate.
    pass_len: Total length of each password, the first character included.
    start_char: Fixed first character. When None, a fresh random first
      character is drawn for every password.
    alphabet: Optional sequence of characters to draw from. Defaults to every
      character known to the global `tokenizer`.

  Returns:
    A list of `num` strings.
  """
  if alphabet is None:
    # Use the whole vocabulary. The previous randint(1, N - 1) call is inclusive
    # on both ends and therefore never selected the last index N.
    alphabet = list(tokenizer.index_word.values())
  else:
    alphabet = list(alphabet)

  passwords = []
  for _ in range(num):
    first = random.choice(alphabet) if start_char is None else start_char
    rest = ''.join(random.choice(alphabet) for _ in range(pass_len - 1))
    passwords.append(first + rest)
  return passwords

In [59]:
gen_passwords_random(4, pass_len=5)

[';\\u{l', 'sM8=a', 'Exu3P', '`2 XA']

In [61]:
def gen_passwords(model, num, pass_len=5, seq_len=3, start_char=None):
  """Generate `num` passwords by sampling from the model.

  Args:
    model: Step model handed to `pwds_from_chars`.
    num: Number of passwords to generate.
    pass_len: Length passed to `pwds_from_chars` for each password.
    seq_len: Unused; kept so existing calls keep working.
    start_char: Fixed first character. When None, a fresh random first
      character is drawn from the global `tokenizer` vocabulary for every
      password.

  Returns:
    A list of `num` generated strings.
  """
  # Candidates for a random first character: the whole vocabulary. The previous
  # randint(1, N - 1) call is inclusive on both ends and never picked index N.
  first_chars = list(tokenizer.index_word.values()) if start_char is None else None

  passwords = []
  for _ in range(num):
    first = start_char if first_chars is None else random.choice(first_chars)
    passwords.append(pwds_from_chars(model, first, pass_len))
  return passwords

In [62]:
gen_passwords(one_step_model, 4, pass_len=5, start_char='m')

['mairy', 'mihos', 'mikee', 'micro']

In [63]:
import hashlib
import requests

def check_pwned(passes):
  """Look up passwords in the Pwned Passwords range API.

  For every password the upper-case SHA-1 hex digest is computed; only its
  first five characters are sent to the API and the returned list of hash
  suffixes is matched locally against the remainder of the digest.

  Args:
    passes: Iterable of password strings.

  Returns:
    A dict mapping each password that was found to the text following the
    ':' on its matching response line. Passwords whose request failed are
    reported on stdout and left out of the result.
  """
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)", 
    "Referer": "https://haveibeenpwned.com/"
  }
  pwned_api = 'https://api.pwnedpasswords.com/range/'
  found_passwds = {}
  for p in passes:
    # SHA-1 is dictated by the API's lookup scheme; it is not used for security here.
    digest = hashlib.sha1(p.encode()).hexdigest().upper()
    prefix, suffix = digest[:5], digest[5:]
    try:
      res = requests.get(pwned_api + prefix, headers=headers, timeout=10)
      # Treat HTTP errors (rate limiting, outages) as failures instead of
      # silently counting the password as "not found".
      res.raise_for_status()
    except requests.RequestException as e:
      # Report the real cause; not every failure is a timeout.
      print(f'request failed for pass {p}: {e}')
      continue
    for line in res.text.splitlines():
      candidate, _, count = line.partition(':')
      if candidate == suffix:
        found_passwds[p] = count
        break  # a matching line was found; no need to scan the rest
  return found_passwds

In [64]:
# For every character in `vocab`, record the fraction of generated passwords
# that check_pwned() finds, once for the random baseline and once for the model.
random_preds = {}
model_preds = {}

# Passwords generated per start character and their length.
total_num_pass = 100
pass_length = 5

for i in vocab:
  # Baseline: random characters after the fixed start character.
  gen_rand_pass = gen_passwords_random(total_num_pass, pass_len=pass_length, start_char=i)
  random_preds[i] = len(check_pwned(gen_rand_pass))/total_num_pass
  # Model: characters sampled from the trained network.
  gen_pass = gen_passwords(one_step_model, total_num_pass, pass_len=pass_length, start_char=i)
  model_preds[i] = len(check_pwned(gen_pass))/total_num_pass
  print(f"char '{i}' : random password prob {random_preds[i]}, model password prob: {model_preds[i]}")

char ' ' : random password prob 0.0, model password prob: 0.06
char '!' : random password prob 0.0, model password prob: 0.12
char '"' : random password prob 0.0, model password prob: 0.01
char '#' : random password prob 0.0, model password prob: 0.03
char '$' : random password prob 0.0, model password prob: 0.34
char '%' : random password prob 0.0, model password prob: 0.01
char '&' : random password prob 0.0, model password prob: 0.08
char ''' : random password prob 0.0, model password prob: 0.0
char '(' : random password prob 0.0, model password prob: 0.0
char ')' : random password prob 0.0, model password prob: 0.01
char '*' : random password prob 0.0, model password prob: 0.04
char '+' : random password prob 0.0, model password prob: 0.03
char ',' : random password prob 0.0, model password prob: 0.02
char '-' : random password prob 0.0, model password prob: 0.05
char '.' : random password prob 0.0, model password prob: 0.08
char '/' : random password prob 0.0, model password prob:

In [66]:
import json

# Load the per-character counts (presumably the number of passwords starting
# with each character - confirm against the script that produced the file).
# The context manager closes the file once it has been parsed.
with open('passwd_dist.json') as f:
    data_dist = json.load(f)

In [67]:
data_dist

{'`': 25371,
 '^': 34390,
 '~': 46895,
 '<': 84039,
 '=': 76110,
 '>': 11010,
 '|': 13683,
 ';': 80891,
 '.': 432644,
 '[': 76870,
 '*': 897864,
 '(': 197652,
 '#': 248357,
 '_': 486252,
 '-': 339118,
 '@': 500543,
 ',': 76695,
 '?': 179414,
 ')': 29046,
 ']': 14499,
 '/': 73812,
 '%': 61061,
 '!': 702601,
 '"': 35088,
 '{': 29576,
 '&': 164935,
 '+': 126958,
 ':': 669,
 "'": 24411,
 '}': 9221,
 '$': 1200710,
 '\\': 23422,
 '0': 20507166,
 '1': 40734951,
 '2': 25194418,
 '3': 16414591,
 '4': 13236911,
 '5': 13154142,
 '6': 11912339,
 '7': 12795409,
 '8': 15947414,
 '9': 15017858,
 'a': 40332222,
 'A': 7585477,
 'b': 31338915,
 'B': 5559200,
 'c': 31089790,
 'C': 5040938,
 'd': 30078421,
 'D': 5262567,
 'e': 17031957,
 'E': 3113239,
 'f': 23680570,
 'F': 3453015,
 'g': 21228338,
 'G': 3935545,
 'h': 18660755,
 'H': 3273647,
 'i': 13447261,
 'I': 2676512,
 'j': 23625092,
 'J': 3942097,
 'k': 25896397,
 'K': 4609562,
 'l': 27520209,
 'L': 4880087,
 'm': 40192522,
 'M': 7681713,
 'n': 2043

In [70]:
# Sum of all per-character counts in `data_dist`.
total_passwds = sum(data_dist.values())

total_passwds

859999915

In [72]:
# Turn each count into its share of the total.
prob_pass_start_chars = {char: count / total_passwds for char, count in data_dist.items()}

In [73]:
sorted_num_pass_w_char = sorted(prob_pass_start_chars.items(), key=lambda x:x[1])
sorted_num_pass_w_char[-10:]

[('k', 0.030112092511078912),
 ('t', 0.03019875298476047),
 ('l', 0.03200024618607084),
 ('d', 0.034974911596357544),
 ('c', 0.03615092217770743),
 ('b', 0.036440602438896755),
 ('m', 0.04673549531688035),
 ('a', 0.04689793719340077),
 ('1', 0.04736622677456893),
 ('s', 0.04979685375899136)]

In [74]:
char_dist = dict(sorted_num_pass_w_char)

In [79]:
# Weighted average of the random-baseline hit rate: each character's rate is
# weighted by that character's share in `char_dist`. The space character is
# left out of the sum.
rand_we_avg = 0
for x in vocab:
    if x != ' ':
        rand_we_avg += char_dist[x] * random_preds[x]

In [80]:
rand_we_avg

0.001278550184507867

In [82]:
# Weighted average of the model hit rate: each character's rate is weighted by
# that character's share in `char_dist`. The space character is left out of
# the sum.
model_we_avg = 0
for x in vocab:
    if x != ' ':
        model_we_avg += char_dist[x] * model_preds[x]

In [83]:
model_we_avg

0.5487125983378731

In [84]:
un_training_model =  MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [85]:
calculate_bleu_score(un_training_model, test_dataset, 2000)

Till 100 the avg bleu score is 0.0209580082891451
Till 200 the avg bleu score is 0.02102168448135571
Till 300 the avg bleu score is 0.021016760012814516
Till 400 the avg bleu score is 0.021033122238615583
Till 500 the avg bleu score is 0.021005981872360163
Till 600 the avg bleu score is 0.02098966191974917
Till 700 the avg bleu score is 0.021001287331644136
Till 800 the avg bleu score is 0.0210111565168881
Till 900 the avg bleu score is 0.02102369734777124
Till 1000 the avg bleu score is 0.0210375336540853
Till 1100 the avg bleu score is 0.021045548160576785
Till 1200 the avg bleu score is 0.021050480816782426
Till 1300 the avg bleu score is 0.021049299169772336
Till 1400 the avg bleu score is 0.021053356085393538
Till 1500 the avg bleu score is 0.021042231208090272
Till 1600 the avg bleu score is 0.02104176738039839
Till 1700 the avg bleu score is 0.021046803193946494
Till 1800 the avg bleu score is 0.021045170989821935
Till 1900 the avg bleu score is 0.021043772006728557
Till 2000 th

0.021047438312318877

In [86]:
calculate_bleu_score(prediction_model, test_dataset, 2000)

Till 100 the avg bleu score is 0.9615185299230765
Till 200 the avg bleu score is 0.9615270699617517
Till 300 the avg bleu score is 0.9615200477031344
Till 400 the avg bleu score is 0.9615294572748456
Till 500 the avg bleu score is 0.9615273819538481
Till 600 the avg bleu score is 0.9615271568197761
Till 700 the avg bleu score is 0.9615303514730746
Till 800 the avg bleu score is 0.9615278975783217
Till 900 the avg bleu score is 0.9615319933058355
Till 1000 the avg bleu score is 0.9615324558697163
Till 1100 the avg bleu score is 0.9615305745961558
Till 1200 the avg bleu score is 0.9615318202010693
Till 1300 the avg bleu score is 0.9615294347813979
Till 1400 the avg bleu score is 0.9615278364723404
Till 1500 the avg bleu score is 0.9615250027668489
Till 1600 the avg bleu score is 0.9615252427443969
Till 1700 the avg bleu score is 0.9615256334658429
Till 1800 the avg bleu score is 0.961526388168298
Till 1900 the avg bleu score is 0.9615260739864636
Till 2000 the avg bleu score is 0.9615257

0.9615254745755878

In [88]:
import pandas as pd

# One two-column frame per dictionary, all keyed by 'char'.
df1 =pd.DataFrame(char_dist.items(), columns=['char', 'dist'])
df2 = pd.DataFrame(random_preds.items(), columns=['char', 'rand_pred_prob'])
df3 = pd.DataFrame(model_preds.items(), columns=['char', 'model_pred_prob'])

# Attach both hit-rate columns to the distribution frame by matching on 'char',
# then display the rows ordered by `dist`, largest first.
df4 = df1.join(df2.set_index('char'), on='char').join(df3.set_index('char'), on='char')
df4.sort_values(by=['dist'], ascending=False)

Unnamed: 0,char,dist,rand_pred_prob,model_pred_prob
93,s,4.979685e-02,0.01,0.67
92,1,4.736623e-02,0.00,0.60
91,a,4.689794e-02,0.00,0.67
90,m,4.673550e-02,0.00,0.63
89,b,3.644060e-02,0.00,0.64
...,...,...,...,...
4,],1.685930e-05,0.00,0.01
3,|,1.591047e-05,0.00,0.00
2,>,1.280233e-05,0.00,0.00
1,},1.072209e-05,0.00,0.00


In [89]:
df4.sort_values(by=['dist'], ascending=False).to_csv('passwd_stats.csv')

# Save Embeddings

In [90]:
training_model.summary()

Model: "my_model_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     multiple                  9216      
                                                                 
 gru_4 (GRU)                 multiple                  271872    
                                                                 
 dense_3 (Dense)             multiple                  24672     
                                                                 
Total params: 305,760
Trainable params: 305,760
Non-trainable params: 0
_________________________________________________________________


In [92]:
# Read the embedding matrix through the model's `embedding` attribute. Looking
# the layer up by its auto-generated name ('embedding_3') only works while the
# kernel happens to have numbered the layer that way.
embed_weights = training_model.embedding.get_weights()[0]

In [93]:
embed_weights.shape

(96, 96)

In [94]:
len(tokenizer.word_index)

95

In [95]:
len(vocab)

95

In [96]:
embed_weights[1]

array([-8.52828845e-03,  3.98259237e-03, -5.86652604e-04,  2.21915753e-03,
        7.69110629e-04,  2.64078286e-03, -9.25570028e-04,  8.43729749e-02,
        1.75543560e-03,  4.55471361e-03, -1.72915459e-01, -5.01352770e-04,
        3.16659957e-02,  5.15194088e-02, -7.49078719e-03, -2.35463511e-02,
       -1.54948533e-01,  9.75966081e-02,  4.81108934e-01,  1.38093717e-03,
       -3.08980495e-02, -1.97718546e-05,  1.34239893e-03, -2.01491460e-01,
        1.42857581e-01,  1.92554891e-02, -1.41481287e-05, -5.06383479e-02,
        2.58298265e-03, -4.95540490e-03, -5.88862821e-02, -5.40693384e-03,
        1.39898583e-02,  1.12475425e-01, -9.34882089e-02, -8.26020762e-02,
        1.23469010e-02, -2.82282770e-01, -4.60601971e-03, -1.18654454e-02,
       -1.67712793e-02,  1.22016385e-01,  4.14871814e-04,  1.48158088e-01,
       -5.28979413e-02, -4.54684999e-03, -8.67892727e-02,  1.63611412e-01,
       -1.65318735e-02,  7.26840943e-02, -1.81778742e-03, -7.71169551e-03,
        1.31675750e-02, -

In [97]:
import io

# Write one line per vocabulary character to two parallel TSV files: the
# tab-separated embedding vector and the character itself. The `with` block
# closes both files even if a write fails part-way through.
with io.open('password_v2_tf_vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('password_v2_tf_metadata.tsv', 'w', encoding='utf-8') as out_m:
  # Start at 1: index 0 is padding and has no character.
  for index in range(1, len(tokenizer.index_word) + 1):
    vec = embed_weights[index]
    out_v.write('\t'.join(str(x) for x in vec) + "\n")
    out_m.write(tokenizer.index_word[index] + "\n")

In [101]:
!cat password_v2_tf_metadata.tsv

a
1
e
2
0
i
o
n
r
9
3
s
l
8
7
5
4
6
t
u
d
m
c
y
h
b
k
g
f
p
v
j
w
z
q
x
A
_
.
E
S
M
R
N
L
I
D
T
O
B
C
K
H
G
P
F
-
U
Y
V
J
W
Q
Z
X
!
@
*
$
#
?
/
 
&
=
+
;
)
%
(
,
[
'
"
]
<
|
^
~
\
`
>
{
}
:


In [102]:
!cat password_v2_tf_vectors.tsv

-0.008528288	0.0039825924	-0.0005866526	0.0022191575	0.00076911063	0.0026407829	-0.00092557	0.084372975	0.0017554356	0.0045547136	-0.17291546	-0.00050135277	0.031665996	0.05151941	-0.007490787	-0.023546351	-0.15494853	0.09759661	0.48110893	0.0013809372	-0.03089805	-1.9771855e-05	0.0013423989	-0.20149146	0.14285758	0.01925549	-1.4148129e-05	-0.050638348	0.0025829826	-0.004955405	-0.058886282	-0.005406934	0.013989858	0.112475425	-0.09348821	-0.082602076	0.012346901	-0.28228277	-0.0046060197	-0.011865445	-0.01677128	0.122016385	0.0004148718	0.14815809	-0.05289794	-0.00454685	-0.08678927	0.16361141	-0.016531873	0.072684094	-0.0018177874	-0.0077116955	0.013167575	-0.028214399	-0.052988812	-0.26735684	0.005758425	-0.008617277	0.025361579	0.0869156	-0.1560898	-0.09912373	0.0003499788	-0.0061299424	-0.0010027254	0.20870434	0.0037770227	0.6751217	0.17393513	0.019061195	-0.17407806	-0.06483	-0.001326852	0.011136352	-0.0032978358	-0.04375746	-0.018904652	0.049524754	-0.016168734	-0.0015113354	-0.