In [None]:
!pip install nltk



You should consider upgrading via the 'C:\Users\rajas\Anaconda3\python.exe -m pip install --upgrade pip' command.


In [None]:
import os
import time

import tensorflow as tf

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from nltk.translate.bleu_score import corpus_bleu, SmoothingFunction


In [None]:
# Read the whole password dump into memory; the context manager closes the
# file handle as soon as the read finishes.
with open('passwords_db.txt') as passwords_file:
    data = passwords_file.read()

In [None]:
len(data)

178313552

In [None]:
passwds = data.split("\n")

In [None]:
len(passwds)

18308617

# Vectorize the text

In [None]:
vocab = sorted(list(set(''.join(passwds))))

In [None]:
len(vocab)

95

In [None]:
char_indices = dict((c, i) for i, c in enumerate(vocab))
indices_char = dict((i, c) for i, c in enumerate(vocab))

In [None]:
max_len = max(passwds, key=len)

In [None]:
len(max_len)

50

In [None]:
print(f"Total number of passwords {len(passwds)}")
print(f"Passwords vocab size {len(vocab)}")
print(f"Max passwords length {len(max_len)}")

Total number of passwords 18308617
Passwords vocab size 95
Max passwords length 50


In [None]:
input_text = [p[:-1] for p in passwds]
target_text = [p[1:] for p in passwds]

In [None]:
print(f"{passwds[0]} {input_text[0]} {target_text[0]}")

12STEVEN 12STEVE 2STEVEN


In [None]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True, lower=False)
tokenizer.fit_on_texts(passwds)

In [None]:
input_tensor = tokenizer.texts_to_sequences(input_text)
input_tensor = tf.keras.preprocessing.sequence.pad_sequences(input_tensor, padding='post')

In [None]:
input_tensor.shape

(18308617, 49)

In [None]:
target_tensor = tokenizer.texts_to_sequences(target_text)
target_tensor = tf.keras.preprocessing.sequence.pad_sequences(target_tensor, padding='post')

In [None]:
target_tensor.shape

(18308617, 49)

In [None]:
[''.join(i.split()) for i in tokenizer.sequences_to_texts(input_tensor[:5])]

['12STEVE', 'pedroantunescaetan', 'nanloveken', 'cyvoe', '2011kai']

In [None]:
[''.join(i.split()) for i in tokenizer.sequences_to_texts(target_tensor[:5])]

['2STEVEN', 'edroantunescaetano', 'anlovekeng', 'yvoet', '011kaid']

In [None]:
len(tokenizer.word_index)

95

# Split data into Train and Validation

In [None]:
# Batch size
BATCH_SIZE = 32

# Buffer size to shuffle the dataset
BUFFER_SIZE = 10000

# Fixed seed so train/val/test membership is the same after a kernel restart;
# the notebook later reloads saved weights and evaluates them again.
SPLIT_SEED = 42

# Hold out 10% of the rows; the remaining 90% is the training set.
input_tensor_train, input_tensor_rem, target_tensor_train, target_tensor_rem = train_test_split(
    input_tensor, target_tensor, test_size=0.1, shuffle=True, random_state=SPLIT_SEED)

train_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train))
train_dataset = train_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

# Split the held-out rows 50/50 into validation and test (5% of all rows each).
input_tensor_val, input_tensor_test, target_tensor_val, target_tensor_test = train_test_split(
    input_tensor_rem, target_tensor_rem, test_size=0.5, random_state=SPLIT_SEED)

val_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_val, target_tensor_val))
val_dataset = val_dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)

test_dataset = tf.data.Dataset.from_tensor_slices((input_tensor_test, target_tensor_test))
test_dataset = test_dataset.batch(BATCH_SIZE, drop_remainder=True)

In [None]:
# For performance
AUTOTUNE = tf.data.AUTOTUNE

# No .cache() here: the datasets are built from arrays that already live in
# memory, and caching *after* shuffle()/batch() would record the first epoch's
# order so that every later epoch replays exactly the same batches.
train_dataset = train_dataset.prefetch(buffer_size=AUTOTUNE)
val_dataset = val_dataset.prefetch(buffer_size=AUTOTUNE)
test_dataset = test_dataset.prefetch(buffer_size=AUTOTUNE)

In [None]:
example_input_batch, example_target_batch = next(iter(train_dataset))
example_input_batch.shape, example_target_batch.shape

(TensorShape([32, 49]), TensorShape([32, 49]))

In [None]:
# One extra slot on top of the tokenizer's entries: the padded tensors also
# contain the value 0, which the tokenizer does not assign to any character.
vocab_size = len(tokenizer.word_index) + 1

# Padded sequence lengths (second axis of the padded tensors).
max_length_input = input_tensor.shape[1]
max_length_output = target_tensor.shape[1]

embedding_dim = vocab_size
rnn_units = 256 # was 1024

print(f'Vocab size {vocab_size}')
print(f"Max input length {max_length_input}")
print(f"Max output length {max_length_output}")

Vocab size 96
Max input length 49
Max input length 49


# Create Model

In [None]:
class MyModel(tf.keras.Model):
  """Character-level next-token model: Embedding -> GRU -> Dense logits.

  Args:
    vocab_size: number of token ids; used as the embedding input size and as
      the width of the output logits.
    embedding_dim: size of each embedding vector.
    rnn_units: number of GRU units.
  """

  def __init__(self, vocab_size, embedding_dim, rnn_units):
    # Model.__init__ must not receive the instance as an extra positional
    # argument (the previous `super().__init__(self)` did exactly that).
    super().__init__()
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    """Map a batch of token ids to per-position logits over the vocabulary.

    Args:
      inputs: batch of token-id sequences.
      states: GRU state to resume from; when None the GRU's initial state is used.
      return_state: when True, also return the final GRU state.
      training: forwarded to every layer.

    Returns:
      The logits, or a `(logits, states)` tuple when `return_state` is True.
    """
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

In [None]:
model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
# try model without training
for input_example_batch, target_example_batch in train_dataset.take(1):
    example_batch_predictions = model(input_example_batch)
    print(example_batch_predictions.shape, "# (batch_size, sequence_length, vocab_size)")

(32, 49, 96) # (batch_size, sequence_length, vocab_size)


In [None]:
model.summary()

Model: "my_model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      multiple                  9216      
_________________________________________________________________
gru_1 (GRU)                  multiple                  271872    
_________________________________________________________________
dense_1 (Dense)              multiple                  24672     
Total params: 305,760
Trainable params: 305,760
Non-trainable params: 0
_________________________________________________________________


In [None]:
input_example_batch

<tf.Tensor: shape=(32, 49), dtype=int32, numpy=
array([[10,  7, 11, ...,  0,  0,  0],
       [26,  2, 26, ...,  0,  0,  0],
       [11, 15,  1, ...,  0,  0,  0],
       ...,
       [ 2, 10, 20, ...,  0,  0,  0],
       [ 1, 20,  5, ...,  0,  0,  0],
       [10,  1, 23, ...,  0,  0,  0]])>

In [None]:
tf.random.categorical(example_batch_predictions[0], num_samples=1)

<tf.Tensor: shape=(49, 1), dtype=int64, numpy=
array([[52],
       [85],
       [94],
       [35],
       [ 3],
       [61],
       [46],
       [85],
       [43],
       [65],
       [24],
       [53],
       [14],
       [68],
       [ 8],
       [73],
       [48],
       [45],
       [15],
       [27],
       [10],
       [ 1],
       [14],
       [ 9],
       [34],
       [94],
       [18],
       [18],
       [23],
       [59],
       [80],
       [82],
       [67],
       [49],
       [ 9],
       [48],
       [86],
       [49],
       [92],
       [45],
       [79],
       [76],
       [32],
       [93],
       [90],
       [78],
       [87],
       [67],
       [66]], dtype=int64)>

In [None]:
sampled_indices = tf.random.categorical(example_batch_predictions[0], num_samples=1)
sampled_indices = tf.squeeze(sampled_indices, axis=-1).numpy()

In [None]:
[tf.squeeze(tf.random.categorical(pred, num_samples=1), axis=-1).numpy() for pred in example_batch_predictions] 

[array([ 1, 47, 68, 38, 91, 88,  7, 48, 14, 15, 22, 45, 80, 77, 38, 66,  0,
         1, 37, 92, 68, 68, 49, 74, 75, 22, 75, 20, 93, 50, 30,  8, 36,  2,
        81, 65, 26, 17,  8, 28, 68, 92, 29, 75, 73, 35, 68, 41, 39],
       dtype=int64),
 array([69, 94, 36, 56,  3, 44, 46, 88, 65, 60,  9, 18, 93,  2, 49, 40, 47,
        10, 39, 45, 39, 48, 19, 42, 89, 46,  8, 51, 75,  9, 29, 12, 67, 37,
        11,  9, 37, 81, 89, 15, 95, 68, 23, 17, 78, 47, 26, 79, 30],
       dtype=int64),
 array([80, 23, 76, 25, 59, 56, 23,  2, 20, 42, 19, 91, 33, 56, 84, 42, 29,
        51, 54,  9,  4, 65, 47, 83, 28, 82, 77, 14,  1, 84,  7, 10, 90,  5,
        76,  7,  6, 34, 95, 82,  4, 81, 51, 94, 89, 84, 82,  1, 76],
       dtype=int64),
 array([76, 82, 34, 72, 35,  4, 14, 52, 36, 73, 58, 59, 95, 90, 23, 54, 18,
        93, 93, 32, 53, 90, 89, 84, 86, 59, 25, 23, 38, 33, 63, 20, 31, 95,
         5, 21, 40, 52, 32, 11, 59, 72, 78,  2, 69, 41,  1, 32, 65],
       dtype=int64),
 array([20, 13, 32,  2, 67, 84, 

In [None]:
tokenizer.sequences_to_texts([tf.squeeze(tf.random.categorical(pred, num_samples=1), axis=-1).numpy() for pred in example_batch_predictions] )

['N ( i 7 | l A : k k Y 2 D % { + } _ * p 6 R 2 / 4 ! c * B A r + w + f R ! O , M Y |   ~ $ O 7 (',
 '? . g p a & L A 0 w g a M 0 d Q - > j P x   o L m 9 E > y a 6 J K Y % [ { b i C 1 P J I 5 X 3 D 4',
 'q f 3 u a } O = B ? : 6 T Q _ W U X / b % 1 m > ; ~ 9 m & V T r q N z ) L 2 v v } f X L Y S 9 w K',
 'L & # h v s n ] c i + # b k W , K c / E r | k T ] k 6 l , : ? G C / P v | _ + : , v @ v C d & ? -',
 '5 [ I t p | ` # 4 c 3 G d l + " - % ! w y 0 c ~ 1 0 I R h { # U - { = v K _ 7 ? 6 # 7 ) ^ < p',
 'X Z M T { I l b g ~ Z 8 ` r h , c s 7 1 * < g d ? y Q c E * i T % 6 n K Z ! Z - \\ , d 3 4 X _ J',
 'E k k U M p Y W < . Q [ J R L 0 S V l $ h | E O s K G 9 : / = 2 { 1 ) r S O M } X @   - k ) N v',
 'n : a d s > X h x H " Q ; c Q W F # k } J c + t 4 ~ A 6 " \' X : \\ : [ Z = s q m & e 5 G < z ] T S',
 'E D A n j U 0 C & ~ 5 v * ) K f T - 0 3 i 1 { 3 z B 7 , i ) Q 1 h M 4 @ : s z ) b t 4 ) ~ f 1 r',
 'b i P ) p p A < K H Y S , % . < $ e + P V = m 3 $ n / z d m K 6 P = ` 6 d c d . C C 9 I $

In [None]:
sampled_indices

array([82,  3, 87, 39, 55, 55, 71, 30, 70, 77, 59, 12, 63, 19,  6, 57, 90,
       68,  7, 53, 66, 33, 77, 69,  1, 16, 11, 82,  1, 81, 42, 12,  9, 94,
       26, 73, 26, 69, 80, 28, 15, 80, 85, 36, 91, 32, 72,  6, 34],
      dtype=int64)

In [None]:
input_example_batch[0].numpy()

array([10,  7, 11, 28,  1,  8, 11, 15,  2,  9, 11,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0])

In [None]:
print(f"Input: {[''.join(i.split()) for i in tokenizer.sequences_to_texts([input_example_batch[0].numpy()]) ]}")
print(f"next char prediction : {[''.join(i.split()) for i in tokenizer.sequences_to_texts([sampled_indices]) ]}")

Input: ['losgansters']
next char prediction : ['%1[OGG/jQ?!9*72_`XoKw?#a4s%a\'L9r}b,b#(gt(<E"v$2z']


# Train Model

In [None]:
loss = tf.losses.SparseCategoricalCrossentropy(from_logits=True)

In [None]:
example_batch_mean_loss = loss(target_example_batch, example_batch_predictions)
print("Prediction shape: ", example_batch_predictions.shape, " # (batch_size, sequence_length, vocab_size)")
print("Mean loss:        ", example_batch_mean_loss)

Prediction shape:  (32, 49, 96)  # (batch_size, sequence_length, vocab_size)
Mean loss:         tf.Tensor(4.567118, shape=(), dtype=float32)


In [None]:
tf.exp(example_batch_mean_loss).numpy()

96.26628

In [None]:
input_example_batch.numpy()

array([[10,  7, 11, ...,  0,  0,  0],
       [26,  2, 26, ...,  0,  0,  0],
       [11, 15,  1, ...,  0,  0,  0],
       ...,
       [ 2, 10, 20, ...,  0,  0,  0],
       [ 1, 20,  5, ...,  0,  0,  0],
       [10,  1, 23, ...,  0,  0,  0]])

In [None]:
tokenizer.sequences_to_texts(input_example_batch.numpy())

['l o s g a n s t e r s',
 'b e b o p',
 's t a c y 3',
 'r e w e k i',
 'g o o d d s',
 '5 6 5 7 8 8',
 'l a r e i n a d e p e r s i',
 '1 1 1 3 5 4 a b',
 'n h 1 1 2',
 'b y z r t y',
 'm i l l o n a',
 't e r r e 2',
 'r o b b y r o b',
 '5 0 0 f r e e s t y l',
 '2 3 4 0 5 5',
 'p o l i t h e',
 'd a r a n e e j a',
 'p l a t o o n s e r g e a n',
 'l a f l o u f',
 'c l i i t',
 'p r o v e n c e 1',
 'k g x c',
 'a l e x 6 4 1',
 '7 8 1 k 0 8 m 7 4 5 6',
 'T M O E 1 6',
 '3 0 5 9 8 4 1 1 5 9',
 'z a t i',
 'A m o s 0',
 '0 8 1 9 3 9 6 1 6',
 'e l m o 8 0',
 'a m i n g h o',
 'l a u r i t a 0 6 0']

In [None]:
[p.split() for p in tokenizer.sequences_to_texts(input_example_batch.numpy())]

[['l', 'o', 's', 'g', 'a', 'n', 's', 't', 'e', 'r', 's'],
 ['b', 'e', 'b', 'o', 'p'],
 ['s', 't', 'a', 'c', 'y', '3'],
 ['r', 'e', 'w', 'e', 'k', 'i'],
 ['g', 'o', 'o', 'd', 'd', 's'],
 ['5', '6', '5', '7', '8', '8'],
 ['l', 'a', 'r', 'e', 'i', 'n', 'a', 'd', 'e', 'p', 'e', 'r', 's', 'i'],
 ['1', '1', '1', '3', '5', '4', 'a', 'b'],
 ['n', 'h', '1', '1', '2'],
 ['b', 'y', 'z', 'r', 't', 'y'],
 ['m', 'i', 'l', 'l', 'o', 'n', 'a'],
 ['t', 'e', 'r', 'r', 'e', '2'],
 ['r', 'o', 'b', 'b', 'y', 'r', 'o', 'b'],
 ['5', '0', '0', 'f', 'r', 'e', 'e', 's', 't', 'y', 'l'],
 ['2', '3', '4', '0', '5', '5'],
 ['p', 'o', 'l', 'i', 't', 'h', 'e'],
 ['d', 'a', 'r', 'a', 'n', 'e', 'e', 'j', 'a'],
 ['p', 'l', 'a', 't', 'o', 'o', 'n', 's', 'e', 'r', 'g', 'e', 'a', 'n'],
 ['l', 'a', 'f', 'l', 'o', 'u', 'f'],
 ['c', 'l', 'i', 'i', 't'],
 ['p', 'r', 'o', 'v', 'e', 'n', 'c', 'e', '1'],
 ['k', 'g', 'x', 'c'],
 ['a', 'l', 'e', 'x', '6', '4', '1'],
 ['7', '8', '1', 'k', '0', '8', 'm', '7', '4', '5', '6'],
 ['T', '

In [None]:
ref = ['a','b','c']
hyp = ['a','b','d']
corpus_bleu(ref, hyp, weights=[0.25])

0.9036020036098448

In [None]:
# if all are equal
ref = [['a', 'b', 'c'], ['d','e'], ['f'], [' ']]
hyp = [['a', 'b', 'c'], ['d','e'], ['f'], [' ']]
corpus_bleu(ref, hyp, weights=[0.25])

1.0

In [None]:
# if some are equal
ref = [['a', 'b', 'c'], ['d','e'], ['f'], [' ']]
hyp = [['a', 'b', 'f'], ['d','e'], ['f'], [' ']]
corpus_bleu(ref, hyp, weights=[0.25])

0.9621954581957615

In [None]:
# if none are equal
ref = [['a', 'b', 'c'], ['d','e'], ['f'], [' ']]
hyp = [['p', 'q', 'r'], ['s','t'], ['u'], ['w']]
corpus_bleu(ref, hyp, weights=[0.25])

0

In [None]:
def bleu_score(y_true, y_pred, weights=(0.25,)):
  """Corpus-level BLEU between target id sequences and sequences sampled from logits.

  Args:
    y_true: batch of target token ids, one row per sequence (needs `.numpy()`).
    y_pred: per-sequence logits; one id per position is *sampled* with
      `tf.random.categorical`, so repeated calls give different scores.
    weights: n-gram weights forwarded to `corpus_bleu`. The default keeps the
      value this notebook has always used.
      NOTE(review): a single weight of 0.25 looks like it scales the unigram
      term rather than giving plain unigram BLEU (`weights=(1.0,)`) - confirm
      which one is intended.

  Returns:
    The score returned by `nltk.translate.bleu_score.corpus_bleu`.
  """
  index_word = tokenizer.index_word

  def to_tokens(ids):
    # Ids with no tokenizer entry (the 0 used for padding) are dropped. Tokens
    # are looked up directly rather than via a join/split round trip, so a
    # literal space character in a password survives as a token.
    tokens = [index_word[int(i)] for i in ids if int(i) in index_word]
    return tokens if tokens else [' ']

  # corpus_bleu expects, for every hypothesis, a *list* of reference token
  # lists - hence the extra pair of brackets around each single reference.
  references = [[to_tokens(seq)] for seq in y_true.numpy()]
  sampled = [tf.squeeze(tf.random.categorical(pred, num_samples=1), axis=-1).numpy() for pred in y_pred]
  hypotheses = [to_tokens(seq) for seq in sampled]
  return corpus_bleu(references, hypotheses, weights=weights)

In [None]:
bleu_score(input_example_batch, example_batch_predictions)

0.48924102131677855

In [None]:
# GPU
model.compile(optimizer='adam', loss=loss)

# BLEU score on test dataset before training

In [None]:
# Mean per-batch BLEU of a model over a dataset
def calculate_bleu_score(model, dataset):
  """Average `bleu_score` over every (inputs, targets) batch of `dataset`.

  Args:
    model: callable that maps a batch of inputs to predictions.
    dataset: iterable of `(inputs, targets)` pairs.

  Returns:
    `np.average` of the per-batch scores.
  """
  per_batch = [
      bleu_score(targets, model(inputs))
      for inputs, targets in dataset
  ]
  return np.average(per_batch)

In [None]:
calculate_bleu_score(model, test_dataset)

0.478942012748074

# Configure Checkpoints

In [None]:
# Directory where the checkpoints will be saved
checkpoint_dir = './training_checkpoints_gru'
# Name of the checkpoint files
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt_{epoch}")

earlystopping_cb = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    mode = 'min',
    verbose = 1,
    patience = 3,
    restore_best_weights = True
)

checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_prefix,
    monitor = 'val_loss',
    mode = 'min',
    save_best_only=True,
    verbose = 1)

In [None]:
EPOCHS = 4

history = model.fit(train_dataset, 
                    validation_data=val_dataset, 
                    epochs=EPOCHS, 
                    callbacks=[checkpoint_callback, earlystopping_cb])

Epoch 1/4

Epoch 00001: val_loss improved from inf to 0.41141, saving model to ./training_checkpoints_gru\ckpt_1




INFO:tensorflow:Assets written to: ./training_checkpoints_gru\ckpt_1\assets


INFO:tensorflow:Assets written to: ./training_checkpoints_gru\ckpt_1\assets


Epoch 2/4

Epoch 00002: val_loss improved from 0.41141 to 0.41006, saving model to ./training_checkpoints_gru\ckpt_2




INFO:tensorflow:Assets written to: ./training_checkpoints_gru\ckpt_2\assets


INFO:tensorflow:Assets written to: ./training_checkpoints_gru\ckpt_2\assets


Epoch 3/4

Epoch 00003: val_loss improved from 0.41006 to 0.40927, saving model to ./training_checkpoints_gru\ckpt_3




INFO:tensorflow:Assets written to: ./training_checkpoints_gru\ckpt_3\assets


INFO:tensorflow:Assets written to: ./training_checkpoints_gru\ckpt_3\assets


Epoch 4/4

Epoch 00004: val_loss improved from 0.40927 to 0.40915, saving model to ./training_checkpoints_gru\ckpt_4




INFO:tensorflow:Assets written to: ./training_checkpoints_gru\ckpt_4\assets


INFO:tensorflow:Assets written to: ./training_checkpoints_gru\ckpt_4\assets


In [None]:
model.save('pass_saved_model/password')



INFO:tensorflow:Assets written to: pass_saved_model/password\assets


INFO:tensorflow:Assets written to: pass_saved_model/password\assets


In [None]:
calculate_bleu_score(model, test_dataset)

0.7467063434858601

In [None]:
model.save_weights("pass_tf_weights")

# Inference

In [None]:
class OneStep(tf.keras.Model):
  """Wraps a trained model to sample one next character per input string.

  Args:
    model: model called as `model(inputs=..., states=..., return_state=True)`
      and returning `(logits, states)`.
    tokenizer: tokenizer used to encode the input strings and decode sampled ids.
    temperature: logits are divided by this value before sampling.
  """

  def __init__(self, model, tokenizer, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.tokenizer = tokenizer

  # Not wrapped in @tf.function: the tokenizer calls and `.numpy()` below run
  # as ordinary Python.
  def generate_one_step(self, input_chars, states=None):
    """Sample the next character for each string in `input_chars`.

    Args:
      input_chars: list of strings, one per batch row.
      states: model state returned by the previous call, or None to start fresh.

    Returns:
      `(predicted_chars, states)`: a list with one decoded string per batch
      row, and the model state to pass to the next call.
    """
    # Convert strings to token IDs.
    input_ids = self.tokenizer.texts_to_sequences(input_chars)
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding='post')

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters, one single-id sequence per batch
    # row. (Wrapping all ids in one sequence would decode a batch of several
    # rows into a single space-joined string.)
    predicted_chars = self.tokenizer.sequences_to_texts(
        [[token_id] for token_id in predicted_ids.numpy()])

    # Return the characters and model state.
    return predicted_chars, states

In [None]:
one_step_model = OneStep(model, tokenizer)

In [None]:
start = time.time()
states = None
next_char = ['w']
result = [next_char]

for n in range(2):
  next_char, states = one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

wil 

________________________________________________________________________________

Run time: 0.04679465293884277


In [None]:
def pwds_from_chars(start_char, len):
  """Generate one password of `len` characters beginning with `start_char`.

  Uses the notebook-level `one_step_model`: each sampled character is fed back
  in, together with the returned state, to sample the next one.

  Args:
    start_char: string used as the first character.
    len: total number of characters, including `start_char`.

  Returns:
    The generated password as a `str`.
  """
  pieces = [[start_char]]
  hidden = None

  for _ in range(len - 1):
    # The most recently produced piece is the input for the next step.
    sampled, hidden = one_step_model.generate_one_step(pieces[-1], states=hidden)
    pieces.append(sampled)

  joined = tf.strings.join(pieces)
  return joined[0].numpy().decode('utf-8')

In [None]:
all_pos = []
while True:
  pred = pwds_from_chars('w', 3)
  if pred not in all_pos:
    all_pos.append(pred)
  else:
    break

print(all_pos)

['whn', 'was', 'wss', 'wil', 'wac', 'wob', 'who', 'wer', 'we6', 'wha', 'wes']


In [None]:
given_pass = 'Password1'
attempts = 1

while True:
  pred = pwds_from_chars(given_pass[0], len(given_pass))
  print(f"{attempts} - {pred}")
  if pred == given_pass:
    break
  attempts += 1

print(f"Model took {attempts} attempts to find password - {given_pass}")

1 - PANDOWCOM
2 - Politim12
3 - P1Z5Z36Q1
4 - Phils#312
5 - Peneme160
6 - PaLiVasda
7 - Panisabas
8 - Pedrostin
9 - PRESO514#
10 - PersiaM80
11 - PALOPER55
12 - PAGAUNECI
13 - Pom#2lu80
14 - Papa62873
15 - PONDBOOZZ
16 - PecKhouse
17 - Peec65alm
18 - POODSONEK
19 - PORNILLOS
20 - Pinoyano7
21 - Pamk30000
22 - Padololin
23 - PpA!2002C
24 - Pgc716558
25 - Pordot001
26 - PPLTPWH12
27 - Pirudabon
28 - PSRSS0826
29 - Planet!!!
30 - PRESUNCEC
31 - PINNIEZOV
32 - P08148576
33 - P4oygHdr2
34 - PROWAR123
35 - PATITO101
36 - PIGGLESTE
37 - PUPPYJUN8
38 - Password0
39 - PACHISSEN
40 - P0Dd,m1xX
41 - PARIS1006
42 - Pimpgirl4
43 - PELILLOW9
44 - PAPALITA1
45 - POTTY1135
46 - PUNDY2004
47 - Peace20<1
48 - PAWL04196
49 - PALIDAZER
50 - Paal0pPee
51 - Picalomez
52 - PTR335280
53 - Pietc2Rad
54 - Puppycut1
55 - PANCHi129
56 - Papa99999
57 - PENN555LE
58 - Poohbae14
59 - PICHI9404
60 - PR6WLOWLI
61 - PALTAWNX4
62 - Pnoonet2@
63 - Plavistu1
64 - PSIMOS157
65 - PARADIsel
66 - PiCogWapi
67 - Phombobob
68 -

# Attempts calculation

In [None]:
import random

def gen_passwords(num, pass_len=5, seq_len=3, start_char=None):
  """Generate `num` passwords with the model via `pwds_from_chars`.

  Args:
    num: number of passwords to generate.
    pass_len: length of every password.
    seq_len: unused; kept so existing calls keep working.
    start_char: first character of every password. When None, a fresh first
      character is drawn uniformly from the vocabulary for each password.

  Returns:
    List of `num` generated password strings.
  """
  # indices_char is keyed 0..len-1, so every key must be eligible. The old
  # randint(1, len - 1) could never select the character stored at index 0.
  alphabet = [indices_char[i] for i in range(len(indices_char))]
  passwords = []
  for _ in range(num):
    first = random.choice(alphabet) if start_char is None else start_char
    passwords.append(pwds_from_chars(first, pass_len))
  return passwords

In [None]:
gen_pass = gen_passwords(100, pass_len=5)

In [None]:
gen_pass

['Ujp23',
 'BHETT',
 '}damn',
 '}{754',
 '@bubb',
 '+shoe',
 'I084d',
 'LA541',
 'giVe1',
 ']gd]h',
 '`ybeb',
 'GALIT',
 'zephu',
 'venot',
 'han03',
 ']_guy',
 'dame8',
 'AKTAS',
 'irli3',
 'G@$be',
 'BADAN',
 'wal44',
 'I<34,',
 'NANA4',
 'dange',
 '>pxy#',
 'quaqi',
 '`"lov',
 '}X9J9',
 'venda',
 'uture',
 '}{{DU',
 'escol',
 '13613',
 'd0013',
 'IN-DA',
 'ilove',
 '%bram',
 ')+???',
 'PRIDE',
 '`kida',
 'hetes',
 '.mh-m',
 'darcs',
 '}GIll',
 'LUNAR',
 '~+bus',
 'Kayle',
 'VWkha',
 'rrami',
 '#0415',
 'qi77e',
 'Bandi',
 'BLUKY',
 'XLOCE',
 'XQTTA',
 '(octu',
 '!craz',
 'Paper',
 'cdfrs',
 'z21qz',
 'lillo',
 'Wild2',
 '/.B.N',
 'Memno',
 'crfvg',
 'Ginge',
 'Evwv3',
 'ian97',
 'flymj',
 'Respa',
 'Floru',
 '-1453',
 'katsa',
 'nathy',
 'GOREN',
 '8235a',
 '8luas',
 'WHOWL',
 '"1"20',
 '(&bra',
 '93010',
 'RAIEM',
 '&+#91',
 '09378',
 'yinlo',
 '#12ma',
 'Oscar',
 '@bitc',
 '-1423',
 '{Aloh',
 "]1',,",
 '~babe',
 'Titit',
 '?8803',
 'LOVEC',
 'HO0NY',
 'Naegi',
 'JULES',
 'Yharl']

In [None]:
import hashlib
import requests

def check_pwned(passes):
  """Look up passwords in the Pwned Passwords range API.

  For each password, the first 5 hex characters of its upper-cased SHA-1 digest
  are sent to the API; the returned `SUFFIX:COUNT` lines are searched for the
  remainder of the digest.

  Args:
    passes: iterable of password strings.

  Returns:
    Dict mapping each password that was found to the count string from its
    matching response line. Passwords whose request failed are reported with
    `print` and skipped.
  """
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)", 
    "Referer": "https://haveibeenpwned.com/"
  }
  found_passwds = {}
  pwned_api = 'https://api.pwnedpasswords.com/range/'
  for p in passes:
    pbHash = hashlib.sha1(p.encode()).hexdigest().upper()
    prefix, suffix = pbHash[:5], pbHash[5:]
    try:
      res = requests.get(pwned_api + prefix,  headers=headers, timeout=10)
      # Treat HTTP error statuses as failures instead of parsing an error page.
      res.raise_for_status()
    except requests.RequestException as e:
      # Best effort: report the real cause (not only timeouts) and move on.
      print(f'request failed for pass {p}: {e}')
      continue
    for line in res.text.splitlines():
      candidate, sep, count = line.partition(':')
      # Lines without a ':' separator are ignored rather than raising.
      if sep and candidate == suffix:
        found_passwds[p] = count
        break
  return found_passwds

In [None]:
fnd_pass = check_pwned(gen_pass)

In [None]:
fnd_pass

{'GALIT': '21',
 'venot': '14',
 'han03': '9',
 'dame8': '7',
 'AKTAS': '3',
 'BADAN': '1',
 'NANA4': '10',
 'dange': '234',
 'venda': '628',
 'uture': '10',
 'escol': '55',
 '13613': '224',
 'd0013': '7',
 'ilove': '28149',
 'PRIDE': '95',
 'hetes': '50',
 'darcs': '8',
 'LUNAR': '50',
 'Kayle': '24',
 'rrami': '5',
 'Bandi': '87',
 'Paper': '167',
 'lillo': '3039',
 'Wild2': '29',
 'crfvg': '15',
 'Ginge': '56',
 'ian97': '14',
 'flymj': '1',
 '-1453': '8',
 'katsa': '44',
 'nathy': '980',
 'GOREN': '6',
 '93010': '181',
 '09378': '17',
 'yinlo': '2',
 'Oscar': '2044',
 'Titit': '18',
 'LOVEC': '11',
 'JULES': '339'}

In [None]:
len(fnd_pass)

39

In [None]:
def gen_passwords_random(num, pass_len=5, start_char=None):
  """Generate `num` passwords by drawing characters uniformly from the vocabulary.

  This is the random baseline that the model's passwords are compared against.

  Args:
    num: number of passwords to generate.
    pass_len: length of every password, including the first character.
    start_char: first character of every password. When None, a fresh first
      character is drawn uniformly for each password.

  Returns:
    List of `num` password strings.
  """
  # indices_char is keyed 0..len-1, so every key must be eligible. The old
  # randint(1, len - 1) could never select the character stored at index 0.
  alphabet = [indices_char[i] for i in range(len(indices_char))]
  passwords = []
  for _ in range(num):
    first = random.choice(alphabet) if start_char is None else start_char
    tail = ''.join(random.choice(alphabet) for _ in range(pass_len - 1))
    passwords.append(first + tail)
  return passwords

In [None]:
gen_rand_pass = gen_passwords_random(100, pass_len=5)

In [None]:
gen_rand_pass

['?^>y4',
 'K(czS',
 'rv(Rp',
 'YR|1b',
 '<2nXx',
 '^x$J{',
 'h+kJc',
 '_3O,Z',
 's;I:8',
 '$Ed~)',
 '\\VIrf',
 'E-ga~',
 '^*6J`',
 '^t<;P',
 '(_[l|',
 '-YqUS',
 'p#Mb?',
 "r'6Id",
 '_B6hH',
 '?Iv8w',
 '8?<oK',
 'ezfHK',
 'K{hfA',
 'cs41&',
 '))A*E',
 'w]V(+',
 'GOR-*',
 '%/}Yl',
 'a|x*}',
 "#0''m",
 'S!@#d',
 'A@29.',
 '9";D(',
 'mcwHw',
 'wS+Mq',
 'T\\3Gq',
 ':kYSK',
 'Js![!',
 '=x?r;',
 'tU<$<',
 'zcynl',
 'A09/|',
 'PwRLT',
 '{YD%o',
 'z!:M)',
 "'w)c=",
 'NsbQ`',
 '5}+!B',
 '++]9N',
 's=5#~',
 'S"E[`',
 "k'^*m",
 '78yv!',
 'k)5S>',
 '!-lr>',
 '\\5PGk',
 '>J`mz',
 '.o[dy',
 'OJvK6',
 'S`_U^',
 'O}gbQ',
 'XP/cV',
 '%(ZgI',
 'q0Oh+',
 '3_[[,',
 'k2M"@',
 '}/p<n',
 'OPy4C',
 'Y@FHg',
 'ziW7Q',
 'JuZKM',
 '|1vd;',
 'lmp2m',
 'h}9,O',
 ':ADe^',
 '9%FI_',
 'A*`j^',
 'DBZ;|',
 '\\`a5e',
 '!_1Nk',
 'zN~H{',
 'c<X>/',
 'WT+q3',
 'HL:&V',
 'OMynS',
 '^mr)L',
 ')sq3R',
 'uyB[+',
 'e55.o',
 '\\%24b',
 'u~(v1',
 '`nJau',
 'lM2<;',
 'm#yM;',
 '5lV6x',
 'sKE<C',
 'EQ`@I',
 '{TB{-',
 'uz%3;',
 '?[V

In [None]:
fnd_pass = check_pwned(gen_rand_pass)

In [None]:
fnd_pass

{}

# load and test model

In [None]:
new_model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [None]:
new_model.load_weights('pass_tf_weights')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x2221dc7f148>

In [None]:
new_one_step_model = OneStep(new_model, tokenizer)

In [None]:
start = time.time()
states = None
next_char = ['w']
result = [next_char]

for n in range(2):
  next_char, states = new_one_step_model.generate_one_step(next_char, states=states)
  result.append(next_char)

result = tf.strings.join(result)
end = time.time()
print(result[0].numpy().decode('utf-8'), '\n\n' + '_'*80)
print('\nRun time:', end - start)

whk 

________________________________________________________________________________

Run time: 0.06894755363464355


In [None]:
def pwds_from_chars(model, start_char, len):
  """Generate one password of `len` characters beginning with `start_char`.

  Each sampled character is fed back into `model.generate_one_step`, together
  with the returned state, to sample the next one.

  Args:
    model: object exposing `generate_one_step(chars, states=...)`.
    start_char: string used as the first character.
    len: total number of characters, including `start_char`.

  Returns:
    The generated password as a `str`.
  """
  pieces = [[start_char]]
  hidden = None

  for _ in range(len - 1):
    # The most recently produced piece is the input for the next step.
    sampled, hidden = model.generate_one_step(pieces[-1], states=hidden)
    pieces.append(sampled)

  joined = tf.strings.join(pieces)
  return joined[0].numpy().decode('utf-8')

In [None]:
given_pass = 'Password1'
attempts = 1

while True:
  pred = pwds_from_chars(new_one_step_model,given_pass[0], len(given_pass))
  print(f"{attempts} - {pred}")
  if pred == given_pass:
    break
  attempts += 1

print(f"Model took {attempts} attempts to find password - {given_pass}")

1 - PANNEDSBR
2 - PHU18GENS
3 - PIT301020
4 - PAWWAS112
5 - PURPLE881
6 - PAIMEGANN
7 - PLSISTERA
8 - PMSME1027
9 - PAUDO2295
10 - PANgcua10
11 - PC3006840
12 - PSmATPvA1
13 - Prehanier
14 - PICE11229
15 - PET205LOB
16 - Palesitah
17 - Pzt634orn
18 - Poppydurm
19 - PRIMO9910
20 - PRILSROCK
21 - PVqVEU258
22 - PREETYS09
23 - PINKDOM10
24 - Ptr130020
25 - Punk319#1
26 - Pink78!!!
27 - Pritty,-3
28 - PITDOGYJE
29 - PvjitfhF,
30 - PissybRat
31 - P72610247
32 - Plomasmai
33 - Pa2ton27*
34 - PEN3LOVEL
35 - PERRWELIT
36 - PINTer118
37 - Pashspenc
38 - Purple4ev
39 - PWPSTEST1
40 - PiMp90!@%
41 - PRODUCTUM
42 - Pongatag9
43 - PHILLIPPO
44 - PROCEAN59
45 - PALOMAELE
46 - Papolouel
47 - PORTODY10
48 - Phatman01
49 - PHIRE.VEI
50 - Pescha789
51 - Pwtrutosu
52 - PIZZANO19
53 - Princesa2
54 - P07712245
55 - PENISMASL
56 - PEPESONGE
57 - Patskiand
58 - Perlka745
59 - POIONAMON
60 - Pinaces.1
61 - PUMANSO38
62 - P303lOTOs
63 - PASTUNJOV
64 - PJ3A11990
65 - PRINCESSL
66 - PooTer521
67 - PATRISITO
68 -

In [None]:
passwds[:10]

['12STEVEN',
 'pedroantunescaetano',
 'nanlovekeng',
 'cyvoet',
 '2011kaid',
 'ctcdolf',
 'bottan',
 'mipollito1',
 'jmkis2',
 'donuttop']

In [None]:
# Count how many passwords start with each character.
num_pass_w_char = {}

for p in passwds:
  # An empty line (e.g. produced by a trailing newline in the dump) has no
  # first character; skip it instead of failing on p[0].
  if not p:
    continue
  num_pass_w_char[p[0]] = num_pass_w_char.get(p[0], 0) + 1

In [None]:
num_pass_w_char

{'1': 823206,
 'p': 608522,
 'n': 423416,
 'c': 787150,
 '2': 521474,
 'b': 775680,
 'm': 1049670,
 'j': 665901,
 'd': 633793,
 'k': 568850,
 '9': 282543,
 'i': 346747,
 '7': 220206,
 'g': 404797,
 'B': 115107,
 'z': 130732,
 'v': 184468,
 'f': 370375,
 'h': 393561,
 '0': 1015982,
 'A': 119926,
 'y': 165067,
 'K': 73485,
 '8': 231216,
 't': 631618,
 'a': 913219,
 's': 1045484,
 'r': 517768,
 'l': 679391,
 'G': 59707,
 '5': 275344,
 'M': 144893,
 '6': 219532,
 '*': 23749,
 'D': 93139,
 '4': 287482,
 '.': 6830,
 'w': 252915,
 'H': 57762,
 'u': 85352,
 'O': 22780,
 'q': 64662,
 '"': 1184,
 'P': 82620,
 'S': 144318,
 'o': 174433,
 '#': 8665,
 'F': 52735,
 'x': 75198,
 'T': 91790,
 'L': 98330,
 'e': 365722,
 'E': 48011,
 'I': 46595,
 '3': 291181,
 'C': 106820,
 'R': 74293,
 'U': 12783,
 'J': 96716,
 'V': 25220,
 '@': 10417,
 'X': 9974,
 'Z': 16804,
 'N': 56403,
 '/': 1964,
 'Q': 11284,
 '_': 3909,
 'W': 33665,
 'Y': 20822,
 '<': 2359,
 '+': 2042,
 '{': 504,
 '!': 9348,
 '`': 1002,
 '$': 846

In [None]:
len(num_pass_w_char)

95

In [None]:
sum(num_pass_w_char.values())

18308617

In [None]:
len(passwds)

18308617

In [None]:
prob_pass_start_chars = dict([(k, num_pass_w_char[k]/len(passwds)) for k in num_pass_w_char])

In [None]:
sorted_num_pass_w_char = sorted(prob_pass_start_chars.items(), key=lambda x:x[1])

In [None]:
sorted_num_pass_w_char[-10:]

[('d', 0.03461719691880605),
 ('j', 0.03637090666105474),
 ('l', 0.037107718185376865),
 ('b', 0.04236693574397236),
 ('c', 0.04299341670646122),
 ('1', 0.04496276261609492),
 ('a', 0.04987919076574708),
 ('0', 0.05549201231310918),
 ('s', 0.05710338470677496),
 ('m', 0.057332020217583886)]

In [None]:
char_dist = dict(sorted_num_pass_w_char)

In [None]:
gen_rand_pass = gen_passwords_random(100, pass_len=5, start_char='m')

In [None]:
gen_rand_pass

['m!g~=',
 'mDxdX',
 'm39wP',
 'm8xzL',
 'mue9g',
 'm;t?u',
 'ml=}\\',
 'm;q&F',
 'mmTai',
 'mBaw8',
 'mO.w6',
 'm(kX)',
 'mz1qY',
 'm)kv3',
 'm)+tJ',
 'm6Xk4',
 'mqR!t',
 'm#=p9',
 'm\\)W6',
 'mRw\\N',
 'm!o$B',
 'mzy5N',
 'mG8m@',
 'mA+kv',
 'mJno&',
 'm6HNH',
 'mqPN{',
 'mEeM.',
 'm|)fO',
 'mU=g\\',
 'm#A}A',
 'ms3hF',
 'mPYrz',
 'ms<(Z',
 'm-evW',
 'm4E.G',
 'mp?rm',
 'mayaE',
 'm"^F`',
 'myZ?L',
 'm7UOo',
 'm1]/s',
 'mXo&~',
 "mU'0y",
 'mhB!<',
 "mH'7#",
 'mSYZe',
 'm54|O',
 "m&'Z=",
 'm5YWX',
 'm$&ao',
 'm2!yS',
 'm=G:i',
 'mJMST',
 'm6"Z4',
 'mD7ZI',
 'mFq[+',
 'm\\Z\\1',
 'mcHqd',
 'm)Fd=',
 'my%or',
 'm+`rZ',
 'm9U?$',
 'm\\wA$',
 'm{.Gy',
 'mFTA]',
 'mCwTI',
 'm(BIF',
 'm0,L:',
 'm^>y@',
 'mk3#\\',
 'mWW<8',
 'm<fVf',
 'me]nj',
 "m^g'l",
 'm[K3n',
 'mDmb.',
 'm_pQC',
 'mcV2y',
 'mwB@#',
 'm/<bn',
 'mkfXW',
 'm.CVr',
 'mffeM',
 'm0*=&',
 'mF>/1',
 'm(px\\',
 'm|=52',
 'mo]Y<',
 'm{kHT',
 'mPxQg',
 'mbA%D',
 'mL)ag',
 'm],7w',
 'mbkiq',
 'mR%66',
 'mBpsL',
 'm~=HX',
 'm}MA)',
 

In [None]:
fnd_pass = check_pwned(gen_rand_pass)

In [None]:
fnd_pass

{}

In [None]:
import random

def gen_passwords(model, num, pass_len=5, seq_len=3, start_char=None):
  """Generate `num` passwords with `model` via `pwds_from_chars`.

  Args:
    model: one-step generator forwarded to `pwds_from_chars`.
    num: number of passwords to generate.
    pass_len: length of every password.
    seq_len: unused; kept so existing calls keep working.
    start_char: first character of every password. When None, a fresh first
      character is drawn uniformly from the vocabulary for each password.

  Returns:
    List of `num` generated password strings.
  """
  # indices_char is keyed 0..len-1, so every key must be eligible. The old
  # randint(1, len - 1) could never select the character stored at index 0.
  alphabet = [indices_char[i] for i in range(len(indices_char))]
  passwords = []
  for _ in range(num):
    first = random.choice(alphabet) if start_char is None else start_char
    passwords.append(pwds_from_chars(model, first, pass_len))
  return passwords

In [None]:
random_preds = {}
model_preds = {}

total_num_pass = 100
pass_length = 5

for i in vocab:
  gen_rand_pass = gen_passwords_random(total_num_pass, pass_len=pass_length, start_char=i)
  random_preds[i] = len(check_pwned(gen_rand_pass))/total_num_pass
  gen_pass = gen_passwords(new_one_step_model, total_num_pass, pass_len=pass_length, start_char=i)
  model_preds[i] = len(check_pwned(gen_pass))/total_num_pass
  print(f"char '{i}' : char dist {char_dist[i]}, random password prob {random_preds[i]}, model password prob: {model_preds[i]}")

char ' ' : char dist 3.894341118174027e-05, random password prob 0.0, model password prob: 0.05
char '!' : char dist 0.0005105792534739243, random password prob 0.0, model password prob: 0.09
char '"' : char dist 6.466900257949577e-05, random password prob 0.0, model password prob: 0.02
char '#' : char dist 0.00047327441499267806, random password prob 0.0, model password prob: 0.05
char '$' : char dist 0.00046245983516941775, random password prob 0.0, model password prob: 0.09
char '%' : char dist 5.139656370549452e-05, random password prob 0.0, model password prob: 0.01
char '&' : char dist 5.2488945505823845e-05, random password prob 0.0, model password prob: 0.03
char ''' : char dist 2.5288638677623767e-05, random password prob 0.0, model password prob: 0.01
char '(' : char dist 0.0004297976193395711, random password prob 0.0, model password prob: 0.01
char ')' : char dist 2.4196256877294447e-05, random password prob 0.0, model password prob: 0.01
char '*' : char dist 0.001297148768

In [None]:
# One two-column frame per per-character dict; each gets a 'char' column to join on.
df1 = pd.DataFrame(list(char_dist.items()), columns=['char', 'dist'])
df2 = pd.DataFrame(list(random_preds.items()), columns=['char', 'rand_pred_prob'])
df3 = pd.DataFrame(list(model_preds.items()), columns=['char', 'model_pred_prob'])

In [None]:
# Attach the random and model hit rates to the character distribution, matching on 'char'.
rand_by_char = df2.set_index('char')
model_by_char = df3.set_index('char')
df4 = df1.join(rand_by_char, on='char').join(model_by_char, on='char')

In [None]:
df4.sort_values(by=['dist'], ascending=False)

Unnamed: 0,char,dist,rand_pred_prob,model_pred_prob
94,m,0.057332,0.00,0.77
93,s,0.057103,0.00,0.76
92,0,0.055492,0.00,0.90
91,a,0.049879,0.00,0.77
90,1,0.044963,0.01,0.71
89,c,0.042993,0.02,0.75
88,b,0.042367,0.01,0.79
87,l,0.037108,0.00,0.77
86,j,0.036371,0.01,0.71
85,d,0.034617,0.01,0.72


# Save embeddings

In [None]:
# Take the first weight array of the layer named 'embedding_1'; the next cell shows
# its shape is (96, 96).
# NOTE(review): confirm 'embedding_1' is a stable layer name for `model` on a fresh
# kernel — if the name is auto-generated it may differ between sessions.
embed_weights = model.get_layer('embedding_1').get_weights()[0]

In [None]:
embed_weights.shape

(96, 96)

In [None]:
len(tokenizer.word_index)

95

In [None]:
len(vocab)

95

In [None]:
embed_weights[1]

array([ 4.47295845e-01,  1.95193067e-02,  2.30765305e-02,  7.99807385e-02,
        2.22967882e-02,  2.36143614e-03,  4.18019891e-02,  1.42912026e-02,
       -6.22050371e-03,  1.90549597e-01,  1.46776140e-02, -5.95092066e-02,
       -1.42914979e-02,  3.78507487e-02,  6.44187070e-03,  1.32179549e-02,
        1.36095518e-02,  1.74970869e-02,  7.35729141e-03, -1.53447418e-02,
        1.98352919e-03, -9.61466506e-02, -1.02424277e-02, -1.05616990e-02,
        1.07361963e-02,  4.00884636e-02,  3.02073453e-02, -7.59140239e-04,
        1.06512941e-01,  6.18504034e-03,  8.40000883e-02, -3.30386050e-02,
        1.02686426e-02, -5.68289589e-03,  5.82615240e-03,  7.85903074e-03,
        3.64127606e-02, -1.18739665e-01, -3.43784019e-02, -1.36636188e-02,
       -4.93826345e-03, -6.08909037e-03, -7.99061079e-03, -7.60126999e-03,
        3.94845521e-03,  6.11208146e-03, -1.86935943e-02,  7.87161291e-03,
       -1.03353700e-02, -1.55544775e-02,  1.58636138e-01, -3.54893180e-03,
       -4.04362194e-03, -

In [None]:
import io

# Export the embedding matrix as two line-aligned TSV files: one row of tab-separated
# weights per token in the vectors file, and the matching token in the metadata file.
# The with-block closes both files even if a lookup or write fails part-way through.
with io.open('password_tf_vectors.tsv', 'w', encoding='utf-8') as out_v, \
     io.open('password_tf_metadata.tsv', 'w', encoding='utf-8') as out_m:
  # Start at 1: index 0 is padding and has no token to export.
  for index in range(1, len(tokenizer.index_word) + 1):
    vec = embed_weights[index]
    out_v.write('\t'.join(str(x) for x in vec) + "\n")
    out_m.write(tokenizer.index_word[index] + "\n")

In [None]:
!more password_tf_metadata.tsv

a
e
1
0
i
2
o
n
r
l
s
9
3
8
t
4
5
6
7
m
c
d
u
h
y
b
k
g
p
j
f
v
w
z
A
E
x
I
O
R
S
L
N
M
T
C
D
q
B
.
H
Y
K
U
G
P
_
J
!
-
F
@
*
V
W
 
Z
X
#
Q
/
$
,
+
&
\
?
)
=
(
'
%
;
]
<
~
[
:
^
`
"
>
{
}
|


In [None]:
!more password_tf_vectors.tsv

0.44729584      0.019519307     0.02307653      0.07998074      0.022296788     0.0023614361    0.04180199      0.014291203     -0.0062205037   0.1905496       0.014677614     -0.059509207    -0.014291498    0.03785075      0.0064418707    0.013217955     0.013609552     0.017497087     0.0073572914    -0.015344742    0.0019835292    -0.09614665     -0.010242428    -0.010561699    0.010736196     0.040088464     0.030207345     -0.00075914024  0.10651294      0.0061850403    0.08400009      -0.033038605    0.010268643     -0.005682896    0.0058261524    0.007859031     0.03641276      -0.118739665    -0.034378402    -0.013663619    -0.0049382634   -0.0060890904   -0.007990611    -0.00760127     0.003948455     0.0061120815    -0.018693594    0.007871613     -0.01033537     -0.015554477    0.15863614      -0.0035489318   -0.004043622    -0.114724025    0.011146712     -0.005201223    -0.079353824    0.00022193327   -0.0005136129   -0.008376479    -0.004572001    0.0024597698    0.031618