In [2]:
import tensorflow as tf

## Build Vocab

In [12]:
test_passwds = tf.io.gfile.GFile('passwd_db_test').read().split("\n")

In [13]:
tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='', char_level=True, lower=False)
tokenizer.fit_on_texts(test_passwds)

In [14]:
vocab_size = len(tokenizer.index_word) + 1

## Define Model

In [7]:
embedding_dim = vocab_size
rnn_units = 256 # was 1024

class MyModel(tf.keras.Model):
  def __init__(self, vocab_size, embedding_dim, rnn_units):
    super().__init__(self)
    self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
    self.gru = tf.keras.layers.GRU(rnn_units,
                                   return_sequences=True,
                                   return_state=True)
    self.dense = tf.keras.layers.Dense(vocab_size)

  def call(self, inputs, states=None, return_state=False, training=False):
    x = inputs
    x = self.embedding(x, training=training)
    if states is None:
      states = self.gru.get_initial_state(x)
    x, states = self.gru(x, initial_state=states, training=training)
    x = self.dense(x, training=training)

    if return_state:
      return x, states
    else:
      return x

## Load trained model weights

In [8]:
prediction_model = MyModel(
    vocab_size=vocab_size,
    embedding_dim=embedding_dim,
    rnn_units=rnn_units)

In [9]:
prediction_model.load_weights('gpu_password_gru')

<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f20a42c35b0>

## Methods to generate passwords using model

In [15]:
class OneStep(tf.keras.Model):
  def __init__(self, model, tokenizer, temperature=1.0):
    super().__init__()
    self.temperature = temperature
    self.model = model
    self.tokenizer = tokenizer

  #@tf.function
  def generate_one_step(self, input_chars, states=None):
    #import pdb; pdb.set_trace()
    # Convert strings to token IDs.
    # input_chars = tf.strings.unicode_split(inputs, 'UTF-8')
    if input_chars[0] == '':
      input_chars = ' '
    input_ids = self.tokenizer.texts_to_sequences(input_chars)
    #print(input_chars)
    input_ids = tf.keras.preprocessing.sequence.pad_sequences(input_ids, padding='post')

    # Run the model.
    # predicted_logits.shape is [batch, char, next_char_logits]
    predicted_logits, states = self.model(inputs=input_ids, states=states,
                                          return_state=True)
    # Only use the last prediction.
    predicted_logits = predicted_logits[:, -1, :]
    predicted_logits = predicted_logits/self.temperature

    # Sample the output logits to generate token IDs.
    predicted_ids = tf.random.categorical(predicted_logits, num_samples=1)
    predicted_ids = tf.squeeze(predicted_ids, axis=-1)

    # Convert from token ids to characters
    predicted_chars = self.tokenizer.sequences_to_texts([predicted_ids.numpy()])

    # Return the characters and model state.
    return predicted_chars, states

In [16]:
one_step_model = OneStep(prediction_model, tokenizer)

In [19]:
def pwds_from_chars(model, start_char, len):
  states = None
  next_char = [start_char]
  result = [next_char]

  for n in range(len-1):
    next_char, states = model.generate_one_step(next_char, states=states)
    result.append(next_char)

  result = tf.strings.join(result)
  return result[0].numpy().decode('utf-8')

In [20]:
def gen_passwords(model, num, pass_len=5, seq_len=3, start_char=None):
  passwords = []
  gen_char = False
  if start_char is None:
    gen_char = True
  for i in range(num):
    # if start char is none, then randomly pick start_char
    if gen_char:
      start_char = tokenizer.index_word[random.randint(1, len(tokenizer.index_word) - 1)]
    passwords.append(pwds_from_chars(model, start_char, pass_len))
  return passwords

In [50]:
def gen_passwords_not_in_training(model, num, pass_len=5, seq_len=3, start_char=None):
  passwords = []
  gen_char = False
  if start_char is None:
    gen_char = True
  i = 0
  while i <= num:
    # if start char is none, then randomly pick start_char
    if gen_char:
      start_char = tokenizer.index_word[random.randint(1, len(tokenizer.index_word) - 1)]
    password = pwds_from_chars(model, start_char, pass_len)
    if password not in top_10_passwds[start_char]:
        passwords.append(password)
        i += 1
  return passwords

In [62]:
import random

def gen_passwords_random(num, pass_len=5, start_char=None):
  passwords = []
  gen_char = False
  if start_char is None:
    gen_char = True
  for i in range(num):
    # if start char is none, then randomly pick start_char
    if gen_char:
      start_char = tokenizer.index_word[random.randint(1, len(tokenizer.index_word) - 1)]

    password = start_char
    for j in range(pass_len - 1):
      password += tokenizer.index_word[random.randint(1, len(tokenizer.index_word) - 1)]
    passwords.append(password)
  return passwords

## Get top 10 starting chars which have high password distribution

In [31]:
import json

f = open('passwd_dist.json')
data_dist = json.load(f)

total_passwds = 0
for k in data_dist:
    total_passwds += data_dist[k]
    
prob_pass_start_chars = dict([(k, data_dist[k]/total_passwds) for k in data_dist])

In [32]:
sorted_num_pass_w_char = sorted(prob_pass_start_chars.items(), key=lambda x:x[1])
sorted_num_pass_w_char[-10:]

[('k', 0.030112092511078912),
 ('t', 0.03019875298476047),
 ('l', 0.03200024618607084),
 ('d', 0.034974911596357544),
 ('c', 0.03615092217770743),
 ('b', 0.036440602438896755),
 ('m', 0.04673549531688035),
 ('a', 0.04689793719340077),
 ('1', 0.04736622677456893),
 ('s', 0.04979685375899136)]

In [34]:
top_10 = [i for i,j in sorted_num_pass_w_char[-10:]]

In [35]:
top_10

['k', 't', 'l', 'd', 'c', 'b', 'm', 'a', '1', 's']

## Grab all passwords starting with these chars with length 5

In [43]:
top_10_passwds = {}

for t in top_10:
    top_10_passwds[t] = !grep -E '^{t}....$' passwd_db_train

In [45]:
top_10_passwds['s'][:10]

['ssehm',
 'soxeZ',
 's-nix',
 'sh904',
 'ssmq4',
 'sZ7wk',
 'sonya',
 'sEzW1',
 'spb-0',
 'slid5']

## Generate passwords not in training set

In [51]:
gen_passwords_not_in_training(one_step_model, 4, pass_len=5, start_char='m')

['mad#r', 'mgpri', 'meirc', 'mywog', 'macra']

In [53]:
!grep -E '^macra$' passwd_db_train

## Method to check haveibeenpwned database

In [56]:
import hashlib
import requests

def check_pwned(passes):
  headers = {
    "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.2.8) Gecko/20100722 Firefox/3.6.8 GTB7.1 (.NET CLR 3.5.30729)", 
    "Referer": "https://haveibeenpwned.com/"
  }
  found_passwds = {}
  pwned_api = 'https://api.pwnedpasswords.com/range/'
  for p in passes:
    hash_object = hashlib.sha1(p.encode())
    pbHash = hash_object.hexdigest().upper()
    try:
      res = requests.get(pwned_api + pbHash[:5],  headers=headers, timeout=10)
      range_hashes = res.text.split('\r\n')
      for h in range_hashes:
        h_c = h.split(':')
        if h_c[0] == pbHash[5:]:
          found_passwds[p] = h_c[1]
    except Exception as e:
      print(f'request timed out for pass {p}')
  return found_passwds

## Generate passwords with top 10 chars and check haveibeenpwned

In [58]:
model_preds = {}

total_num_pass = 100
pass_length = 5

for i in top_10_passwds:
    gen_pass = gen_passwords(one_step_model, total_num_pass, pass_len=pass_length, start_char=i)
    model_preds[i] = len(check_pwned(gen_pass))/total_num_pass
    print(f"char '{i}' - model password prob: {model_preds[i]}")

char 'k' - model password prob: 0.76
char 't' - model password prob: 0.63
char 'l' - model password prob: 0.61
char 'd' - model password prob: 0.64
char 'c' - model password prob: 0.56
char 'b' - model password prob: 0.69
char 'm' - model password prob: 0.71
char 'a' - model password prob: 0.63
char '1' - model password prob: 0.67
char 's' - model password prob: 0.66


## Generate passwords with top 10 chars NOT in training set and check haveibeenpwned

In [57]:
model_preds = {}

total_num_pass = 100
pass_length = 5

for i in top_10_passwds:
    gen_pass = gen_passwords_not_in_training(one_step_model, total_num_pass, pass_len=pass_length, start_char=i)
    model_preds[i] = len(check_pwned(gen_pass))/total_num_pass
    print(f"char '{i}' - model password prob: {model_preds[i]}")

char 'k' - model password prob: 0.12
char 't' - model password prob: 0.11
char 'l' - model password prob: 0.09
char 'd' - model password prob: 0.04
char 'c' - model password prob: 0.1
char 'b' - model password prob: 0.04
char 'm' - model password prob: 0.09
char 'a' - model password prob: 0.09
char '1' - model password prob: 0.12
char 's' - model password prob: 0.11


## Generate random passwords with top 10 chars without using model

In [65]:
rand_preds = {}

total_num_pass = 100
pass_length = 5

for i in top_10_passwds:
    gen_pass = gen_passwords_random(total_num_pass, pass_len=pass_length, start_char=i)
    rand_preds[i] = len(check_pwned(gen_pass))/total_num_pass
    print(f"char '{i}' - model password prob: {rand_preds[i]}")

char 'k' - model password prob: 0.0
char 't' - model password prob: 0.0
char 'l' - model password prob: 0.0
char 'd' - model password prob: 0.0
char 'c' - model password prob: 0.0
char 'b' - model password prob: 0.0
char 'm' - model password prob: 0.01
char 'a' - model password prob: 0.0
char '1' - model password prob: 0.0
char 's' - model password prob: 0.01
