### Get all imports

In [17]:
import math
import os
import string
from collections import Counter, defaultdict
from math import ceil
from typing import List, Set, Tuple, Type, Dict
from abc import abstractmethod

import torch
from torch import nn
from torch.nn import CrossEntropyLoss
import gdown
import unidecode

### Init the class which will give the password datasets. Currently supports Mate1, 000webhost and ClixSense.

In [10]:
class PasswordDataset:
    @abstractmethod
    def get_download_url(self) -> str:
        pass

    @abstractmethod
    def get_dataset_local_path(self) -> str:
        pass


class ClixSense(PasswordDataset):
    def get_download_url(self) -> str:
        return 'https://drive.google.com/uc?id=1S0-1gdzoP-HecS3L5_zStZhvTt9A4q97'

    def get_dataset_local_path(self) -> str:
        return 'pwd_dataset_manager/datasets/ClixSense.txt'


class WebHost(PasswordDataset):
    def get_download_url(self) -> str:
        return 'https://drive.google.com/uc?id=11tsLveuHo3xaVL2DRh3FfuUPG8LEtzYd'

    def get_dataset_local_path(self) -> str:
        return 'pwd_dataset_manager/datasets/000webhost.txt'


class Mate1(PasswordDataset):
    def get_download_url(self) -> str:
        return 'https://drive.google.com/uc?id=10LtJiV9J-Vuy1I8iSxacH4fsPqZamIeB'

    def get_dataset_local_path(self) -> str:
        return 'pwd_dataset_manager/datasets/Mate1.txt'


class DatasetFactory:
    def get(self, dataset_name: str) -> Type[PasswordDataset]:
        if dataset_name == "ClixSense":
            return ClixSense
        elif dataset_name == "000webhost":
            return WebHost
        elif dataset_name == "Mate1":
            return Mate1
        else:
            raise NotImplementedError(f"Dataset: {dataset_name} not supported")


def get_dataset(dataset_klass: Type[PasswordDataset]) -> List[str]:
    local_dataset = dataset_klass().get_dataset_local_path()
    if not os.path.exists(local_dataset):
        gdown.download(dataset_klass().get_download_url(), quiet=True, output=local_dataset)

    with open(local_dataset, "r") as dataset:
        contents = unidecode.unidecode(dataset.read())
        contents = contents.split('\n')
        contents = [content[:-1].strip() for content in contents]
    return contents

### Declare all utilities
This includes method to convert string to tensor + method for training on ClixSense dataset

In [12]:
# min size of ClixSense is 6
# max size of ClixSense is 25
# most common with their no_of_occurrences
# [('123456', 17871), ('123456789', 3294), ('12345678', 2091), ('password', 1967), ('111111', 1892),
# ('1234567', 1299), ('iloveyou', 1266), ('qwerty', 1187), ('clixsense', 1172), ('000000', 977)]

def get_input_expected_clixsense(dataset: List[str]) -> Tuple[List[Tuple[str, int]], Dict[str, Set[Tuple[str, str]]]]:    
    password_slices_dict = defaultdict(set)
    # this was made to be in this way because I wanted to support more slices within a password.
    # However, this is not being done. As a result this looks really stupid.
    [password_slices_dict[pwd].add((pwd[0: -1], pwd[1:])) for pwd in dataset[:]]

    n_most_common = 100000
    all_passwords = [pwd for pwd in dataset]
    counter = Counter(all_passwords)
    most_common = counter.most_common(n_most_common)

    return most_common, password_slices_dict


def convert_str_to_tensor(string_to_convert: str):
    size = len(string_to_convert)
    converted_tensor = torch.zeros(size, 1).long()
    for index, char in enumerate(string_to_convert):
        converted_tensor[index][0] = string.printable.index(char)
    return converted_tensor.to(device)

### The GRU RNN network
This is trained on 100,000 most common passwords in ClixSense for a set number of epochs
The training itself was done locally due to compute constraints on Colab.

In [13]:
class GRU_RNN(nn.Module):
    def __init__(self, embedding_dim: int, hidden_size: int, no_of_hidden_layers: int, output_size: int):
        super().__init__()
        self.embedding_dim = embedding_dim
        # (L, N, H_in)
        self.gru = nn.GRU(self.embedding_dim, hidden_size, num_layers=no_of_hidden_layers)
        self.embedding = nn.Embedding(len(string.printable), embedding_dim=self.embedding_dim)
        self.linear = nn.Linear(hidden_size, output_size)

    def forward(self, input: torch.Tensor, hidden_state: torch.Tensor):
        input = self.embedding(input)
        reshaped_input = input.view(1, 1, self.embedding_dim)
        input, hidden_state = self.gru(reshaped_input, hidden_state)
        output = self.linear(input)
        return output, hidden_state

### Init the class which will train the RNN on ClixSense

In [14]:
class PasswordGuesserUsingRNN:
    def __init__(self):
        self.hidden_size = 100
        self.no_of_hidden_layers = 20
        self.output_size = len(string.printable)
        self.embedding_dim = 5
        self.epochs = 10
        self.eta = 1e-4

    def train_and_evaluate(self):
        dataset_klass = DatasetFactory().get("ClixSense")
        dataset = get_dataset(dataset_klass)

        gru_model = GRU_RNN(self.embedding_dim, self.hidden_size, self.no_of_hidden_layers, self.output_size).to(device)
        optimizer = torch.optim.Adam(gru_model.parameters(), lr=self.eta)
        loss_fn = CrossEntropyLoss()

        n_most_common, pwd_inp_exp = get_input_expected_clixsense(dataset)
        # total_len of dict = 1338980
        # total length of passwords = 2221027
        for epoch in range(self.epochs):
            for pwd_index, (most_common_pwd, num_occ) in enumerate(n_most_common[:]):
                inp_target_set = pwd_inp_exp[most_common_pwd]
                for _ in range(ceil((num_occ / 100000) * 100)):
                    for input_pwd, target_pwd in inp_target_set:
                        loss = 0
                        optimizer.zero_grad()

                        hidden_state = self._init_hidden()
                        input_tensor = convert_str_to_tensor(input_pwd)
                        target_tensor = convert_str_to_tensor(target_pwd)

                        for input, expected in zip(input_tensor, target_tensor):
                            output, hidden_state = gru_model(input, hidden_state)
                            loss += loss_fn(output[-1], expected)

                        loss.backward()
                        optimizer.step()

                        loss = loss.item() / len(input_pwd)

                if pwd_index % 1000 == 0:
                    print(f"At pwd_index: {pwd_index} of {len(n_most_common)}")
                    print(f"training password: {most_common_pwd}")

                    print(f'At epoch: {epoch} with loss: {loss}')
                    start = "123"
                    prediction = self.evaluate_password(gru_model, start, 15)
                    print(f"Prediction is {prediction} for start with '{start}'")

        return gru_model

    def evaluate_password(self, gru_model: nn.Module, password_start: str, max_length: int):
        prediction = password_start
        start_tensor = convert_str_to_tensor(password_start)
        with torch.no_grad():
            hidden_state = self._init_hidden()
            for char in start_tensor:
                _, hidden_state = gru_model(char, hidden_state)

            input = start_tensor[-1]
            for char_gen in range(max_length - len(password_start)):
                output, hidden_state = gru_model(input, hidden_state)

                # understand below; taken from the ref colab
                output_dist = output.data.view(-1).exp()
                top_i = torch.multinomial(output_dist, 1)[0]

                char_predicted = string.printable[top_i]
                prediction += char_predicted
                input = convert_str_to_tensor(char_predicted)

        return prediction

    def _init_hidden(self):
        # (D∗num_layers, N, Hout)
        return torch.zeros(self.no_of_hidden_layers, 1, self.hidden_size).to(device)

### Train the GRU model on ClixSense

In [18]:
device = "cuda"
gru_model = PasswordGuesserUsingRNN().train_and_evaluate()
torch.save(gru_model, 'saved_gru_pwd.model')

At pwd_index: 0 of 100000
training password: 12345
At epoch: 0 with loss: 4.540670871734619
Prediction is 123l-T
M.0#L)=0 for start with '123'
At pwd_index: 1000 of 100000
training password: coffe
At epoch: 0 with loss: 3.8765103816986084
Prediction is 123nu2lh1ealamc for start with '123'


KeyboardInterrupt: 

The actual training was done locally and the logs for which can be found here: https://gist.github.com/venomouscyanide/8c4e18d042f4db891a614f838fa1b03a

The training took approximately 11 hours to complete.
After training, I save the model for easily running experiments on the other 2 datasets.

The saved model is hosted here: https://drive.google.com/file/d/1P5_RetiDuEh-dLMpPt_sdjrSc9fX9Pej/view?usp=sharing

## Init the class that will help make password guesses

In [19]:
class MakePasswordGuesses:
    MIN_LENGTH: int = 5
    MAX_LENGTH: int = 15
    MAX_TRIES_PER_CONFIG: int = 5
    MAX_GUESSES: int = int(math.pow(10, 6))

    def __init__(self, model: nn.Module, verbose: bool = True):
        self.model = model
        self.verbose = verbose

    def evaluate_dataset(self, dataset_name: str) -> Tuple[Set[str], Set[str]]:
        dataset_klass = DatasetFactory().get(dataset_name)
        dataset = get_dataset(dataset_klass)

        dataset_counter = Counter(dataset)

        print(f'Total passwords in {dataset_name} is {len(dataset)}')
        most_common = dataset_counter.most_common()

        total_correct_guesses, guessed_passwords, all_starters_used = self._make_guesses(most_common, dataset_counter)
        missed_passwords = set(dataset_counter.keys()).difference(guessed_passwords)

        print(
            f"Unique guesses correct: {len(guessed_passwords)} and Total guesses: {total_correct_guesses} and total misses: {len(missed_passwords)}"
        )
        print(f"Coverage on {dataset_name}: {round(total_correct_guesses) / len(dataset)}")

        self._write_debug(dataset_name, "all_starters.txt", all_starters_used)
        self._write_debug(dataset_name, "unique_correct_guesses.txt", guessed_passwords)
        self._write_debug(dataset_name, "missed_passwords.txt", missed_passwords)

        return guessed_passwords, missed_passwords

    def _form_candidates(self, common_pwd: str) -> List[str]:
        min_len = 3
        return [common_pwd[0:end] for end in range(min_len, len(common_pwd))]

    def _make_guesses(self, most_common: List[Tuple[str, int]], dataset_counter: Counter):
        total_correct_guesses = 0
        uniq_guessed_passwords = set()
        all_starters_used = set()
        total_guess_tracker = 0

        for common_pwd, _ in most_common:
            starter_candidates = self._form_candidates(common_pwd)
            all_starters_used |= set(starter_candidates)

            for candidate in starter_candidates:

                for max_len in range(self.MIN_LENGTH, self.MAX_LENGTH):

                    for _ in range(self.MAX_TRIES_PER_CONFIG):
                        if total_guess_tracker > self.MAX_GUESSES:
                            return total_correct_guesses, uniq_guessed_passwords, all_starters_used

                        total_guess_tracker += 1
                        if total_guess_tracker % 1000 == 0 and self.verbose:
                            print(f"At guess {total_guess_tracker} of {self.MAX_GUESSES}")

                        guess = PasswordGuesserUsingRNN().evaluate_password(self.model, candidate, max_length=max_len)

                        if guess not in uniq_guessed_passwords:
                            occurrences = dataset_counter.get(guess)
                            if occurrences:
                                if self.verbose:
                                    print(
                                        f"Correct guess: {guess}, for candidate: {candidate}, given max_len: {max_len}")
                                total_correct_guesses += occurrences
                                uniq_guessed_passwords.add(guess)

        return total_correct_guesses, uniq_guessed_passwords, all_starters_used

    def _write_debug(self, dataset_name: str, file_name: str, data_as_set: Set[str]):
        debug_folder = f'debug_{dataset_name}'
        if not os.path.exists(debug_folder):
            os.mkdir(debug_folder)

        data_as_str = '\n'.join(data_as_set)
        with open(os.path.join(debug_folder, file_name), "w") as debug_file:
            debug_file.write(data_as_str)

In [None]:
gru_model = torch.load('saved_gru_pwd.model')
MakePasswordGuesses(gru_model).evaluate_dataset('Mate1')

Total passwords in Mate1 is 27398563
Correct guess: 12359, for candidate: 123, given max_len: 5
Correct guess: 12329, for candidate: 123, given max_len: 5
Correct guess: 12374, for candidate: 123, given max_len: 5
Correct guess: 12340, for candidate: 123, given max_len: 5
Correct guess: 12322, for candidate: 123, given max_len: 5
Correct guess: 123939, for candidate: 123, given max_len: 6
Correct guess: 123129, for candidate: 123, given max_len: 6
Correct guess: 12347, for candidate: 1234, given max_len: 5
Correct guess: 12348, for candidate: 1234, given max_len: 5
Correct guess: 12346, for candidate: 1234, given max_len: 5
Correct guess: 12342, for candidate: 1234, given max_len: 5
Correct guess: 12344, for candidate: 1234, given max_len: 5
Correct guess: 123472, for candidate: 1234, given max_len: 6
Correct guess: 123401, for candidate: 1234, given max_len: 6
Correct guess: 123430, for candidate: 1234, given max_len: 6
Correct guess: 123453, for candidate: 1234, given max_len: 6
Corr

Correct guess: 123452, for candidate: 12345, given max_len: 6
Correct guess: 123450, for candidate: 12345, given max_len: 6
Correct guess: 1234532, for candidate: 12345, given max_len: 7
Correct guess: 1234553, for candidate: 12345, given max_len: 7
Correct guess: 12345136, for candidate: 12345, given max_len: 8
Correct guess: 1234560, for candidate: 123456, given max_len: 7
Correct guess: 12345661, for candidate: 123456, given max_len: 8
Correct guess: 12345632, for candidate: 123456, given max_len: 8
Correct guess: 12345690, for candidate: 123456, given max_len: 8
Correct guess: 12345676, for candidate: 1234567, given max_len: 8
Correct guess: 12345673, for candidate: 1234567, given max_len: 8
Correct guess: 12345677654, for candidate: 1234567, given max_len: 11
Correct guess: 123456780, for candidate: 12345678, given max_len: 9
Correct guess: 123456789, for candidate: 12345678, given max_len: 9
Correct guess: 123456782, for candidate: 12345678, given max_len: 9
Correct guess: 123456

Correct guess: comep, for candidate: com, given max_len: 5
Correct guess: comik, for candidate: com, given max_len: 5
Correct guess: comso, for candidate: com, given max_len: 5
At guess 3000 of 1000000
Correct guess: compl, for candidate: comp, given max_len: 5
Correct guess: compn, for candidate: comp, given max_len: 5
Correct guess: compe, for candidate: comp, given max_len: 5
Correct guess: comp1, for candidate: comp, given max_len: 5
Correct guess: compo, for candidate: comp, given max_len: 5
Correct guess: compu, for candidate: compu, given max_len: 5
Correct guess: comput, for candidate: comput, given max_len: 5
Correct guess: computn, for candidate: comput, given max_len: 7
Correct guess: lonil, for candidate: lon, given max_len: 5
Correct guess: lonle, for candidate: lon, given max_len: 5
Correct guess: londi, for candidate: lond, given max_len: 5
Correct guess: londa, for candidate: lond, given max_len: 5
Correct guess: londo, for candidate: lond, given max_len: 5
Correct gues

Correct guess: harls, for candidate: harl, given max_len: 5
Correct guess: harlt, for candidate: harl, given max_len: 5
Correct guess: harlea, for candidate: harl, given max_len: 6
Correct guess: harlegr, for candidate: harl, given max_len: 7
Correct guess: monab, for candidate: mon, given max_len: 5
Correct guess: monde, for candidate: mon, given max_len: 5
Correct guess: monono, for candidate: mon, given max_len: 6
Correct guess: monoso, for candidate: mon, given max_len: 6
Correct guess: monyk0, for candidate: mon, given max_len: 6
Correct guess: monkr, for candidate: monk, given max_len: 5
Correct guess: monkl, for candidate: monk, given max_len: 5
Correct guess: monko, for candidate: monk, given max_len: 5
Correct guess: monka, for candidate: monk, given max_len: 5
Correct guess: monk7, for candidate: monk, given max_len: 5
Correct guess: 98742, for candidate: 987, given max_len: 5
Correct guess: 98715, for candidate: 987, given max_len: 5
Correct guess: 98760, for candidate: 987,

Correct guess: shadi, for candidate: shad, given max_len: 5
Correct guess: shads, for candidate: shad, given max_len: 5
Correct guess: shady, for candidate: shad, given max_len: 5
Correct guess: shada, for candidate: shad, given max_len: 5
Correct guess: shado0, for candidate: shad, given max_len: 6
Correct guess: shadlan, for candidate: shad, given max_len: 7
At guess 7000 of 1000000
Correct guess: sunee, for candidate: sun, given max_len: 5
Correct guess: sunny, for candidate: sun, given max_len: 5
Correct guess: sunsee, for candidate: sun, given max_len: 6
Correct guess: sundae, for candidate: sun, given max_len: 6
Correct guess: sunsi, for candidate: suns, given max_len: 5
Correct guess: sunsh, for candidate: sunsh, given max_len: 5
Correct guess: sunshn, for candidate: sunsh, given max_len: 6
Correct guess: sunshi, for candidate: sunshi, given max_len: 5
Correct guess: sunshim, for candidate: sunshi, given max_len: 7
Correct guess: drage, for candidate: drag, given max_len: 5
Corr

Correct guess: daniy, for candidate: dani, given max_len: 5
Correct guess: danid, for candidate: dani, given max_len: 5
Correct guess: danii, for candidate: dani, given max_len: 5
Correct guess: danik, for candidate: dani, given max_len: 5
Correct guess: dania, for candidate: dani, given max_len: 5
Correct guess: dania2, for candidate: dani, given max_len: 6
Correct guess: samia, for candidate: sam, given max_len: 5
Correct guess: sam4t, for candidate: sam, given max_len: 5
Correct guess: sama2, for candidate: sam, given max_len: 5
Correct guess: samna, for candidate: sam, given max_len: 5
Correct guess: sama12, for candidate: sam, given max_len: 6
Correct guess: sammo1, for candidate: sam, given max_len: 6
Correct guess: samu2, for candidate: samu, given max_len: 5
Correct guess: samu1, for candidate: samu, given max_len: 5
Correct guess: samue, for candidate: samu, given max_len: 5
Correct guess: samula, for candidate: samu, given max_len: 6
Correct guess: jesne, for candidate: jes, 

Correct guess: ashlk, for candidate: ashl, given max_len: 5
Correct guess: ashly, for candidate: ashl, given max_len: 5
Correct guess: ashlil, for candidate: ashl, given max_len: 6
Correct guess: 11290, for candidate: 112, given max_len: 5
Correct guess: 11220, for candidate: 112, given max_len: 5
Correct guess: 11241, for candidate: 112, given max_len: 5
Correct guess: 11212, for candidate: 112, given max_len: 5
Correct guess: 112291, for candidate: 112, given max_len: 6
Correct guess: 112691, for candidate: 112, given max_len: 6
Correct guess: 112012, for candidate: 112, given max_len: 6
Correct guess: 11224, for candidate: 1122, given max_len: 5
Correct guess: 11228, for candidate: 1122, given max_len: 5
Correct guess: 11229, for candidate: 1122, given max_len: 5
Correct guess: 11226, for candidate: 1122, given max_len: 5
Correct guess: 112222, for candidate: 1122, given max_len: 6
Correct guess: 112248, for candidate: 1122, given max_len: 6
Correct guess: 1122515, for candidate: 11

Correct guess: masio, for candidate: mas, given max_len: 5
Correct guess: masho, for candidate: mas, given max_len: 5
Correct guess: masic, for candidate: mas, given max_len: 5
Correct guess: masen, for candidate: mas, given max_len: 5
Correct guess: masis, for candidate: mas, given max_len: 5
Correct guess: masina, for candidate: mas, given max_len: 6
Correct guess: mastm, for candidate: mast, given max_len: 5
Correct guess: masta, for candidate: mast, given max_len: 5
Correct guess: maste, for candidate: mast, given max_len: 5
Correct guess: josli, for candidate: jos, given max_len: 5
Correct guess: joshl, for candidate: jos, given max_len: 5
Correct guess: joshn, for candidate: josh, given max_len: 5
Correct guess: josh0, for candidate: josh, given max_len: 5
Correct guess: josha, for candidate: josh, given max_len: 5
Correct guess: joshs1, for candidate: josh, given max_len: 6
Correct guess: joshm7, for candidate: josh, given max_len: 6
Correct guess: joshav, for candidate: josh, g

Correct guess: bigbl, for candidate: bigb, given max_len: 5
Correct guess: bigbn, for candidate: bigb, given max_len: 5
Correct guess: bigby, for candidate: bigb, given max_len: 5
Correct guess: bigba, for candidate: bigb, given max_len: 5
Correct guess: bigb42, for candidate: bigb, given max_len: 6
Correct guess: wilea, for candidate: wil, given max_len: 5
Correct guess: willn, for candidate: will, given max_len: 5
Correct guess: wills, for candidate: will, given max_len: 5
Correct guess: willr, for candidate: will, given max_len: 5
Correct guess: will0, for candidate: will, given max_len: 5
Correct guess: will1, for candidate: will, given max_len: 5
Correct guess: willra, for candidate: will, given max_len: 6
Correct guess: willi, for candidate: willi, given max_len: 5
Correct guess: willin, for candidate: willi, given max_len: 6
Correct guess: willid, for candidate: willi, given max_len: 6
Correct guess: willia, for candidate: willi, given max_len: 6
Correct guess: williat, for cand

Correct guess: brand, for candidate: brand, given max_len: 5
Correct guess: brandh, for candidate: brand, given max_len: 6
Correct guess: brands, for candidate: brand, given max_len: 6
Correct guess: brand6, for candidate: brand, given max_len: 6
Correct guess: brandk, for candidate: brand, given max_len: 6
Correct guess: brando, for candidate: brand, given max_len: 6
Correct guess: brandt6, for candidate: brand, given max_len: 7
Correct guess: 55585, for candidate: 555, given max_len: 5
Correct guess: 55551, for candidate: 555, given max_len: 5
Correct guess: 55528, for candidate: 555, given max_len: 5
Correct guess: 55522, for candidate: 555, given max_len: 5
Correct guess: 555911, for candidate: 555, given max_len: 6
Correct guess: 555223, for candidate: 555, given max_len: 6
Correct guess: 555550, for candidate: 555, given max_len: 6
Correct guess: cowad, for candidate: cow, given max_len: 5
Correct guess: cowil, for candidate: cow, given max_len: 5
Correct guess: cowpi, for candid

In [None]:
gru_model = torch.load('saved_gru_pwd.model')
MakePasswordGuesses(gru_model).evaluate_dataset('000webhost')