In [None]:
# Switch the working directory to the project folder so the relative paths
# used below (e.g. data/test.csv) resolve from there.
%cd /content/drive/MyDrive/Colab/password_predict

/content/drive/MyDrive/Colab/password_predict


In [None]:
# List the contents of the current working directory.
%ls

 [0m[01;34m3_epochs[0m/                            pass_predict_100_sentence_mapping.pt
 [01;34m5_epochs[0m/                            pass_predict.pt
'Copy of train.ipynb'                 pass_predict_sampling.pt
 [01;34mdata[0m/                                test.ipynb
 data_prep_100.ipynb                  train_100_1m_direct_mapping.ipynb
 data_prep.ipynb                      train_100_1m_sentence_mapping.ipynb
 dataset_sampling.ipynb               train_100.ipynb
 hdf5_conv.ipynb                      train_char_sampling.ipynb
 [01;34mlambda_labs[0m/                         train.ipynb
 pass_predict_100_direct_mapping.pt   train_sampling.ipynb


In [None]:
# Spot check: first 10 rows of data/test.csv with "admin" (case-insensitive)
# somewhere before an "@".
!grep -iE '.*admin.*@' data/test.csv | head -10

admin@incontakte.ru,123vfif123
packadmin@web.de,brandeeanne
adminissante@virgilio.it,iwaki123
adminisstrator2008@mail.ru,2732865
admin@olli.la,leckmich
admin@saintpatricks.net,baltimore1
alf.admin.real4ever@rambler.ru,Dr011206
admin@medisana.com.ua,15091968
admin@lawpropertygroup.couk,freeland
sosclanadmin@lycos.de,1rnumb


In [None]:
# Spot check: first 10 rows of data/test.csv with "test" (case-insensitive)
# somewhere before an "@".
!grep -iE '.*test.*@' data/test.csv | head -10

testing.trinhh@yahoo.com,oink99
testi2112@ya.ru,djlbwf
przhottestchicka16@hotmail.com,luvu4
sweetest_dew@hotmail.com,1659232hana
alessandro_testa@tiscali.it,domenico
test171@yandex.ru,2921405
test0002@mail.ru,qweqwe
applegatestudios@yahoo.com,POOHLINKEDIN
steve+test@gmx.net,testtest
jackisthegreatest@hotmail.com,poohead12


In [None]:
import csv

# Character-level tokenizer
def char_tokenizer(text):
    """Split ``text`` into a list holding each of its characters in order."""
    return [ch for ch in text]

# Token stream consumed when building the vocabulary
def yield_tokens(data):
    """Yield one character list per field of every ``(src, trg)`` pair.

    For each pair the source's tokens are yielded first, then the target's.
    """
    for src, trg in data:
        for field in (src, trg):
            yield char_tokenizer(field)

# Load data from CSV
def load_data_from_csv(filename):
    """Read a UTF-8 CSV file and return its first two columns as tuples.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list of ``(row[0], row[1])`` tuples, one per usable row. Rows with
        fewer than two fields (for example blank lines) are skipped instead of
        raising ``IndexError``; any columns beyond the second are ignored.
    """
    # newline='' lets the csv module do its own line-ending handling, which it
    # needs in order to parse quoted fields containing newlines correctly.
    with open(filename, 'r', encoding='utf-8', newline='') as file:
        return [(row[0], row[1]) for row in csv.reader(file) if len(row) >= 2]

In [None]:
# Read the training rows; load_data_from_csv returns a list of
# (first column, second column) tuples.
train_data_raw = load_data_from_csv('data/train.csv')

In [None]:
from torchtext.vocab import build_vocab_from_iterator

# Build one vocabulary from the character lists of both fields of every
# training row; the three special tokens are supplied through `specials`.
vocab = build_vocab_from_iterator(yield_tokens(train_data_raw), specials=['<sos>', '<eos>', '<unk>'])
# Tokens missing from the vocabulary are looked up as <unk>.
vocab.set_default_index(vocab["<unk>"])

print(f"Vocab length: {len(vocab)}")

Vocab length: 4131


In [None]:
# Persist the vocabulary object with pickle so a later cell can reload the
# same token/index mapping from data/vocab_org.pkl.
import pickle

with open('data/vocab_org.pkl', 'wb') as f:
    pickle.dump(vocab, f)

In [None]:
import torch
import torch.nn as nn
import pickle

class Encoder(nn.Module):
    """Embedding + multi-layer LSTM that summarises a source sequence.

    Only the LSTM's final hidden and cell states are returned; the per-step
    outputs are discarded.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        """Create the embedding table, the LSTM stack and the dropout layer.

        Args:
            input_dim: Number of rows in the embedding table.
            emb_dim: Embedding vector size (also the LSTM input size).
            hid_dim: LSTM hidden size.
            n_layers: Number of stacked LSTM layers.
            dropout: Probability used both between LSTM layers and on the
                embedded input.
        """
        super().__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(num_embeddings=input_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, src):
        """Return ``(hidden, cell)`` of the LSTM after consuming ``src``.

        ``src`` holds token indices; the LSTM is not created with
        ``batch_first``, so the first dimension is treated as time.
        """
        token_vectors = self.embedding(src)
        _, final_state = self.rnn(self.dropout(token_vectors))
        hidden, cell = final_state
        return hidden, cell

class Decoder(nn.Module):
    """Single-step LSTM decoder producing scores over the output vocabulary."""

    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        """Create the embedding table, LSTM stack, output projection and dropout.

        Args:
            output_dim: Number of output classes (also the embedding table size).
            emb_dim: Embedding vector size (also the LSTM input size).
            hid_dim: LSTM hidden size.
            n_layers: Number of stacked LSTM layers.
            dropout: Probability used both between LSTM layers and on the
                embedded input.
        """
        super().__init__()
        self.output_dim = output_dim
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(num_embeddings=output_dim, embedding_dim=emb_dim)
        self.rnn = nn.LSTM(
            input_size=emb_dim,
            hidden_size=hid_dim,
            num_layers=n_layers,
            dropout=dropout,
        )
        self.fc_out = nn.Linear(in_features=hid_dim, out_features=output_dim)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, input, hidden, cell):
        """Advance the decoder by one time step.

        Args:
            input: 1-D tensor of token indices, one per batch element.
            hidden: LSTM hidden state from the previous step (or the encoder).
            cell: LSTM cell state from the previous step (or the encoder).

        Returns:
            ``(prediction, hidden, cell)`` where ``prediction`` is the linear
            projection of the LSTM output for this step.
        """
        # Add a leading length-1 time dimension for the LSTM.
        step = input.unsqueeze(0)
        step_vectors = self.dropout(self.embedding(step))
        rnn_out, next_state = self.rnn(step_vectors, (hidden, cell))
        hidden, cell = next_state
        # Drop the time dimension again before projecting to vocabulary scores.
        prediction = self.fc_out(rnn_out.squeeze(0))
        return prediction, hidden, cell

class Seq2Seq(nn.Module):
    """Wires an encoder and a step-wise decoder into one sequence model."""

    def __init__(self, encoder, decoder, device):
        """Store the two sub-modules and the device used for the output buffer."""
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.device = device

    def forward(self, src, trg, teacher_forcing_ratio=0.5):
        """Decode ``trg.shape[0] - 1`` steps conditioned on ``src``.

        Args:
            src: Source tensor passed unchanged to the encoder.
            trg: 2-D target tensor unpacked as ``(trg_len, batch_size)``; row 0
                is the first decoder input.
            teacher_forcing_ratio: At each step a fresh ``torch.rand(1)`` draw
                below this value feeds ``trg[t]`` as the next input; otherwise
                the decoder's own argmax is fed back.

        Returns:
            Tensor of shape ``(trg_len, batch_size, decoder.output_dim)``.
            Position 0 is never written and stays all zeros.
        """
        trg_len, batch_size = trg.shape
        vocab_size = self.decoder.output_dim
        outputs = torch.zeros(trg_len, batch_size, vocab_size).to(self.device)

        hidden, cell = self.encoder(src)
        step_input = trg[0, :]

        for t in range(1, trg_len):
            step_scores, hidden, cell = self.decoder(step_input, hidden, cell)
            outputs[t] = step_scores
            # The random draw happens every step, regardless of train/eval mode.
            use_target = torch.rand(1) < teacher_forcing_ratio
            greedy = step_scores.argmax(1)
            if use_target:
                step_input = trg[t]
            else:
                step_input = greedy

        return outputs

# Absolute path of the pickled vocabulary ('vocab_org.pkl' — the same file
# name the vocabulary-saving cell writes under data/).
vocab_file_path = '/content/drive/MyDrive/Colab/password_predict/data/vocab_org.pkl'

# Load the vocabulary object from the file.
# NOTE(review): pickle.load can execute arbitrary code; only load files you
# created yourself.
with open(vocab_file_path, 'rb') as f:
    vocab = pickle.load(f)

# Encoder input size and decoder output size both come from the one shared
# vocabulary.
INPUT_DIM = len(vocab)
OUTPUT_DIM = len(vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

# NOTE(review): these hyperparameters presumably have to match the ones used
# when the checkpoint loaded later was trained — confirm against the training
# notebook.
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Seq2Seq(enc, dec, device).to(device)

In [None]:
# Sanity check: report the size of the vocabulary currently bound to `vocab`.
print(f"Vocab length: {len(vocab)}")

Vocab length: 4131


In [None]:
# Confirm the checkpoint file exists and show its size and modification time.
%ls -ltr /content/drive/MyDrive/Colab/password_predict/5_epochs/pass_predict.pt

-rw------- 1 root root 46367098 Jan 10 20:48 /content/drive/MyDrive/Colab/password_predict/5_epochs/pass_predict.pt


In [None]:
# Restore the weights stored in the 5_epochs checkpoint; map_location makes
# torch.load place the tensors on the CPU before load_state_dict copies them
# into `model`.
# NOTE(review): torch.load unpickles the file — only load checkpoints you trust.
model.load_state_dict(torch.load('/content/drive/MyDrive/Colab/password_predict/5_epochs/pass_predict.pt',map_location=torch.device('cpu')))

<All keys matched successfully>

In [None]:
# Index -> token lookup list taken from the vocabulary.
itos = vocab.get_itos()

In [None]:
def predict(model, vocab, device, src_sentence, n=1, max_len=100):
    """Generate up to ``n`` distinct candidate strings for ``src_sentence``.

    Each attempt starts from ``[<sos>]`` and repeatedly calls
    ``model(src, trg)`` on the sequence produced so far, taking the argmax at
    the last position as the next token, until ``<eos>`` is produced or
    ``max_len`` tokens have been generated.

    Args:
        model: Callable as ``model(src_tensor, trg_tensor)`` returning a
            ``(trg_len, 1, vocab_size)`` tensor; must provide ``eval()``.
        vocab: Token -> index mapping supporting ``vocab[token]`` and
            ``get_itos()``.
        device: Device the input tensors are moved to.
        src_sentence: Source string; it is tokenised character by character.
        n: Number of decoding attempts. Duplicate outputs are dropped, so
            fewer than ``n`` strings may be returned.
        max_len: Upper bound on generated tokens per attempt. This guarantees
            termination when the model never emits ``<eos>``.

    Returns:
        List of unique decoded strings in the order they were first produced.
    """
    model.eval()

    # Look the special-token indices up once instead of on every step.
    sos_idx = vocab['<sos>']
    eos_idx = vocab['<eos>']
    # Take the index -> token table from the vocab that was passed in rather
    # than from a notebook-level global.
    itos = vocab.get_itos()

    # Source: one index per character, terminated by <eos>, shaped (src_len, 1).
    src_indexes = [vocab[tok] for tok in src_sentence] + [eos_idx]
    src_tensor = torch.LongTensor(src_indexes).unsqueeze(1).to(device)

    result = []
    seen = set()  # O(1) duplicate detection
    for _ in range(n):
        trg_indexes = [sos_idx]
        predicted_sentence = []
        with torch.no_grad():
            for _step in range(max_len):
                trg_tensor = torch.LongTensor(trg_indexes).unsqueeze(1).to(device)
                # NOTE(review): the model is called with its default
                # arguments. With the Seq2Seq class in this notebook that
                # means teacher_forcing_ratio=0.5, so its random draws are the
                # only source of variation between attempts — confirm this is
                # the intended sampling scheme.
                output = model(src_tensor, trg_tensor)
                predicted_token = output.argmax(2)[-1, :].item()
                # Check for end of sentence
                if predicted_token == eos_idx:
                    break
                trg_indexes.append(predicted_token)
                predicted_sentence.append(itos[predicted_token])

        # The first generated token is discarded, as in the original code,
        # whose comment described it as the <sos> token. With a length-1
        # target Seq2Seq.forward runs no decoder step and returns all zeros,
        # so that first argmax is index 0 — presumably '<sos>'.
        candidate = ''.join(predicted_sentence[1:])
        if candidate not in seen:
            seen.add(candidate)
            result.append(candidate)

    return result

In [None]:
# Try an address that appears in the data/test.csv grep output, asking for
# up to 100 candidates (duplicates are dropped, so fewer may come back).
predict(model, vocab, device, "steve+test@gmx.net", 100)

['maneee',
 'mannee',
 'moarnen',
 'monres',
 'moannee',
 'manneet',
 'maanee',
 'moannet',
 'maannee',
 'moaeee',
 'maannae',
 'maaeeet',
 'monreet',
 'maneer',
 'maaneete',
 'monneet',
 'moanee',
 'monney',
 'monnee',
 'maaneet',
 'maannet',
 'moaenet',
 'manner',
 'moaree',
 'moaneet',
 'maanneee',
 'maaenlt',
 'monees',
 'maaens',
 'maanne',
 'maaeee',
 'moaredee',
 'maaenet',
 'moarnt',
 'maanae',
 'moarnr',
 'moarne',
 'moaeeet',
 'maaees',
 'manees',
 'monreye',
 'moanet',
 'monneey',
 'moares',
 'moareen',
 'maanes',
 'moanne',
 'moaene',
 'monree']

In [None]:
# Same experiment for an "admin" address that appears in the data/test.csv
# grep output.
predict(model, vocab, device, "admin@saintpatricks.net", 100)

['maarn3a',
 'mariian',
 'marria',
 'mar1in3',
 'marii3e',
 'marn2d',
 'marnia',
 'maarnaen',
 'maanmaa',
 'maanna',
 'mar1e2',
 'maain3',
 'maa1n2',
 'mar1ea',
 'marr2e',
 'maanid',
 'mar1n2',
 'mariinn',
 'marni3c',
 'maaima',
 'marii3',
 'maarniac',
 'maarin',
 'marrna',
 'marien',
 'marr2a',
 'marnin',
 'maarim',
 'mar1ian',
 'marnidc',
 'marrti',
 'mariin',
 'maanne',
 'maa1i2',
 'maaina',
 'marite',
 'marnis',
 'maa1nm',
 'maarn3',
 'maainn',
 'mar1ee',
 'marriinn',
 'maarni',
 'mariie',
 'maa1na',
 'maarnaa',
 'mar1233',
 'marrii',
 'maane3',
 'maa1im',
 'mar1i23',
 'maannma',
 'maai2a',
 'maar2i',
 'marr2i',
 'marrisn',
 'mar12a',
 'mariia',
 'maannme',
 'marnei',
 'maainl',
 'maania',
 'marnee',
 'maar2d',
 'mar1te3r',
 'marnii',
 'maanem',
 'maar2n0',
 'marnie',
 'marrie',
 'maarnian',
 'mariis',
 'mar1ia',
 'maaime',
 'maar23',
 'maa1e13',
 'maarma',
 'maanmi']

Below are the counts of the most common passwords in the dataset used:

```
2034268 "123456"
1214120 "123456789"
 988396 "qwerty"
 738200 "password"
 687856 "12345"
 537369 "qwerty123"
 511861 "1q2w3e"
 438009 "DEFAULT"
 349405 "12345678"
 281911 "111111"
 240898 "1234567890"
 237525 "1234567"
 228249 "123123"
 185868 "000000"
 166589 "30media"
 163341 "qwertyuiop"
 158324 "10pace"
 149391 "abc123"
 148217 "59mile"
 148056 "59trick"
 147743 "24crow"
 147397 "19weed"
 146654 "66bob"
 142180 "123321"
 130429 "1q2w3e4r5t"
 127332 ")ryan"
 124314 "654321"
 122437 "password1"
 121947 "1234"
 121186 "666666"
 104129 "qwe123"
 104082 "7777777"
  95942 "asdasd"
  95769 "iloveyou"
  94372 "123"
  92904 "1q2w3e4r"
  90866 "555555"
  83472 "123qwe"
  82834 "987654321"
  81854 "1qaz2wsx"
  81290 "zxcvbnm"
  79111 "123456a"
  75744 "121212"
  69730 "qazwsx"
  67380 "112233"
  65360 "dragon"
  59351 "monkey"
  58386 "159753"
  57884 "777777"
  54620 "1234qwer"
```

In [None]:
# First 10 test rows containing "123456" as a whole word.
# NOTE(review): the pattern is not anchored to the password column, so it
# would also match a row whose e-mail part contains 123456 as a separate word.
!grep -iE '\b123456\b' data/test.csv | head -10

seni_seviyorum43@hotmail.de,123456
handeus@hotmail.com,123456
kapne2009@rambler.ru,123456
terue@bk.ru,123456
nfokuiodl@yandex.ru,123456
analassassin@rocketmail.com,123456
tabibi_qi@yahoo.com,123456
amangeldi_85.86@inbox.ru,123456
kacherra96@yandex.ru,123456
wangjinwei@sina.com,123456


In [None]:
# Same whole-word "123456" check on the training file (first 10 matches).
!grep -iE '\b123456\b' data/train.csv | head -10

missis.korshunova@bk.ru,123456
srinachaus@mail.ru,123456
katynaymkina15@mail.ru,123456
green_day98@hotmail.com,123456
rfb_auto@yahoo.com,123456
puftachok@rambler.ru,123456
sasha.dochkisinochki@mail.ru,123456
kaban_ne@bk.ru,123456
hylmi_25@hotmail.com,123456
thao.nguyenthanh@ya.ru,123456


In [None]:
# Candidates for a test-set address whose row in the grep output above has
# the password 123456.
predict(model, vocab, device, "handeus@hotmail.com", 100)

['hannee',
 'handeu',
 'haanlae',
 'haadne',
 'haanne',
 'haadlien',
 'haarlian',
 'hanrei',
 'hannlee',
 'haannes',
 'haarlion',
 'haaraa',
 'haanea',
 'haannaa',
 'hannei',
 'haadlee',
 'haadla',
 'handea',
 'hannliln',
 'haanee',
 'haanlua',
 'haanae',
 'haadee',
 'handee',
 'haarni',
 'haannea',
 'haaree',
 'haanle',
 'haanna',
 'hannlio',
 'haanlee',
 'handeee',
 'handlel',
 'handeer',
 'hanree',
 'hannees',
 'haanees',
 'haadlae',
 'haarea',
 'haanni',
 'haannee',
 'haarei',
 'haadnuas',
 'haadnea',
 'hanreus',
 'hannlei',
 'haanlel',
 'haarao',
 'haadnee',
 'haanlai',
 'hannlue',
 'hannlul']

In [None]:
import pandas as pd

# Read every field as text. Without dtype=str pandas may parse all-digit
# passwords as numbers (losing leading zeros, e.g. "000000" -> 0), and without
# keep_default_na=False values such as "null" or "NA" would become NaN.
df = pd.read_csv(
    "data/test.csv",
    names=['name', 'password'],
    dtype=str,
    keep_default_na=False,
)
df.head()

Unnamed: 0,name,password
0,arturo100@rubi.net,tequiero1
1,mahei001@mail.ru,йцувыфячс
2,machomiscellanykml4@yandex.ru,iEW843lGo
3,angieg@tm.net.my,c_sunstrom
4,js152008@yahoo.com,boots5


In [None]:
!pip install python-Levenshtein

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.0-py3-none-any.whl (9.4 kB)
Collecting Levenshtein==0.25.0 (from python-Levenshtein)
  Downloading Levenshtein-0.25.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting rapidfuzz<4.0.0,>=3.1.0 (from Levenshtein==0.25.0->python-Levenshtein)
  Downloading rapidfuzz-3.6.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m83.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.25.0 python-Levenshtein-0.25.0 rapidfuzz-3.6.1


In [None]:
# `lev` is the Levenshtein edit distance; identical strings give 0.
from Levenshtein import distance as lev
lev("123456", "123456")

0

In [None]:
# Second sanity check with two different strings.
lev("51234", "123456")

3

In [None]:
# Show which device was selected when the model was built.
device

device(type='cuda')

In [None]:
from tqdm.notebook import tqdm
import pandas as pd
from joblib import Parallel, delayed

# Relies on `predict`, `lev`, `model`, `vocab`, `device` and `df` having been
# defined by earlier cells.

# Registers the tqdm integration with pandas (adds progress_apply).
tqdm.pandas()

def avg_and_min_edit_dist(username, actual_pass):
    """Score the model's candidates for one user against the real password.

    Args:
        username: Source string passed to ``predict`` (up to 100 candidates).
        actual_pass: The true password; converted with ``str()`` before
            comparison so non-string values are tolerated.

    Returns:
        ``(best_pred_pass, avg_edit_dist, min_distance)``: the first candidate
        with the smallest edit distance, the mean distance over all
        candidates, and that smallest distance. When no candidates are
        produced the result is ``(None, None, None)``.
    """
    pred_passes = predict(model, vocab, device, username, 100)
    if not pred_passes:
        # Keep the same 3-field shape as the normal return so the values line
        # up with the three result columns they are assigned to.
        return (None, None, None)

    # Edit distance between the real password and every candidate.
    target = str(actual_pass)
    edit_distances = [lev(target, str(pred_pass)) for pred_pass in pred_passes]
    avg_edit_dist = sum(edit_distances) / len(edit_distances)
    min_distance = min(edit_distances)

    # list.index returns the first match, so ties go to the earliest candidate.
    best_pred_pass = pred_passes[edit_distances.index(min_distance)]
    return (best_pred_pass, avg_edit_dist, min_distance)

# Evaluate on a random subset of the test set; the fixed seed makes the
# sample, and therefore the reported numbers, reproducible across reruns.
temp_df = df.sample(n=10000, random_state=42)

# Score every sampled row in parallel. Each result is a
# (best_pred_pass, avg_edit_dist, min_distance) tuple, in row order.
results = Parallel(n_jobs=-1, verbose=10)(
    delayed(avg_and_min_edit_dist)(name, password)
    for name, password in zip(temp_df['name'], temp_df['password'])
)
# Re-use the sample's index so the three new columns align with their rows.
temp_df[['best_pred_pass', 'avg_edit_dist', 'min_distance']] = pd.DataFrame(results, index=temp_df.index)

temp_df.sort_values('avg_edit_dist').head(10)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 tasks      | elapsed:   12.5s
[Parallel(n_jobs=-1)]: Done   8 tasks      | elapsed:   24.2s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   35.6s
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:   47.9s
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  48 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  61 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done  74 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done  89 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 104 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 121 tasks      | elapsed:  3.2min
[Parallel(n_jobs=-1)]: Done 138 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 157 tasks      | elapsed:  4.0min
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:  4.5min
[Parallel(n_jobs=-1)]: Done 197 tasks      | elapsed:  

Unnamed: 0,name,password,best_pred_pass,avg_edit_dist,min_distance
3104677,cheyen1@hotmail.fr,cheyen,cheyen,2.333333,0
25405791,111erik@yahoo.de,111erik,1111rik,2.5,1
22627039,awutaba@yahoo.com,awutaba,aautaa,2.692308,2
23293912,niknak_niknak@aol.com,niknak,niknak,2.8,0
8455995,mariya27092000@mail.ru,mariya,marnya,2.8,1
19209736,corradoant@yahoo.ie,corrado,corredo,2.888889,1
2485136,ipas@icanet.net.mx,ipas,ipasa,2.9,1
27504328,nan632@rambler.ru,nan632,nan636,2.906977,1
7948035,dssdsdds@inbox.ru,dssdsdds,dssssdds,2.916667,1
8202643,ali-sel@mail.ru,alisel,alisel,2.929825,0


In [None]:
# Save the scored sample to CSV. index=False means the original row index is
# not written, so it cannot be recovered when the file is read back.
temp_df.to_csv("data/org_test_results.csv", index=False)

In [None]:
import pandas as pd

# Reload the saved results. The three text columns are forced to str so that
# numeric-looking passwords and predictions (e.g. "123456", "000000") are not
# parsed as numbers; the two distance columns keep their inferred dtypes.
temp_df = pd.read_csv(
    "data/org_test_results.csv",
    dtype={'name': str, 'password': str, 'best_pred_pass': str},
)

In [None]:
# Scratch check: edit distance between two floats after str() conversion
# ("123.0" vs "23.2").
lev(str(123.0),str(23.2))

2

In [None]:
# Rows whose closest candidate is nearest to the real password come first.
temp_df.sort_values('min_distance').head(10)

Unnamed: 0,name,password,best_pred_pass,avg_edit_dist,min_distance
22192965,jinkies_0@hotmail.com,jinkies,jinkies,3.246154,0
2782697,leonid-lebedev00@rambler.ru,leonid,leonid,3.622222,0
3104677,cheyen1@hotmail.fr,cheyen,cheyen,2.333333,0
23293912,niknak_niknak@aol.com,niknak,niknak,2.8,0
8202643,ali-sel@mail.ru,alisel,alisel,2.929825,0
27738652,nataliewest21@gmail.com,natalie,natalie,3.036585,0
3080859,holopp190@pisem.net,holopp,holope,4.484536,1
5738463,woppahead@mail.ru,123456,1234556,4.132653,1
30651967,arielbeasley1@yahoo.com,ariel1,aaiel1,3.962963,1
19209736,corradoant@yahoo.ie,corrado,corredo,2.888889,1


In [None]:
# Write the results ordered by min_distance (ascending) to a second CSV,
# again without the row index.
temp_df.sort_values('min_distance').to_csv("data/org_test_results_sorted.csv", index=False)