# Password Descriminator

Use data from password breaches and random strings to train a Neural Network to classify strings as likely or unlikely to be a human generated password.

In [202]:
import numpy as np
import pandas as pd
import string
import random

In [217]:
# set some display options
pd.set_option('display.max_colwidth', 170)

In [218]:
## initial version uses the adobe password breach - a relative small data breach
df = pd.read_csv('passwordlist\\adobe100.txt', encoding='iso-8859-1', header=None, names=['password'], sep='\t')
df['real'] = True

In [204]:
# from: https://stackoverflow.com/questions/17315737/split-a-large-pandas-dataframe
# input - df: a Dataframe, chunkSize: the chunk size
# output - a list of DataFrame
# purpose - splits the DataFrame into smaller of max size chunkSize (last is smaller)
def split_dataframe(df, chunkSize = 10000): 
    listOfDf = list()
    numberChunks = len(df) // chunkSize + 1
    for i in range(numberChunks):
        listOfDf.append(df[i*chunkSize:(i+1)*chunkSize])
    return listOfDf

In [205]:
# method to create a random password
# this may create passwords that appear on the real password list, that's okay - a few isn't going to break it
def password_generator(size=12, chars=string.punctuation + string.digits + string.ascii_letters):
    return ''.join(random.choice(chars) for _ in range(size))

In [None]:
# I've not found a better way to convert strings to arrays of ints
# 32 is magic, it's the size of input array the NN expects
def string_to_intarray(string):
    byte_array = string.encode('UTF-8')
    result = [0] * 32
    for i in range(len(byte_array)):
        result[i] = byte_array[i]
    return result

In [206]:
# note this cell IS NOT idempotent, rerunning this cell will result in uneven split of fake:real passwords

# create a set of fake passwords
# join with the real passwords
# shuffle the dataframe

import array

num_passwords = df['password'].count()
rows_list = []

for x in range(0, num_passwords):
    new_row = {'real': False, 'password': password_generator(size=random.randint(1,32))}
    rows_list.append(new_row)

fake_passwords = pd.DataFrame(rows_list)
df = pd.concat([df, fake_passwords])
df = df.sample(frac=1).reset_index(drop=True)

# convert the password strings to int arrays
df['ints'] = df['password'].apply(string_to_intarray)

In [208]:
# split the password list into the training and testing sets (80:20)
sets = split_dataframe(df, int(df['password'].count() * 0.8))
training_set = sets[0]
testing_set = sets[1]

In [220]:
training_set.sample(5)

Unnamed: 0,password,real,ints
154,asdfgh,True,"[97, 115, 100, 102, 103, 104, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
55,andrea,True,"[97, 110, 100, 114, 101, 97, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
157,"]0.kbm(]07R(&jnP?g,_)z|/bE,eAO|",False,"[93, 48, 46, 107, 98, 109, 40, 93, 48, 55, 82, 40, 38, 106, 110, 80, 63, 103, 44, 95, 41, 122, 124, 47, 98, 69, 44, 101, 65, 79, 124, 0]"
62,Kp~oZd,False,"[75, 112, 126, 111, 90, 100, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
124,",+<{*>V)*W12i$z@kQjxA(*!<+&8i",False,"[44, 43, 60, 123, 42, 62, 86, 41, 42, 87, 49, 50, 105, 36, 122, 64, 107, 81, 106, 120, 65, 40, 42, 33, 60, 43, 38, 56, 105, 0, 0, 0]"


In [221]:
# the pandas column wasn't playing nicely with the numpy dot product functions, transform this data for use
input_values = df['ints'].values
res = []
for i in range(len(input_values)):
    res.append(input_values[i])
input_values = res
expected_output = np.array(df['real'].values)

Neural Network Class and supporting functions

In [211]:
# https://github.com/PacktPublishing/Neural-Network-Projects-with-Python/blob/master/Chapter01/train_neural_network_from_scratch.py

def sigmoid(x):
    return 1.0/(1+ np.exp(-x))

def sigmoid_derivative(x):
    return x * (1.0 - x)

def transpose(x):
    return [[x[j][i] for j in range(len(x))] for i in range(len(x[0]))]

class NeuralNetwork:
    def __init__(self, x, y):
        self.input      = x
        self.weights1   = np.random.rand(len(x[1]),16)
        self.weights2   = np.random.rand(16,len(x))                 
        self.y          = y
        self.output     = np.zeros(self.y.shape)

    def feedforward(self):
        self.layer1 = sigmoid(np.dot(self.input, self.weights1))
        self.output = sigmoid(np.dot(self.layer1, self.weights2))
        
    def execute(self, data):
        exec_data = string_to_intarray(data)
        print(exec_data)
        exec_layer1 = sigmoid(np.dot(exec_data, self.weights1))
        print(exec_layer1)
        exec_output = sigmoid(np.dot(exec_layer1, self.weights2))
        print(exec_output)
        return exec_output

    def backprop(self):
        # application of the chain rule to find derivative of the loss function with respect to weights2 and weights1
        d_weights2 = np.dot(transpose(self.layer1), (2*(self.y - self.output) * sigmoid_derivative(self.output)))
        d_weights1 = np.dot(transpose(self.input),  (np.dot(2*(self.y - self.output) * sigmoid_derivative(self.output), (self.weights2.T)) * sigmoid_derivative(self.layer1)))

        # update the weights with the derivative (slope) of the loss function
        self.weights1 += d_weights1
        self.weights2 += d_weights2

In [212]:
nn = NeuralNetwork(input_values, expected_output)
for i in range(500):
    nn.feedforward()
    nn.backprop()

In [222]:
res = NeuralNetwork.execute(nn, '4mM]P~y\riX8;KY=SOW}XGr4-2C[1V')

[52, 109, 77, 93, 80, 126, 121, 13, 105, 88, 56, 59, 75, 89, 61, 83, 79, 87, 125, 88, 71, 114, 52, 45, 50, 67, 91, 49, 86, 0, 0, 0]
[1.00000000e+00 1.01371094e-88 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00
 1.00000000e+00 1.00000000e+00 1.00000000e+00 1.00000000e+00]
[9.99599369e-001 9.99619387e-001 1.63584637e-009 9.99666789e-001
 8.54411049e-080 9.99634418e-001 3.69833545e-007 9.99621491e-001
 9.99752883e-001 7.59932467e-028 9.99858163e-001 3.65786325e-162
 4.12694514e-099 3.25963597e-006 9.99612650e-001 0.00000000e+000
 0.00000000e+000 3.58623381e-176 9.99640934e-001 1.03961661e-191
 0.00000000e+000 3.48890978e-044 3.77339570e-205 9.99675562e-001
 9.99932265e-001 5.06614207e-030 1.18320281e-300 5.21941878e-008
 9.99595591e-001 9.99605227e-001 1.90664600e-016 9.99690720e-001
 9.99767990e-001 9.99634096e-001 0.00000000e+000 9.99743343e-001
 9.99623047e-001 0.00000000e+000 9.997

In [214]:
# example using XOR - use this to ensure the network still works

X = np.array([[0,0,1],
                  [0,1,1],
                  [1,0,1],
                  [1,1,1]])
y = np.array([[0],[1],[1],[0]])
nn_xor = NeuralNetwork(X,y)

for i in range(2500):
        nn_xor.feedforward()
        nn_xor.backprop()

print(nn_xor.output)

[[0.01343471 0.01335837 0.01333857 0.01334619]
 [0.98981196 0.98940522 0.99031923 0.98939153]
 [0.98702917 0.98740724 0.98695994 0.98732275]
 [0.01042928 0.01042558 0.01025811 0.01051545]]


In [215]:
df['regression'] = res

In [216]:
df

Unnamed: 0,password,real,ints,regression
0,tigger,True,"[116, 105, 103, 103, 101, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",9.995994e-01
1,freedom,True,"[102, 114, 101, 101, 100, 111, 109, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",9.996194e-01
2,"%i!a<V0X/@sE'/5]=B']s`,MRE",False,"[37, 105, 33, 97, 60, 86, 48, 88, 47, 64, 115, 69, 39, 47, 53, 93, 61, 66, 39, 93, 115, 96, 44, 77, 82, 69, 0, 0, 0, 0, 0, 0]",1.635846e-09
3,killer,True,"[107, 105, 108, 108, 101, 114, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",9.996668e-01
4,"ZxOkN'TY}]h3=[j~Kx""KW""ZC",False,"[90, 120, 79, 107, 78, 39, 84, 89, 125, 93, 104, 51, 61, 91, 106, 126, 75, 120, 34, 75, 87, 34, 90, 67, 0, 0, 0, 0, 0, 0, 0, 0]",8.544110e-80
5,123456,True,"[49, 50, 51, 52, 53, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",9.996344e-01
6,f.pJ>Vib?S#YCtlcBSZw~]2MtY$dP:RQ,False,"[102, 46, 112, 74, 62, 86, 105, 98, 63, 83, 35, 89, 67, 116, 108, 99, 66, 83, 90, 119, 126, 93, 50, 77, 116, 89, 36, 100, 80, 58, 82, 81]",3.698335e-07
7,maggie,True,"[109, 97, 103, 103, 105, 101, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",9.996215e-01
8,1234,True,"[49, 50, 51, 52, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]",9.997529e-01
9,"~LKya4,|*mDq~MP>3K3PMM_fYD",False,"[126, 76, 75, 121, 97, 52, 44, 124, 42, 109, 68, 113, 126, 77, 80, 62, 51, 75, 51, 80, 77, 77, 95, 102, 89, 68, 0, 0, 0, 0, 0, 0]",7.599325e-28
