<h1>1. Necessary Imports & Steps</h1>

In [1]:
import pandas as pd
import numpy as np
import re

<h1>2. Read the Original <b>PWNED</b> Dataset</h1>
<ul>
    <li>Password: Representing history of passwords been compromised before</li>
    <li>Hash: Representing their corresponding hashes</li>
    <li>Count: Representing how many times have they been reported compromised</li>
</ul>

In [3]:
# Read target dataset
dataset = pd.read_csv("pwned.csv")
dataset

Unnamed: 0,password,hash,count
0,zzzzqwer1,4B7265DFDDE366D4F3EFA656F3A7A2A489152E5C,13
1,yuna1980,48724FF94344862149FF78D29F83A4E59DD22787,13
2,WERTU280505,CC85B678CACA66D9DFFF6D186A81B693C9A3CA52,13
3,yqdiy35m,B1A580123BB253819F878B264F91149EAECBB2F9,13
4,movynhxk,FFE6A5112C85BD347013A3BCAA2FC8C80971EBE2,13
...,...,...,...
684408,140073,AB2F000568F448769E64DC2CF7682CA66CC080D0,69
684409,210596n,D2BDD16E36C106BAEB7700710210F6C5B9AD4E79,69
684410,quotaji,8D9959D2389F5908372C3C649EBD67560E6A0183,69
684411,dfkthf12345,EDFAD69F388BCF89694B6E09F5C9FFF0F5EBF21E,69


In [4]:
# Removing hash from target dataset - not necessary for us at this moment
dataset.drop(columns=["hash"], inplace=True)
dataset

Unnamed: 0,password,count
0,zzzzqwer1,13
1,yuna1980,13
2,WERTU280505,13
3,yqdiy35m,13
4,movynhxk,13
...,...,...
684408,140073,69
684409,210596n,69
684410,quotaji,69
684411,dfkthf12345,69


In [5]:
# Add new feature 'length' to target dataset - count the number of letters in the password
dataset['length'] = dataset['password'].str.len()
dataset['password'] = dataset['password'].fillna("").astype(str)
dataset

Unnamed: 0,password,count,length
0,zzzzqwer1,13,9.0
1,yuna1980,13,8.0
2,WERTU280505,13,11.0
3,yqdiy35m,13,8.0
4,movynhxk,13,8.0
...,...,...,...
684408,140073,69,6.0
684409,210596n,69,7.0
684410,quotaji,69,7.0
684411,dfkthf12345,69,11.0


In [6]:
# Read common password's dataset - rockyou has millions of common passwords in the history
commonPasswords = pd.read_csv("rockyou.csv", header=None)
commonPasswords

Unnamed: 0,0
0,CommonPasswords
1,123456
2,12345
3,123456789
4,password
...,...
14344387,"xCvBnM,"
14344388,ie168
14344389,abygurl69
14344390,a6_123


<h1>3. Preprocessing Steps - Getting the Dataset Ready</h1>

In [7]:
# If a password in PWNED dataset is common/1 or not/0 - comparing each password from PWNED to all the password of ROCKYOU
commonSet = set(commonPasswords.iloc[:, 0]) # Convert commonPasswords to a set (exclude repetitive values)
dataset['common'] = np.vectorize(lambda x: 1 if x in commonSet else 0)(dataset['password']) # Use a NumPy vectorized function for speed
dataset

Unnamed: 0,password,count,length,common
0,zzzzqwer1,13,9.0,0
1,yuna1980,13,8.0,0
2,WERTU280505,13,11.0,0
3,yqdiy35m,13,8.0,0
4,movynhxk,13,8.0,0
...,...,...,...,...
684408,140073,69,6.0,0
684409,210596n,69,7.0,0
684410,quotaji,69,7.0,0
684411,dfkthf12345,69,11.0,0


In [8]:
def hasAlphabets(password):
    return 1 if any(char.isalpha() for char in password) else 0
def hasNumbers(password):
    return 1 if any(char.isdigit() for char in password) else 0
def hasSpecialChars(password):
    return 1 if any(not char.isalnum() for char in password) else 0

dataset['alphabets'] = dataset['password'].apply(hasAlphabets)
dataset['numbers'] = dataset['password'].apply(hasNumbers)
dataset['special'] = dataset['password'].apply(hasSpecialChars)
dataset

Unnamed: 0,password,count,length,common,alphabets,numbers,special
0,zzzzqwer1,13,9.0,0,1,1,0
1,yuna1980,13,8.0,0,1,1,0
2,WERTU280505,13,11.0,0,1,1,0
3,yqdiy35m,13,8.0,0,1,1,0
4,movynhxk,13,8.0,0,1,0,0
...,...,...,...,...,...,...,...
684408,140073,69,6.0,0,0,1,0
684409,210596n,69,7.0,0,1,1,0
684410,quotaji,69,7.0,0,1,0,0
684411,dfkthf12345,69,11.0,0,1,1,0


In [9]:
def countAlphabets(password):
    return sum(char.isalpha() for char in password)
def countNumbers(password):
    return sum(char.isdigit() for char in password)
def countSpecialChars(password):
    return sum(not char.isalnum() for char in password)

dataset['alpha-count'] = dataset['password'].apply(countAlphabets)
dataset['numeric-count'] = dataset['password'].apply(countNumbers)
dataset['special-count'] = dataset['password'].apply(countSpecialChars)
dataset

Unnamed: 0,password,count,length,common,alphabets,numbers,special,alpha-count,numeric-count,special-count
0,zzzzqwer1,13,9.0,0,1,1,0,8,1,0
1,yuna1980,13,8.0,0,1,1,0,4,4,0
2,WERTU280505,13,11.0,0,1,1,0,5,6,0
3,yqdiy35m,13,8.0,0,1,1,0,6,2,0
4,movynhxk,13,8.0,0,1,0,0,8,0,0
...,...,...,...,...,...,...,...,...,...,...
684408,140073,69,6.0,0,0,1,0,0,6,0
684409,210596n,69,7.0,0,1,1,0,1,6,0
684410,quotaji,69,7.0,0,1,0,0,7,0,0
684411,dfkthf12345,69,11.0,0,1,1,0,6,5,0


In [10]:
def hasUppercase(password):
    return 1 if any(char.isupper() for char in password) else 0
def hasLowercase(password):
    return 1 if any(char.islower() for char in password) else 0

dataset['uppercase'] = dataset['password'].apply(hasUppercase)
dataset['lowercase'] = dataset['password'].apply(hasLowercase)
dataset

Unnamed: 0,password,count,length,common,alphabets,numbers,special,alpha-count,numeric-count,special-count,uppercase,lowercase
0,zzzzqwer1,13,9.0,0,1,1,0,8,1,0,0,1
1,yuna1980,13,8.0,0,1,1,0,4,4,0,0,1
2,WERTU280505,13,11.0,0,1,1,0,5,6,0,1,0
3,yqdiy35m,13,8.0,0,1,1,0,6,2,0,0,1
4,movynhxk,13,8.0,0,1,0,0,8,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
684408,140073,69,6.0,0,0,1,0,0,6,0,0,0
684409,210596n,69,7.0,0,1,1,0,1,6,0,0,1
684410,quotaji,69,7.0,0,1,0,0,7,0,0,0,1
684411,dfkthf12345,69,11.0,0,1,1,0,6,5,0,0,1


In [11]:


def hasSequentialPatterns(password):
    alphabeticSequence = 'abcdefghijklmnopqrstuvwxyz'
    numericSequence = '0123456789'
    commonPatterns = ['qwe', 'asd', 'zxc', 'ghj', 'iop', 'jkl', 'bnm']

    for pattern in commonPatterns:
        if pattern in password.lower():
            return 1
    if any(password.lower() == alphabeticSequence[i:i+len(password)] for i in range(len(alphabeticSequence)-len(password)+1)):
        return 1
    if any(password == numericSequence[i:i+len(password)] for i in range(len(numericSequence)-len(password)+1)):
        return 1
    return 0

dataset['sequential-pattern'] = dataset['password'].apply(hasSequentialPatterns)
dataset

Unnamed: 0,password,count,length,common,alphabets,numbers,special,alpha-count,numeric-count,special-count,uppercase,lowercase,sequential-pattern
0,zzzzqwer1,13,9.0,0,1,1,0,8,1,0,0,1,1
1,yuna1980,13,8.0,0,1,1,0,4,4,0,0,1,0
2,WERTU280505,13,11.0,0,1,1,0,5,6,0,1,0,0
3,yqdiy35m,13,8.0,0,1,1,0,6,2,0,0,1,0
4,movynhxk,13,8.0,0,1,0,0,8,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
684408,140073,69,6.0,0,0,1,0,0,6,0,0,0,0
684409,210596n,69,7.0,0,1,1,0,1,6,0,0,1,0
684410,quotaji,69,7.0,0,1,0,0,7,0,0,0,1,0
684411,dfkthf12345,69,11.0,0,1,1,0,6,5,0,0,1,0


In [12]:
def hasRepeatedCharacters(password):
    # Check if the password contains a sequence of the same character repeated, e.g., 'aaaa', 'bbbb', '@@@@', etc.
    if re.search(r"(.)\1{2,}", password):  # Checks for any character repeated 3 or more times consecutively
        return 1
    return 0

dataset['repeated-characters'] = dataset['password'].apply(hasRepeatedCharacters)
dataset

Unnamed: 0,password,count,length,common,alphabets,numbers,special,alpha-count,numeric-count,special-count,uppercase,lowercase,sequential-pattern,repeated-characters
0,zzzzqwer1,13,9.0,0,1,1,0,8,1,0,0,1,1,1
1,yuna1980,13,8.0,0,1,1,0,4,4,0,0,1,0,0
2,WERTU280505,13,11.0,0,1,1,0,5,6,0,1,0,0,0
3,yqdiy35m,13,8.0,0,1,1,0,6,2,0,0,1,0,0
4,movynhxk,13,8.0,0,1,0,0,8,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684408,140073,69,6.0,0,0,1,0,0,6,0,0,0,0,0
684409,210596n,69,7.0,0,1,1,0,1,6,0,0,1,0,0
684410,quotaji,69,7.0,0,1,0,0,7,0,0,0,1,0,0
684411,dfkthf12345,69,11.0,0,1,1,0,6,5,0,0,1,0,0


In [13]:

def caseRatio(password):
    # Count lowercase and uppercase characters
    lowercase_count = sum(1 for char in password if char.islower())
    uppercase_count = sum(1 for char in password if char.isupper())
    
    # Avoid division by zero if no uppercase letters
    if uppercase_count == 0:
        return 0
    else:
        return lowercase_count / uppercase_count

dataset['case-ratio'] = dataset['password'].apply(caseRatio)
dataset

Unnamed: 0,password,count,length,common,alphabets,numbers,special,alpha-count,numeric-count,special-count,uppercase,lowercase,sequential-pattern,repeated-characters,case-ratio
0,zzzzqwer1,13,9.0,0,1,1,0,8,1,0,0,1,1,1,0.0
1,yuna1980,13,8.0,0,1,1,0,4,4,0,0,1,0,0,0.0
2,WERTU280505,13,11.0,0,1,1,0,5,6,0,1,0,0,0,0.0
3,yqdiy35m,13,8.0,0,1,1,0,6,2,0,0,1,0,0,0.0
4,movynhxk,13,8.0,0,1,0,0,8,0,0,0,1,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
684408,140073,69,6.0,0,0,1,0,0,6,0,0,0,0,0,0.0
684409,210596n,69,7.0,0,1,1,0,1,6,0,0,1,0,0,0.0
684410,quotaji,69,7.0,0,1,0,0,7,0,0,0,1,0,0,0.0
684411,dfkthf12345,69,11.0,0,1,1,0,6,5,0,0,1,0,0,0.0


In [14]:
dataset.drop('alphabets', axis=1, inplace=True)
dataset.drop('numbers', axis=1, inplace=True)
dataset.drop('special', axis=1, inplace=True)
dataset

Unnamed: 0,password,count,length,common,alpha-count,numeric-count,special-count,uppercase,lowercase,sequential-pattern,repeated-characters,case-ratio
0,zzzzqwer1,13,9.0,0,8,1,0,0,1,1,1,0.0
1,yuna1980,13,8.0,0,4,4,0,0,1,0,0,0.0
2,WERTU280505,13,11.0,0,5,6,0,1,0,0,0,0.0
3,yqdiy35m,13,8.0,0,6,2,0,0,1,0,0,0.0
4,movynhxk,13,8.0,0,8,0,0,0,1,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
684408,140073,69,6.0,0,0,6,0,0,0,0,0,0.0
684409,210596n,69,7.0,0,1,6,0,0,1,0,0,0.0
684410,quotaji,69,7.0,0,7,0,0,0,1,0,0,0.0
684411,dfkthf12345,69,11.0,0,6,5,0,0,1,0,0,0.0


<h1>4. Saving the Final Dataset (No Labels Yet)</h1>

In [None]:
dataset.to_csv("Featured-Dataset.csv", index=False)

***