<h1>1. Necessary Imports & Steps</h1>

In [1]:
import pandas as pd
import numpy as np
import re

<h1>2. Read the Original <b>PWLDS</b> Dataset</h1>
<ul>
    <li>Password: Representing history of passwords been compromised before</li>
    <li>Hash: Representing their corresponding hashes</li>
    <li>Count: Representing how many times have they been reported compromised</li>
</ul>

In [2]:
# Read target dataset
dataset = pd.read_csv("pwlds.csv")
dataset

Unnamed: 0,Password,Strength
0,7hqwv,0
1,cjml,0
2,asuy,0
3,kcyth,0
4,whcq,0
...,...,...
5000091,"U,c0Da#<EaW.2(Bd|pGScmj/bJ",4
5000092,0Y</Y{zz^O2*fAWh9G8hE+yEeF^_,4
5000093,"N,tYi',X0pMmnx6_=PLMG",4
5000094,"aBtn>DR_xk4AaEus$R8J!jZ^JKq""KCqt",4


In [3]:
# Add new feature 'Length' to target dataset - count the number of letters in the password
dataset['Length'] = dataset['Password'].str.len()
dataset['Password'] = dataset['Password'].fillna("").astype(str)
dataset

Unnamed: 0,Password,Strength,Length
0,7hqwv,0,5.0
1,cjml,0,4.0
2,asuy,0,4.0
3,kcyth,0,5.0
4,whcq,0,4.0
...,...,...,...
5000091,"U,c0Da#<EaW.2(Bd|pGScmj/bJ",4,26.0
5000092,0Y</Y{zz^O2*fAWh9G8hE+yEeF^_,4,28.0
5000093,"N,tYi',X0pMmnx6_=PLMG",4,21.0
5000094,"aBtn>DR_xk4AaEus$R8J!jZ^JKq""KCqt",4,32.0


<h1>3. Preprocessing Steps - Getting the Dataset Ready</h1>

In [4]:
def countAlphabets(password):
    return sum(char.isalpha() for char in password)
def countNumbers(password):
    return sum(char.isdigit() for char in password)
def countSpecialChars(password):
    return sum(not char.isalnum() for char in password)

dataset['Count(Alphabets)'] = dataset['Password'].apply(countAlphabets)
dataset['Count(Numerics)'] = dataset['Password'].apply(countNumbers)
dataset['Count(SpecialChars)'] = dataset['Password'].apply(countSpecialChars)
dataset

Unnamed: 0,Password,Strength,Length,Count(Alphabets),Count(Numerics),Count(SpecialChars)
0,7hqwv,0,5.0,4,1,0
1,cjml,0,4.0,4,0,0
2,asuy,0,4.0,4,0,0
3,kcyth,0,5.0,5,0,0
4,whcq,0,4.0,4,0,0
...,...,...,...,...,...,...
5000091,"U,c0Da#<EaW.2(Bd|pGScmj/bJ",4,26.0,17,2,7
5000092,0Y</Y{zz^O2*fAWh9G8hE+yEeF^_,4,28.0,16,4,8
5000093,"N,tYi',X0pMmnx6_=PLMG",4,21.0,14,2,5
5000094,"aBtn>DR_xk4AaEus$R8J!jZ^JKq""KCqt",4,32.0,24,2,6


In [5]:
def countUppercase(password):
    return sum(char.isupper() for char in password)
def countLowercase(password):
    return sum(char.islower() for char in password)

dataset['Count(Uppercase)'] = dataset['Password'].apply(countUppercase)
dataset['Count(Lowercase)'] = dataset['Password'].apply(countLowercase)
dataset


Unnamed: 0,Password,Strength,Length,Count(Alphabets),Count(Numerics),Count(SpecialChars),Count(Uppercase),Count(Lowercase)
0,7hqwv,0,5.0,4,1,0,0,4
1,cjml,0,4.0,4,0,0,0,4
2,asuy,0,4.0,4,0,0,0,4
3,kcyth,0,5.0,5,0,0,0,5
4,whcq,0,4.0,4,0,0,0,4
...,...,...,...,...,...,...,...,...
5000091,"U,c0Da#<EaW.2(Bd|pGScmj/bJ",4,26.0,17,2,7,8,9
5000092,0Y</Y{zz^O2*fAWh9G8hE+yEeF^_,4,28.0,16,4,8,9,7
5000093,"N,tYi',X0pMmnx6_=PLMG",4,21.0,14,2,5,8,6
5000094,"aBtn>DR_xk4AaEus$R8J!jZ^JKq""KCqt",4,32.0,24,2,6,12,12


In [6]:
def hasRepeatedCharacters(password):
    # Check if the password contains a sequence of the same character repeated, e.g., 'aaaa', 'bbbb', '@@@@', etc.
    if re.search(r"(.)\1{2,}", password):  # Checks for any character repeated 3 or more times consecutively
        return 1
    return 0

dataset['RepeatedChars'] = dataset['Password'].apply(hasRepeatedCharacters)
dataset

Unnamed: 0,Password,Strength,Length,Count(Alphabets),Count(Numerics),Count(SpecialChars),Count(Uppercase),Count(Lowercase),RepeatedChars
0,7hqwv,0,5.0,4,1,0,0,4,0
1,cjml,0,4.0,4,0,0,0,4,0
2,asuy,0,4.0,4,0,0,0,4,0
3,kcyth,0,5.0,5,0,0,0,5,0
4,whcq,0,4.0,4,0,0,0,4,0
...,...,...,...,...,...,...,...,...,...
5000091,"U,c0Da#<EaW.2(Bd|pGScmj/bJ",4,26.0,17,2,7,8,9,0
5000092,0Y</Y{zz^O2*fAWh9G8hE+yEeF^_,4,28.0,16,4,8,9,7,0
5000093,"N,tYi',X0pMmnx6_=PLMG",4,21.0,14,2,5,8,6,0
5000094,"aBtn>DR_xk4AaEus$R8J!jZ^JKq""KCqt",4,32.0,24,2,6,12,12,0


In [7]:
def caseRatio(row):
    if row['Count(Uppercase)'] == 0:
        return 0
    return row['Count(Lowercase)'] / row['Count(Uppercase)']

# Apply the function row-wise
dataset['CaseRatio'] = dataset.apply(caseRatio, axis=1)
dataset

Unnamed: 0,Password,Strength,Length,Count(Alphabets),Count(Numerics),Count(SpecialChars),Count(Uppercase),Count(Lowercase),RepeatedChars,CaseRatio
0,7hqwv,0,5.0,4,1,0,0,4,0,0.000000
1,cjml,0,4.0,4,0,0,0,4,0,0.000000
2,asuy,0,4.0,4,0,0,0,4,0,0.000000
3,kcyth,0,5.0,5,0,0,0,5,0,0.000000
4,whcq,0,4.0,4,0,0,0,4,0,0.000000
...,...,...,...,...,...,...,...,...,...,...
5000091,"U,c0Da#<EaW.2(Bd|pGScmj/bJ",4,26.0,17,2,7,8,9,0,1.125000
5000092,0Y</Y{zz^O2*fAWh9G8hE+yEeF^_,4,28.0,16,4,8,9,7,0,0.777778
5000093,"N,tYi',X0pMmnx6_=PLMG",4,21.0,14,2,5,8,6,0,0.750000
5000094,"aBtn>DR_xk4AaEus$R8J!jZ^JKq""KCqt",4,32.0,24,2,6,12,12,0,1.000000


In [8]:
# Taking the Strength Column to the End as Labels
strengthColumn = dataset.pop('Strength')
dataset['Strength'] = strengthColumn
dataset

Unnamed: 0,Password,Length,Count(Alphabets),Count(Numerics),Count(SpecialChars),Count(Uppercase),Count(Lowercase),RepeatedChars,CaseRatio,Strength
0,7hqwv,5.0,4,1,0,0,4,0,0.000000,0
1,cjml,4.0,4,0,0,0,4,0,0.000000,0
2,asuy,4.0,4,0,0,0,4,0,0.000000,0
3,kcyth,5.0,5,0,0,0,5,0,0.000000,0
4,whcq,4.0,4,0,0,0,4,0,0.000000,0
...,...,...,...,...,...,...,...,...,...,...
5000091,"U,c0Da#<EaW.2(Bd|pGScmj/bJ",26.0,17,2,7,8,9,0,1.125000,4
5000092,0Y</Y{zz^O2*fAWh9G8hE+yEeF^_,28.0,16,4,8,9,7,0,0.777778,4
5000093,"N,tYi',X0pMmnx6_=PLMG",21.0,14,2,5,8,6,0,0.750000,4
5000094,"aBtn>DR_xk4AaEus$R8J!jZ^JKq""KCqt",32.0,24,2,6,12,12,0,1.000000,4


<h1>4. Saving the Final Dataset (No Labels Yet)</h1>

In [9]:
dataset.to_csv("New-Featured-Dataset.csv", index=False)

***