<h1>1. Necessary Imports & Steps</h1>

In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

<h1>2. Read Featured-Dataset</h1>

In [2]:
# Load the engineered feature table, discard incomplete rows, and set the raw
# password text aside so that only the feature columns remain.
# dropna() keeps the original row labels (no reset_index), so the surviving
# rows can still be matched back to the source file by index later on.
# NOTE(review): read_csv's default NA parsing turns literal strings such as
# "null" or "NA" into NaN — confirm no genuine passwords are dropped here.
dataset = (
    pd.read_csv("Featured-Dataset.csv")
    .dropna()
    .drop(columns=['password'])
)
dataset

Unnamed: 0,count,length,common,alpha-count,numeric-count,special-count,uppercase,lowercase,sequential-pattern,repeated-characters,case-ratio
0,13,9.0,0,8,1,0,0,1,1,1,0.0
1,13,8.0,0,4,4,0,0,1,0,0,0.0
2,13,11.0,0,5,6,0,1,0,0,0,0.0
3,13,8.0,0,6,2,0,0,1,0,0,0.0
4,13,8.0,0,8,0,0,0,1,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
684408,69,6.0,0,0,6,0,0,0,0,0,0.0
684409,69,7.0,0,1,6,0,0,1,0,0,0.0
684410,69,7.0,0,7,0,0,0,1,0,0,0.0
684411,69,11.0,0,6,5,0,0,1,0,0,0.0


<h1>3. Apply K-Means with 10 Clusters - Based on Elbow Method (<small><i>03-howmanyclusters.ipynb</i></small>)</h1>

In [3]:
# Features used for clustering. They are listed explicitly so that re-running
# this cell never feeds the 'strength' column (added below) back into the model.
featureColumns = ['count', 'length', 'common', 'alpha-count', 'numeric-count',
                  'special-count', 'uppercase', 'lowercase', 'sequential-pattern',
                  'repeated-characters', 'case-ratio']

# Scale the data first (important for K-Means, which is distance-based).
scaler = StandardScaler()
scaledFeatures = scaler.fit_transform(dataset[featureColumns])

# Bind the estimator to its own name. Assigning it to `KMeans` would shadow the
# imported class, and a second run of this cell would then fail with
# "'KMeans' object is not callable".
kmeansModel = KMeans(n_clusters=10, random_state=42)

# Store each row's cluster id (0-9) in 'strength'; the ids are plain cluster
# labels at this point, not yet strength categories.
dataset['strength'] = kmeansModel.fit_predict(scaledFeatures)
dataset

Unnamed: 0,count,length,common,alpha-count,numeric-count,special-count,uppercase,lowercase,sequential-pattern,repeated-characters,case-ratio,strength
0,13,9.0,0,8,1,0,0,1,1,1,0.0,6
1,13,8.0,0,4,4,0,0,1,0,0,0.0,3
2,13,11.0,0,5,6,0,1,0,0,0,0.0,0
3,13,8.0,0,6,2,0,0,1,0,0,0.0,8
4,13,8.0,0,8,0,0,0,1,0,0,0.0,8
...,...,...,...,...,...,...,...,...,...,...,...,...
684408,69,6.0,0,0,6,0,0,0,0,0,0.0,4
684409,69,7.0,0,1,6,0,0,1,0,0,0.0,3
684410,69,7.0,0,7,0,0,0,1,0,0,0.0,8
684411,69,11.0,0,6,5,0,0,1,0,0,0.0,8


<h1>4. Labelling Clusters 0–2 as 'Weak', Clusters 3–6 as 'Moderate' and Clusters 7–9 as 'Strong'</h1>

In [4]:
# Translate every K-Means cluster id into one of three strength categories.
# NOTE(review): K-Means assigns cluster ids arbitrarily, so ids 0-2 are not
# inherently "weaker" than 7-9. Verify this grouping against the cluster
# centroids before relying on the labels.
clusterToStrength = {
    **dict.fromkeys((0, 1, 2), 'weak'),
    **dict.fromkeys((3, 4, 5, 6), 'moderate'),
    **dict.fromkeys((7, 8, 9), 'strong'),
}
dataset['strength'] = dataset['strength'].replace(clusterToStrength)
dataset

Unnamed: 0,count,length,common,alpha-count,numeric-count,special-count,uppercase,lowercase,sequential-pattern,repeated-characters,case-ratio,strength
0,13,9.0,0,8,1,0,0,1,1,1,0.0,moderate
1,13,8.0,0,4,4,0,0,1,0,0,0.0,moderate
2,13,11.0,0,5,6,0,1,0,0,0,0.0,weak
3,13,8.0,0,6,2,0,0,1,0,0,0.0,strong
4,13,8.0,0,8,0,0,0,1,0,0,0.0,strong
...,...,...,...,...,...,...,...,...,...,...,...,...
684408,69,6.0,0,0,6,0,0,0,0,0,0.0,moderate
684409,69,7.0,0,1,6,0,0,1,0,0,0.0,moderate
684410,69,7.0,0,7,0,0,0,1,0,0,0.0,strong
684411,69,11.0,0,6,5,0,0,1,0,0,0.0,strong


In [5]:
# Tally the rows per strength category once, then read off each label.
# A label that never occurs is reported as 0.
strengthCounts = dataset['strength'].value_counts()
strongCount = int(strengthCounts.get('strong', 0))
moderateCount = int(strengthCounts.get('moderate', 0))
weakCount = int(strengthCounts.get('weak', 0))

# NOTE(review): nothing in this notebook relaxes any rules — the wording of
# these messages may be left over from another notebook; confirm.
print(f"Number of strong passwords after relaxing rules: {strongCount}")
print(f"Number of moderate passwords after relaxing rules: {moderateCount}")
print(f"Number of weak passwords after relaxing rules: {weakCount}")

Number of strong passwords after relaxing rules: 132924
Number of moderate passwords after relaxing rules: 251618
Number of weak passwords after relaxing rules: 299868


<h1>5. Saving the Finalized Dataset</h1>

In [6]:
# Re-read the raw passwords from the source file. Only the 'password' column
# is needed, so the other columns are not parsed.
tempDataset = pd.read_csv("Featured-Dataset.csv", usecols=['password'])
passwordColumn = tempDataset['password']

# Insert the password column at the beginning of `dataset`.
# insert() aligns the Series on the row index; `dataset` still carries the
# source file's row labels, so each row receives its own password.
# The guard makes the cell safe to re-run: insert() raises ValueError when the
# column already exists.
if 'password' not in dataset.columns:
    dataset.insert(0, 'password', passwordColumn)
dataset

Unnamed: 0,password,count,length,common,alpha-count,numeric-count,special-count,uppercase,lowercase,sequential-pattern,repeated-characters,case-ratio,strength
0,zzzzqwer1,13,9.0,0,8,1,0,0,1,1,1,0.0,moderate
1,yuna1980,13,8.0,0,4,4,0,0,1,0,0,0.0,moderate
2,WERTU280505,13,11.0,0,5,6,0,1,0,0,0,0.0,weak
3,yqdiy35m,13,8.0,0,6,2,0,0,1,0,0,0.0,strong
4,movynhxk,13,8.0,0,8,0,0,0,1,0,0,0.0,strong
...,...,...,...,...,...,...,...,...,...,...,...,...,...
684408,140073,69,6.0,0,0,6,0,0,0,0,0,0.0,moderate
684409,210596n,69,7.0,0,1,6,0,0,1,0,0,0.0,moderate
684410,quotaji,69,7.0,0,7,0,0,0,1,0,0,0.0,strong
684411,dfkthf12345,69,11.0,0,6,5,0,0,1,0,0,0.0,strong


In [7]:
# Write the labelled dataset to disk; index=False keeps the DataFrame's row
# labels out of the CSV.
dataset.to_csv(path_or_buf="Final-Dataset.csv", index=False)

***