In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
import math

In [None]:
!pip install pyahocorasick


In [None]:
import ahocorasick

In [None]:
# Passwords and dictionary words are arbitrary text. With pandas' default NA
# handling, values such as "NA", "null", "nan" or "None" are parsed as NaN,
# which would corrupt those passwords and drop those words from the vocabulary.
# Default NA parsing is therefore disabled; only an empty field in the numeric
# column of each file is still treated as missing.
data = pd.read_csv(
    '/kaggle/input/password-strength-classifier-dataset/data.csv',
    on_bad_lines='skip',           # drop malformed rows instead of raising
    keep_default_na=False,
    na_values={'strength': ['']},
)
vocabset = pd.read_csv(
    '/kaggle/input/english-word-frequency-list/ngram_freq.csv',
    keep_default_na=False,
    na_values={'count': ['']},
)

In [None]:
# Work on a copy so the frame loaded from disk (vocabset) is left untouched.
vocab_df = vocabset.copy()

Feature extraction and train/test split

In [None]:
def extract_features(password: str):
    """Count the basic character classes of ``password``.

    Returns a dict with, in this order, the keys ``length``, ``uppercase``,
    ``lowercase``, ``digits`` and ``special_chars``. A character counts as
    special when ``str.isalnum`` is false for it.
    """
    tally = {
        'length': len(password),
        'uppercase': 0,
        'lowercase': 0,
        'digits': 0,
        'special_chars': 0,
    }
    for ch in password:
        # Independent checks: each class is tested on its own predicate.
        if ch.isupper():
            tally['uppercase'] += 1
        if ch.islower():
            tally['lowercase'] += 1
        if ch.isdigit():
            tally['digits'] += 1
        if not ch.isalnum():
            tally['special_chars'] += 1
    return tally

In [None]:
# Force every password to a Python string so the per-character helpers work.
# NOTE(review): a missing value (NaN) becomes the literal text 'nan' here
# rather than being dropped — confirm this is intended.
data['password'] = data['password'].astype(str)

In [None]:
# One record per password: the raw text, its label and the character counts.
# Iterating the two columns in lockstep avoids three positional .iloc lookups
# per row, which dominate the runtime on a large frame.
new_df = pd.DataFrame([
    {'password': pw, 'strength': label, **extract_features(pw)}
    for pw, label in zip(data['password'], data['strength'])
])

In [None]:
# Display the freshly built feature table.
new_df

In [None]:
# Target labels, and every remaining column as the feature frame.
# NOTE(review): X still carries the raw 'password' text column — confirm it is
# removed before a model is fitted.
y = new_df['strength']
X = new_df.drop(columns=['strength'])

# Stratified, seeded split: 70% of the rows go to the test set and 30% to the
# training set, with the class distribution of y preserved in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.7, stratify=y, random_state=42
)

In [None]:
# Keep only vocabulary entries longer than one character; rows whose word is
# missing fail the comparison and are dropped as well.
actual_words = vocab_df.loc[vocab_df['word'].str.len().gt(1)]

In [None]:
# 25th / 50th / 75th / 90th percentile of the word counts, unpacked from the
# returned Series in that order.
q_25 , q_50 , q_75 , q_90 = actual_words['count'].quantile([0.25,0.5,0.75 ,0.90])

In [None]:
# Show the 25th-percentile count.
q_25

In [None]:
def classify_words_by_quantiles(df):
    """Label every word with a frequency tier derived from count quantiles.

    Parameters
    ----------
    df : DataFrame with at least a ``word`` and a numeric ``count`` column.

    Returns
    -------
    A new DataFrame with the columns ``word``, ``count`` and ``Occurance``.
    ``Occurance`` is an ordered categorical with the categories ``very_low``,
    ``low``, ``medium``, ``high`` and ``very_high``; the tier boundaries are
    the 25th, 50th, 75th and 90th percentile of ``count`` and each boundary
    belongs to the lower tier. Rows with a missing count get a missing tier.
    The input frame is not modified.
    """
    labels = ['very_low', 'low', 'medium', 'high', 'very_high']

    counts = df['count']
    thresholds = counts.quantile([0.25, 0.5, 0.75, 0.90]).to_numpy()
    values = counts.to_numpy(dtype=float)

    # Index of the first threshold that is >= the value, i.e. right-closed
    # bins exactly as pd.cut would build them. Unlike pd.cut this also works
    # when several quantiles coincide (heavily tied counts), where pd.cut
    # raises "Bin edges must be unique".
    codes = np.searchsorted(thresholds, values, side='left')
    codes[np.isnan(values)] = -1  # -1 marks a missing category

    new_vocab = df[['word', 'count']].copy()
    new_vocab['Occurance'] = pd.Categorical.from_codes(
        codes, categories=labels, ordered=True
    )
    return new_vocab

In [None]:
# Re-wrap the filtered vocabulary as a DataFrame (it appears to be one already).
actual_words = pd.DataFrame(actual_words)

# Attach a quantile-based frequency tier to every word.
new_vocab_df = classify_words_by_quantiles(actual_words)


In [None]:
# Display the tiered vocabulary.
new_vocab_df

In [None]:
# Plain Python list of the vocabulary words; .tolist() replaces a per-element
# .iloc loop and matches how password_list is built.
vocab_list = new_vocab_df['word'].tolist()

In [None]:
# Plain Python list of all passwords, in row order.
password_list = new_df['password'].tolist()

In [None]:
# Display the tiered vocabulary once more (nothing modifies it between the two displays).
new_vocab_df

In [None]:
# word -> frequency tier; if a word occurs more than once, the last row wins.
vocab_tiers = dict(zip(new_vocab_df['word'], new_vocab_df['Occurance']))

# Rank of each tier, starting at 1: a larger number outranks a smaller one.
tier_priority = {
    name: rank
    for rank, name in enumerate(
        ('very_low', 'low', 'medium', 'high', 'very_high'), start=1
    )
}

# Aho-Corasick automaton keyed by the lower-cased word. The stored payload is
# the same lower-cased word together with its tier.
automaton = ahocorasick.Automaton()
for word, tier in vocab_tiers.items():
    lowered = word.lower()
    automaton.add_word(lowered, (lowered, tier))

# The automaton has to be finalised before it can be searched.
automaton.make_automaton()

In [None]:
def check_password_debug(password):
    """Find vocabulary words contained in ``password`` and their best tier.

    The password is converted to ``str`` and lower-cased, then scanned with
    the module-level ``automaton``.

    Returns a tuple ``(words, tier)``: ``words`` is the sorted list of
    distinct matched words, and ``tier`` is the tier with the highest rank in
    ``tier_priority`` among the matches, or ``"none"`` when nothing matched
    (or no matched tier has a rank above 0).
    """
    lowered = str(password).lower()
    found = set()
    best_tier, best_rank = "none", 0

    for _end, payload in automaton.iter(lowered):
        vocab_word, vocab_tier = payload
        found.add(vocab_word)
        rank = tier_priority.get(vocab_tier, 0)  # unknown tiers rank lowest
        if rank <= best_rank:
            continue
        best_rank, best_tier = rank, vocab_tier

    return sorted(found), best_tier



In [None]:
# Run the matcher once per password and split the (words, tier) pairs into two
# columns. The columns are built from plain lists instead of wrapping every
# row's result in its own pandas Series, which is far cheaper on large frames.
vocab_matches = [check_password_debug(pw) for pw in new_df['password']]
new_df['contain_vocab'] = pd.Series(
    [words for words, _ in vocab_matches], index=new_df.index, dtype=object
)
new_df['vocab_tier'] = pd.Series(
    [tier for _, tier in vocab_matches], index=new_df.index
)

In [None]:
# Display the table with the new vocabulary columns.
new_df

In [None]:
# Display the same table again (identical to the previous cell's output).
new_df

In [None]:
# The matched-word lists were only needed for inspection; keep just the tier.
# errors='ignore' keeps the cell re-runnable once the column is already gone.
new_df = new_df.drop(columns='contain_vocab', errors='ignore')

In [None]:
# Display the table after removing contain_vocab.
new_df

In [None]:
def calculate_entropy(password):
    """Shannon entropy, in bits per character, of the characters in ``password``.

    This is a rough estimate based only on the character frequencies within
    the password itself. Returns 0 for an empty (falsy) password.
    """
    if not password:
        return 0

    # Tally how often each distinct character occurs.
    occurrences = {}
    for symbol in password:
        if symbol in occurrences:
            occurrences[symbol] += 1
        else:
            occurrences[symbol] = 1

    total = len(password)
    bits = 0.0
    for seen in occurrences.values():
        share = seen / total
        bits = bits - share * math.log2(share)
    return bits

Ratio feature extraction

In [None]:
def normalize(password):
    """Lower-case ``password`` and rewrite letters as their first look-alike symbol.

    Every lower-cased character that has an entry in the table below is
    replaced by the first symbol listed for it (so 'a' becomes '@' and 'w'
    becomes 'vv'); all other characters are kept as they are.

    NOTE(review): the mapping runs from letter to symbol, so alternative
    symbols already present in the input (for example '4' for 'a') are left
    untouched. If the intent was to undo leetspeak, the table would have to be
    inverted — confirm with the author.
    """
    # letter -> look-alike symbols; only the first entry of each list is used.
    # Letters without an entry (f, j, k, m, n, p, q, r, v, y) have no
    # look-alike and pass through unchanged.
    lookalikes = {
        'a': ['@', '4'],
        'b': ['8'],
        'c': ['(', '<', '{', '['],
        'd': ['|)'],
        'e': ['3'],
        'g': ['6', '9'],
        'h': ['#'],
        'i': ['1', '!'],
        'l': ['1', '|'],
        'o': ['0'],
        's': ['5', '$'],
        't': ['7'],
        'u': ['v'],
        'w': ['vv'],
        'x': ['%'],
        'z': ['2'],
    }
    first_choice = {letter: options[0] for letter, options in lookalikes.items()}
    return ''.join(first_choice.get(ch, ch) for ch in password.lower())

In [None]:
def ratio_feature_extract(password:str) :
    """Character-class counts, their share of the password length, and entropy.

    ``password`` is converted with ``str`` first. The returned dict holds, in
    this order: ``num_upper``, ``num_lower``, ``num_digits``, ``num_special``
    (everything that is not upper, lower or digit), the matching
    ``upper_ratio``, ``lower_ratio``, ``digit_ratio``, ``special_ratio``, and
    ``entropy`` from ``calculate_entropy``. For an empty password every count
    and ratio is 0.
    """
    password = str(password)
    size = len(password)

    uppers = sum(1 for c in password if c.isupper())
    lowers = sum(1 for c in password if c.islower())
    digits = sum(1 for c in password if c.isdigit())

    features = {
        'num_upper': uppers,
        'num_lower': lowers,
        'num_digits': digits,
        # Whatever is left over after the three classes above.
        'num_special': size - uppers - lowers - digits,
    }

    ratio_sources = (
        ('upper_ratio', 'num_upper'),
        ('lower_ratio', 'num_lower'),
        ('digit_ratio', 'num_digits'),
        ('special_ratio', 'num_special'),
    )
    for ratio_key, count_key in ratio_sources:
        # Guard against division by zero for an empty password.
        features[ratio_key] = features[count_key] / size if size else 0

    features['entropy'] = calculate_entropy(password)

    return features
    

In [None]:
# Compute the ratio/entropy features for every password; each element of the
# resulting Series is the dict returned by ratio_feature_extract.
features_series = data['password'].apply(ratio_feature_extract)

In [None]:
# Display the per-password feature dicts.
features_series

In [None]:
# Expand the list of dicts into a frame with one column per feature key.
features_df = pd.DataFrame(features_series.tolist())

In [None]:
# Put the two feature tables side by side; with axis=1 the rows are matched on
# their index labels.
new_df_all = pd.concat([new_df, features_df], axis=1)

In [66]:
# Persist the combined feature table. index=False is not passed, so the row
# index is written as the first (unnamed) column of the file.
new_df_all.to_csv('/kaggle/working/pretrain_dataset.csv')

Model building

In [None]:
# Reload the feature table written by the export cell (pretrain_dataset.csv).
# The file name previously used here is not produced by any earlier cell, so a
# fresh Restart & Run All would fail on it. index_col=0 restores the row index
# that to_csv stored as the first, unnamed column.
preprocessed_df = pd.read_csv('/kaggle/working/pretrain_dataset.csv', index_col=0)

In [None]:
# Display the reloaded table.
preprocessed_df