### IMPORT LIBRARIES

In [1]:
import re
from collections import Counter

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier

In [2]:
# Load only the 'password' and 'strength' columns from data.csv.
# NOTE(review): read_csv's default NA handling turns literal values such as
# "NA" or "null" into NaN — presumably real passwords spelled that way are
# dropped later by dropna(); confirm whether keep_default_na=False is wanted.
df = pd.read_csv('data.csv', usecols=['password', 'strength'])
# Preview the first rows as a quick sanity check.
df.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


### BASIC EDA AND DATA CLEANING

In [3]:
# Tally every distinct raw value found in 'strength' to spot malformed labels.
df['strength'].value_counts()

1                            496708
0                             89656
2                             82979
3                                 9
n                                 5
                              ...  
jakuzen.57@hotmail.com            1
axiselo@hotmail.com               1
metaren@yandex.com                1
melchazli@gmail.com               1
efsane.mardinli@gmail.com         1
Name: strength, Length: 443, dtype: int64

In [4]:
# (rows, columns) of the frame before any cleaning is applied.
df.shape

(669879, 2)

In [5]:
# The only labels accepted as valid strength classes.
valid_strengths = [0, 1, 2, 3]

# Parse 'strength' as numbers (anything unparseable becomes NaN), then keep
# just the rows whose parsed label is one of the accepted classes.
df = (
    df.assign(strength=pd.to_numeric(df['strength'], errors='coerce'))
      .loc[lambda frame: frame['strength'].isin(valid_strengths)]
)
df.shape

(669352, 2)

In [6]:
# Re-check the label distribution now that invalid labels were filtered out.
df['strength'].value_counts()

1.0    496708
0.0     89656
2.0     82979
3.0         9
Name: strength, dtype: int64

In [7]:
# Discard every row that still has a missing value in any column.
df = df.dropna()

### FEATURE ENGINEERING 

##### Character count features

In [10]:
def count_matching(text, predicate):
    """Return how many characters of `text` satisfy `predicate(char)`."""
    return sum(1 for char in text if predicate(char))


def is_special(char):
    """Return True for any character that is neither a letter nor a digit."""
    return not char.isalnum()


# Count each character class once. The binary has_* flags are derived from
# these counts, so a flag can never disagree with its matching count.
# In particular has_special_char now uses the same "non-alphanumeric"
# definition as special_char_count instead of a short punctuation whitelist
# that missed characters such as '_', '-', '+', '=' or ';'.
passwords = df['password']
upper_counts = passwords.apply(lambda pw: count_matching(pw, str.isupper))
lower_counts = passwords.apply(lambda pw: count_matching(pw, str.islower))
digit_counts = passwords.apply(lambda pw: count_matching(pw, str.isdigit))
special_counts = passwords.apply(lambda pw: count_matching(pw, is_special))

# Columns are assigned in their original order so the feature layout is unchanged.
df['length'] = passwords.apply(len)
df['has_uppercase'] = (upper_counts > 0).astype('int64')
df['has_lowercase'] = (lower_counts > 0).astype('int64')
df['has_number'] = (digit_counts > 0).astype('int64')
df['has_special_char'] = (special_counts > 0).astype('int64')
df['uppercase_count'] = upper_counts
df['lowercase_count'] = lower_counts
df['digit_count'] = digit_counts
df['special_char_count'] = special_counts

##### Sequential character features

In [None]:
def longest_run(text, predicate):
    """Return the length of the longest unbroken run of characters in `text`
    for which `predicate(char)` is true, or 0 when no character matches."""
    best = current = 0
    for char in text:
        # A matching character extends the current run; anything else resets it.
        current = current + 1 if predicate(char) else 0
        if current > best:
            best = current
    return best


# The previous version split each password on whitespace and called max() on
# the matching tokens, which raised ValueError whenever no token matched and
# never measured runs inside a password. Scan character by character instead.
df['consecutive_upper'] = df['password'].apply(lambda pw: longest_run(pw, str.isupper))
df['consecutive_lower'] = df['password'].apply(lambda pw: longest_run(pw, str.islower))
df['consecutive_digits'] = df['password'].apply(lambda pw: longest_run(pw, str.isdigit))

##### Create a feature using the entropy score

In [None]:
# Function to calculate Shannon entropy
def calculate_entropy(password):
    """Return the Shannon entropy of `password`, in bits per character.

    The entropy is computed from the relative frequency of each distinct
    character: -sum(p * log2(p)). An empty password yields 0.0.

    Parameters
    ----------
    password : str
        The text to score.

    Returns
    -------
    float
        0.0 for an empty string, otherwise the entropy of the character
        distribution (0 when every character is the same).
    """
    char_count = len(password)
    if char_count == 0:
        return 0.0

    # Counter tallies all characters in one pass, instead of re-scanning the
    # whole password with str.count() for every distinct character.
    frequencies = Counter(password)
    entropy_score = -sum(
        (count / char_count) * np.log2(count / char_count)
        for count in frequencies.values()
    )
    return entropy_score

In [None]:
# Score every password with calculate_entropy and store it as a new feature.
df['entropy'] = df['password'].map(calculate_entropy)

##### Create binary features for each keyboard pattern

In [None]:
# keyboard_patterns = ["12345", "qwerty", "asdf", "password", "admin", "letmein", "123456", "abc123", "iloveyou", "monkey", "sunshine", "welcome", "superman", "princess", "dragon", "michael", "football", "baseball", "starwars", "shadow"]

# for pattern in keyboard_patterns:
#     df[f'has_{pattern}'] = df['password'].apply(lambda x: int(pattern in x))

##### n-grams (subsequences)

In [None]:
# # Define specific bigrams to check for
# specific_bigrams = ["ab", "12", "zy", "xy", "qw", "as", "de", "56", "78", "cd", "fg", "jk", "mn", "pq", "uv", "wx"]

# specific_bigrams = ["ab", "12", "zy", "xy"]

# # Extract 2-grams (bigrams) from each password
# def extract_bigrams(text):
#     return [text[i:i+2] for i in range(len(text) - 1)]

# # Convert the list of passwords into space-separated strings
# corpus = [' '.join(extract_bigrams(password)) for password in df['password']]

# # Create a CountVectorizer to count the occurrence of specific bigrams
# vectorizer = CountVectorizer(vocabulary=specific_bigrams, binary=True, token_pattern=r'\S+')
# bigram_features = vectorizer.transform(corpus).toarray()

# # Create binary features for each specific bigram
# for i, bigram in enumerate(specific_bigrams):
#     df[f'has_{bigram}_bigram'] = bigram_features[:, i]