### IMPORT LIBRARIES

In [1]:
import re
import joblib
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from catboost import CatBoostClassifier

In [2]:
df = pd.read_csv('data.csv', usecols=['password', 'strength'])
df.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


### BASIC EDA AND DATA CLEANING

In [3]:
df['strength'].value_counts()

1                            496708
0                             89656
2                             82979
3                                 9
n                                 5
                              ...  
jakuzen.57@hotmail.com            1
axiselo@hotmail.com               1
metaren@yandex.com                1
melchazli@gmail.com               1
efsane.mardinli@gmail.com         1
Name: strength, Length: 443, dtype: int64

In [4]:
df.shape

(669879, 2)

In [5]:
# Convert the 'strength' column to numeric, setting errors='coerce' to convert non-numeric values to NaN
df['strength'] = pd.to_numeric(df['strength'], errors='coerce')

# Use boolean indexing to filter rows where 'strength' is in [0, 1, 2]
valid_strengths = [0, 1, 2]
df = df[df['strength'].isin(valid_strengths)]
df.shape

(669343, 2)

In [6]:
df['strength'].value_counts()

1.0    496708
0.0     89656
2.0     82979
Name: strength, dtype: int64

In [7]:
df.dropna(inplace=True)

### FEATURE ENGINEERING 

##### Character count features

In [8]:
df['length'] = df['password'].apply(len)
df['has_uppercase'] = df['password'].apply(lambda x: 1 if any(char.isupper() for char in x) else 0)
df['has_lowercase'] = df['password'].apply(lambda x: 1 if any(char.islower() for char in x) else 0)
df['has_number'] = df['password'].apply(lambda x: 1 if any(char.isdigit() for char in x) else 0)
df['has_special_char'] = df['password'].apply(lambda x: 1 if (pd.notna(x) and re.search(r'[!@#$%^&*(),.?":{}|<>]', x)) else 0)
df['uppercase_count'] = df['password'].apply(lambda x: sum(1 for char in x if char.isupper()))
df['lowercase_count'] = df['password'].apply(lambda x: sum(1 for char in x if char.islower()))
df['digit_count'] = df['password'].apply(lambda x: sum(1 for char in x if char.isdigit()))
df['special_char_count'] = df['password'].apply(lambda x: len([char for char in x if not char.isalnum()]))

##### Sequential character features

In [9]:
df['consecutive_upper'] = df['password'].apply(lambda x: max((len(run) for run in x.split() if run.isupper()), default=0))
df['consecutive_lower'] = df['password'].apply(lambda x: max((len(run) for run in x.split() if run.islower()), default=0))
df['consecutive_digits'] = df['password'].apply(lambda x: max((len(run) for run in x.split() if run.isdigit()), default=0))

##### Create a feature using the entropy score

In [10]:
# Function to calculate Shannon entropy
def calculate_entropy(password):
    char_count = len(password)
    if char_count == 0:
        return 0.0
    else:
        char_set = set(password)
        entropy_score = -sum((password.count(char) / char_count) * np.log2(password.count(char) / char_count) for char in char_set)
        return entropy_score

In [11]:
df['entropy'] = df['password'].apply(calculate_entropy)

##### Create binary features for each keyboard pattern

In [12]:
# keyboard_patterns = ["12345", "qwerty", "asdf", "password", "admin", "letmein", "123456", "abc123", "iloveyou", "monkey", "sunshine", "welcome", "superman", "princess", "dragon", "michael", "football", "baseball", "starwars", "shadow"]

# for pattern in keyboard_patterns:
#     df[f'has_{pattern}'] = df['password'].apply(lambda x: int(pattern in x))

#### n-grams (Subsequences)

In [13]:
# # Define specific bigrams to check for
# specific_bigrams = ["ab", "12", "zy", "xy", "qw", "as", "de", "56", "78", "cd", "fg", "jk", "mn", "pq", "uv", "wx"]

# specific_bigrams = ["ab", "12", "zy", "xy"]

# # Extract 2-grams (bigrams) from each password
# def extract_bigrams(text):
#     return [text[i:i+2] for i in range(len(text) - 1)]

# # Convert the list of passwords into space-separated strings
# corpus = [' '.join(extract_bigrams(password)) for password in df['password']]

# # Create a CountVectorizer to count the occurrence of specific bigrams
# vectorizer = CountVectorizer(vocabulary=specific_bigrams, binary=True, token_pattern=r'\S+')
# bigram_features = vectorizer.transform(corpus).toarray()

# # Create binary features for each specific bigram
# for i, bigram in enumerate(specific_bigrams):
#     df[f'has_{bigram}_bigram'] = bigram_features[:, i]

##### TRAIN MODEL

In [14]:
# Drop original password column
df.drop('password', axis=1, inplace=True)

# Define X and y
X = df.drop('strength', axis=1) 
y = df['strength']

In [15]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=23, stratify=y)

In [16]:
# Train CatBoost classifier
model = CatBoostClassifier(iterations=1000, verbose=100, learning_rate=0.1)
model.fit(x_train, y_train, eval_set=[(x_test, y_test)])

0:	learn: 0.9090852	test: 0.9090735	best: 0.9090735 (0)	total: 507ms	remaining: 8m 26s
100:	learn: 0.0013720	test: 0.0011799	best: 0.0011799 (100)	total: 46.9s	remaining: 6m 57s
200:	learn: 0.0012023	test: 0.0010365	best: 0.0010362 (193)	total: 1m 44s	remaining: 6m 53s
300:	learn: 0.0011548	test: 0.0010110	best: 0.0010110 (300)	total: 2m 26s	remaining: 5m 39s
400:	learn: 0.0011316	test: 0.0010040	best: 0.0010039 (396)	total: 3m 28s	remaining: 5m 11s
500:	learn: 0.0011148	test: 0.0009969	best: 0.0009969 (500)	total: 4m 27s	remaining: 4m 26s
600:	learn: 0.0011002	test: 0.0009948	best: 0.0009944 (596)	total: 5m 23s	remaining: 3m 34s
700:	learn: 0.0010874	test: 0.0009908	best: 0.0009907 (679)	total: 6m 21s	remaining: 2m 42s
800:	learn: 0.0010799	test: 0.0009915	best: 0.0009907 (679)	total: 7m 13s	remaining: 1m 47s
900:	learn: 0.0010707	test: 0.0009898	best: 0.0009897 (896)	total: 7m 56s	remaining: 52.4s
999:	learn: 0.0010655	test: 0.0009902	best: 0.0009897 (896)	total: 8m 42s	remaining: 0u

<catboost.core.CatBoostClassifier at 0x7f46b2a37d30>

In [17]:
# Make predictions
y_pred = model.predict(x_test)

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred, average='micro'))

Accuracy: 0.9998505996387228
F1 Score: 0.9998505996387228


In [18]:
print(metrics.classification_report(y_test, y_pred, zero_division=1))

              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     29586
         1.0       1.00      1.00      1.00    163914
         2.0       1.00      1.00      1.00     27383

    accuracy                           1.00    220883
   macro avg       1.00      1.00      1.00    220883
weighted avg       1.00      1.00      1.00    220883



In [19]:
# Save model
joblib.dump(model, 'model.joblib')

['model.joblib']