In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split

In [8]:
#Access to Data, that I download from Kaggle-Password Strength Classifier Dataset

In [14]:
data = pd.read_csv('data.csv', sep=',', on_bad_lines='skip')
data.head()

Unnamed: 0,password,strength
0,kzde5577,1
1,kino3434,1
2,visi7k1yr,1
3,megzy123,1
4,lamborghin1,1


In [15]:
#Data Manipulation, Preparing to ML

In [16]:
df = data.copy()
df.info

<bound method DataFrame.info of             password  strength
0           kzde5577         1
1           kino3434         1
2          visi7k1yr         1
3           megzy123         1
4        lamborghin1         1
...              ...       ...
669635    10redtux10         1
669636     infrared1         1
669637  184520socram         1
669638     marken22a         1
669639      fxx4pw4g         1

[669640 rows x 2 columns]>

In [17]:
df["strength"].value_counts()

strength
1    496801
0     89702
2     83137
Name: count, dtype: int64

In [18]:
df.dropna(inplace=True)

In [19]:
df.isnull().sum()

password    0
strength    0
dtype: int64

In [20]:
df["strength"] = data["strength"].map({0: "Weak",
                                      1: "Medium",
                                      2: "Strong"})
df.sample(5)

Unnamed: 0,password,strength
534685,seftya280990,Medium
328941,nurasad123,Medium
422007,jakarta95,Medium
249130,yousef1,Weak
291355,dxg2bzzt,Medium


In [21]:
password_ = np.array(df)

In [22]:
password_

array([['kzde5577', 'Medium'],
       ['kino3434', 'Medium'],
       ['visi7k1yr', 'Medium'],
       ...,
       ['184520socram', 'Medium'],
       ['marken22a', 'Medium'],
       ['fxx4pw4g', 'Medium']], dtype=object)

In [23]:
#Shuffle the dataset to help model to understand patterns and relations within the dataset

In [24]:
import random

In [25]:
random.shuffle(password_)

In [26]:
#Labels for variables and Features for Password

In [27]:
allpasswords = [s[0] for s in password_]
ylabels = [s[1] for s in password_]

In [28]:
len(ylabels), len(allpasswords)

(669639, 669639)

In [29]:
#Password Strength Prediction Model with Sklearn Lib
#Tokenization first before prediction

In [30]:
def word(password):
    character = []
    for i in password:
        character.append(i)
    return character

tdif = TfidfVectorizer(tokenizer = None, analyzer = word, encoding='utf-8')
X = tdif.fit_transform(allpasswords)
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size = 0.2, random_state = 42)

In [31]:
#Building the model

In [32]:
model = RandomForestClassifier()
model.fit(X_train, y_train)
model.score(X_test, y_test) #accuracy

0.9881204826473926

In [33]:
y_pred = model.predict(X_test)
y_pred

array(['Medium', 'Weak', 'Medium', ..., 'Medium', 'Medium', 'Medium'],
      dtype='<U6')

In [34]:
#Prediction with Input data

In [39]:
X_predict = ['sdasdasd',
            '123asd',
            'Asd3045+',
            'qwerhgd34---',
            '126432675',
             'ASDBERW',
            'Ya5f6n2m!',
            'abcdefgh',
            '12345']

In [40]:
X_predict = tdif.transform(X_predict)
y_predict = model.predict(X_predict)
y_predict

array(['Medium', 'Medium', 'Strong', 'Strong', 'Medium', 'Medium',
       'Medium', 'Medium', 'Medium'], dtype='<U6')

In [None]:
#Another test tool

In [42]:
import getpass
user = getpass.getpass("Enter Password: ")
data = tdif.transform([user]).toarray()
output = model.predict(data)
print(output)

['Medium']
