## Data processing

In [43]:
import pandas as pd
import string
import numpy as np
import warnings
warnings.filterwarnings(action='ignore')

In [44]:
df = pd.read_csv("Data/Users.csv")
df.head()

Unnamed: 0,index,password,strength
0,0,zxe870819,1
1,1,xw46454nr23l,1
2,2,soporte13,1
3,3,accounts6000webhost.com,2
4,4,c443balg,1


In [45]:
df.isnull().sum()

index       0
password    0
strength    0
dtype: int64

In [46]:
try:
    df.drop('index', axis = 'columns',inplace=True)
except Exception as e:
    print(e)

In [47]:
df.head()

Unnamed: 0,password,strength
0,zxe870819,1
1,xw46454nr23l,1
2,soporte13,1
3,accounts6000webhost.com,2
4,c443balg,1


In [48]:
def parse_string(password:str)-> dict:
    """ Args: string (password)
        (This function hleps us to count numerical, character, special_symbols uppercase and lowercase patterns from password)
        returns: dict """

    counts = {'zero': 0, 'one': 0, 'two': 0, 'three': 0, 'four': 0, 'five': 0, 'six': 0, 'seven': 0, 'eight': 0, 'nine': 0, 'nums':0, 'chars':0, 'upper':0, 'lower':0, 'special':0}
    for e in password:
        try:
            num = int(e)
            counts['nums'] += 1 
            counts.update({list(counts.keys())[num]:list(counts.values())[num] + 1})
        except:
            if e in string.punctuation:
                counts['special'] += 1
            else:
                counts['chars'] += 1
                if e.isupper():
                    counts['upper'] += 1
                else:
                    counts['lower'] += 1
            
    return counts

In [49]:
res = parse_string('accounts6000webhost.com')
print(res)

{'zero': 3, 'one': 0, 'two': 0, 'three': 0, 'four': 0, 'five': 0, 'six': 1, 'seven': 0, 'eight': 0, 'nine': 0, 'nums': 4, 'chars': 18, 'upper': 0, 'lower': 18, 'special': 1}


In [50]:
pass_patterns = []
for password, strength in zip(df.password, df.strength):
    temp = parse_string(password)
    pass_patterns.append(list(temp.values()) + [strength])
pass_patterns

[[1, 1, 0, 0, 0, 0, 0, 1, 2, 1, 6, 3, 0, 3, 0, 1],
 [0, 0, 1, 1, 3, 1, 1, 0, 0, 0, 7, 5, 0, 5, 0, 1],
 [0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 2, 7, 0, 7, 0, 1],
 [3, 0, 0, 0, 0, 0, 1, 0, 0, 0, 4, 18, 0, 18, 1, 2],
 [0, 0, 0, 1, 2, 0, 0, 0, 0, 0, 3, 5, 0, 5, 0, 1],
 [1, 1, 1, 1, 0, 0, 3, 1, 0, 0, 8, 1, 0, 1, 0, 1],
 [0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 3, 5, 0, 5, 0, 1],
 [1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 3, 12, 9, 3, 0, 2],
 [1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 6, 0, 6, 0, 1],
 [1, 2, 1, 0, 0, 1, 0, 1, 0, 0, 6, 3, 0, 3, 0, 1],
 [1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 4, 6, 0, 6, 0, 1],
 [0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 2, 6, 0, 6, 0, 1],
 [0, 0, 1, 1, 0, 3, 0, 1, 0, 1, 7, 3, 0, 3, 0, 1],
 [2, 0, 1, 0, 0, 1, 0, 0, 0, 0, 4, 4, 0, 4, 0, 1],
 [0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 3, 11, 2, 9, 0, 2],
 [0, 2, 2, 1, 3, 0, 0, 0, 1, 1, 10, 2, 0, 2, 0, 1],
 [0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 3, 7, 0, 7, 0, 1],
 [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 3, 6, 0, 6, 0, 1],
 [0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 2, 7, 0, 7, 0, 1],
 [0, 0, 0, 1, 0, 0, 0, 1, 

## Creating dataframe

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [52]:
df = pd.DataFrame(pass_patterns, columns=list(res.keys()) + ['strength'])
df.head()

Unnamed: 0,zero,one,two,three,four,five,six,seven,eight,nine,nums,chars,upper,lower,special,strength
0,1,1,0,0,0,0,0,1,2,1,6,3,0,3,0,1
1,0,0,1,1,3,1,1,0,0,0,7,5,0,5,0,1
2,0,1,0,1,0,0,0,0,0,0,2,7,0,7,0,1
3,3,0,0,0,0,0,1,0,0,0,4,18,0,18,1,2
4,0,0,0,1,2,0,0,0,0,0,3,5,0,5,0,1


In [53]:
df.to_csv("Data//password_params.csv")

In [54]:
X,y = df.drop('strength', axis='columns'), df['strength']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, stratify = df['strength'])
X_train.shape, y_test.shape

((80000, 15), (20000,))

In [55]:
model = LogisticRegression()
model.fit(X_train, y_train)
model.score(X_train, y_train)

0.999825

In [56]:
model.score(X_test, y_test)

0.99985

In [57]:
model.fit(X,y)

In [58]:
import joblib 
joblib.dump(model, 'params//password_strength_predictor_rf.pkl')

['params//password_strength_predictor_rf.pkl']