In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')


In [2]:
# Load the dataset from final.csv (path is relative to the notebook's working directory).
data=pd.read_csv('final.csv')


In [3]:
# Display the (rows, columns) dimensions of the loaded frame as a quick sanity check.
data.shape

(454482, 17)

In [4]:
# Label-encode the 'Passwords' column: every value is cast to str and then replaced
# by the integer code LabelEncoder assigns to it.
# NOTE(review): this overwrites data['Passwords'] in place, so re-running the cell
# encodes the already-encoded values a second time — reload `data` before re-running.
# NOTE(review): 'Passwords' is not among the columns dropped when X is built later, so
# these arbitrary integer codes reach the model as a feature — confirm that is intended.
from sklearn.preprocessing import LabelEncoder
number = LabelEncoder()
data['Passwords'] = number.fit_transform(data['Passwords'].astype('str'))

In [5]:
# Feature matrix: every column except the target, the strength label and the
# 'Unnamed: 0' column (presumably a leftover CSV index — verify against final.csv).
X = data.drop(columns=['Password_Source', 'Password_Strength', 'Unnamed: 0'])
# Target vector: the source each password came from.
Y = data['Password_Source']

### OverSampling

In [6]:
# Balance the classes by synthesising extra minority-class samples with SMOTE.
# `fit_sample` was deprecated and later removed from imbalanced-learn; `fit_resample`
# is the supported name. random_state=8 matches the seed used by the searches further
# down so the synthetic samples are reproducible.
# NOTE(review): the resampling runs before the train/test split in the next cell, so
# synthetic points interpolated from rows that end up in the test set can land in the
# training set. Test scores are therefore likely optimistic; consider splitting first
# and oversampling the training portion only.
from imblearn.over_sampling import SMOTE
X, Y = SMOTE(random_state=8).fit_resample(X, Y)



In [7]:
# Split into training and testing sets (70% train / 30% test).
# random_state pins the shuffle so the split, and every score computed from it, is
# reproducible across kernel restarts; 8 is the seed the hyper-parameter searches use.
from sklearn.model_selection import train_test_split

xtrain, xtest, ytrain, ytest = train_test_split(X, Y, test_size=0.3, random_state=8)

### Randomised Search for SVM hyperparameters

In [8]:
# Candidate values for each SVC hyper-parameter the randomised search samples from.
kernel = ['linear', 'rbf', 'poly']
C = [0.0001, 0.001, 0.01]
gamma = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]
degree = list(range(1, 6))
probability = [True]

# Keyword order fixes the key order of the dict, and therefore the printed output.
random_grid = dict(
    C=C,
    kernel=kernel,
    gamma=gamma,
    degree=degree,
    probability=probability,
)

print(random_grid)



{'C': [0.0001, 0.001, 0.01], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100], 'degree': [1, 2, 3, 4, 5], 'probability': [True]}


In [None]:
# Randomised search over `random_grid`: 10 sampled settings, each scored by accuracy
# with 3-fold cross-validation on the training split. Both the SVC and the sampler
# are seeded with 8. cache_size is the SVC kernel cache (scikit-learn documents the
# unit as MB).
svc = svm.SVC(cache_size=7000, random_state=8)
random_search_values = RandomizedSearchCV(
    svc,
    random_grid,
    n_iter=10,
    cv=3,
    scoring='accuracy',
    random_state=8,
    verbose=1,
)

random_search_values.fit(xtrain, ytrain)



Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


In [None]:
# Report the winning configuration of the randomised search and its best score,
# one item per line with a blank line between the two parts.
print(
    "The best hyperparameters for SVM from Randomised search are:",
    random_search_values.best_params_,
    "",
    "The mean accuracy of a model with the hyperparameters obtained from randomised search is:",
    random_search_values.best_score_,
    sep="\n",
)

### Grid search for SVM hyperparameters

In [None]:
# Narrowed parameter grid for the exhaustive search (values are hard-coded here and
# meant to reflect the outcome of the randomised search). One sub-grid per kernel so
# each kernel is only combined with the hyper-parameters listed for it.
C = [.0001, .001, .01, .1]
gamma = [1, 10, 100]
degree = [3, 4, 5]
probability = [True]

param_grid = [
    dict(C=C, kernel=['linear'], probability=probability),
    dict(C=C, kernel=['poly'], degree=degree, probability=probability),
    dict(C=C, kernel=['rbf'], gamma=gamma, probability=probability),
]

# Three seeded random shuffles of the training data, each holding out 33% for scoring.
cv_sets = ShuffleSplit(n_splits=3, test_size=.33, random_state=8)
svc = svm.SVC(random_state=8)
grid_search_values = GridSearchCV(
    svc,
    param_grid,
    cv=cv_sets,
    scoring='accuracy',
    verbose=1,
)
grid_search_values.fit(xtrain,ytrain)

In [None]:
# Report the winning configuration of the grid search and its best score,
# one item per line with a blank line between the two parts.
print(
    "The best hyperparameters for SVM from Grid Search are:",
    grid_search_values.best_params_,
    "",
    "The mean accuracy of a model with these hyperparameters from grid search is:",
    grid_search_values.best_score_,
    sep="\n",
)

### SVM Without tuning parameters

In [None]:
# Baseline for comparison: a linear-kernel SVC with no hyper-parameter search.
# Only the kernel and the seed are set explicitly; everything else is left at the
# library defaults.
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

clf = SVC(random_state=42, kernel='linear')
clf.fit(xtrain,ytrain)


### Accuracy

In [None]:
from sklearn.metrics import accuracy_score

# Test-set accuracy (as a percentage) of each fitted model, printed in a fixed order.
fitted_models = (
    ('Accuracy for randomizedsearch: ', random_search_values),
    ('Accuracy for gridsearch: ', grid_search_values),
    ('Accuracy without hyper parameter tuning: ', clf),
)
for caption, model in fitted_models:
    print(caption, 100 * accuracy_score(ytest, model.predict(xtest)), '%')

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the randomised-search model on the test split.
# `results` is read by the heat-map cell that follows.
random_search_predictions = random_search_values.predict(xtest)
results = confusion_matrix(ytest, random_search_predictions)
print(results)

### Classification report for randomised search

In [None]:
# Per-class classification report for the randomised-search model on the test split.
print(
    "Classification report for randomised search",
    classification_report(ytest, random_search_values.predict(xtest)),
    sep="\n",
)

### Confusion matrix using randomised search

In [None]:
# Heat-map of the most recently computed confusion matrix in `results`.
# scikit-learn's confusion_matrix puts true classes on rows and predicted classes on
# columns, ordered by sorted label value, so the tick labels are the sorted labels of
# ytest. (The earlier `aux_df` lookup used a name that is not defined anywhere in the
# visible notebook and raised NameError on a fresh kernel.)
class_labels = np.unique(ytest)
if len(class_labels) != results.shape[0]:
    # The matrix also counts labels that occur only in the predictions; fall back to
    # positional ticks rather than mislabel rows/columns.
    class_labels = 'auto'

plt.figure(figsize=(5,5))
sns.heatmap(results,annot=True,fmt="d",
            xticklabels=class_labels,
            yticklabels=class_labels,
            cmap="Blues")
plt.xlabel('Predicted')
plt.ylabel('truth')  # was a second plt.xlabel call, which overwrote 'Predicted'

In [None]:
# Confusion matrix of the grid-search model on the test split.
# `results` is read by the heat-map cell that follows.
grid_search_predictions = grid_search_values.predict(xtest)
results = confusion_matrix(ytest, grid_search_predictions)
print(results)

### Classification report for grid search

In [None]:
# Per-class classification report for the grid-search model on the test split.
print(
    "Classification report for grid search",
    classification_report(ytest, grid_search_values.predict(xtest)),
    sep="\n",
)

### Confusion matrix using grid search

In [None]:
# Heat-map of the most recently computed confusion matrix in `results`.
# scikit-learn's confusion_matrix puts true classes on rows and predicted classes on
# columns, ordered by sorted label value, so the tick labels are the sorted labels of
# ytest. (The earlier `aux_df` lookup used a name that is not defined anywhere in the
# visible notebook and raised NameError on a fresh kernel.)
class_labels = np.unique(ytest)
if len(class_labels) != results.shape[0]:
    # The matrix also counts labels that occur only in the predictions; fall back to
    # positional ticks rather than mislabel rows/columns.
    class_labels = 'auto'

plt.figure(figsize=(5,5))
sns.heatmap(results,annot=True,fmt="d",
            xticklabels=class_labels,
            yticklabels=class_labels,
            cmap="Blues")
plt.xlabel('Predicted')
plt.ylabel('truth')  # was a second plt.xlabel call, which overwrote 'Predicted'

In [None]:
# Confusion matrix of the untuned baseline model on the test split.
# `results` is read by the heat-map cell that follows.
baseline_predictions = clf.predict(xtest)
results = confusion_matrix(ytest, baseline_predictions)
print(results)

### Classification report without tuning

In [None]:
# Per-class classification report for the untuned baseline model on the test split.
print(
    "Classification report without tuning",
    classification_report(ytest, clf.predict(xtest)),
    sep="\n",
)

In [None]:
### Confusion matrix without tuning

In [None]:
# Heat-map of the most recently computed confusion matrix in `results`.
# scikit-learn's confusion_matrix puts true classes on rows and predicted classes on
# columns, ordered by sorted label value, so the tick labels are the sorted labels of
# ytest. (The earlier `aux_df` lookup used a name that is not defined anywhere in the
# visible notebook and raised NameError on a fresh kernel.)
class_labels = np.unique(ytest)
if len(class_labels) != results.shape[0]:
    # The matrix also counts labels that occur only in the predictions; fall back to
    # positional ticks rather than mislabel rows/columns.
    class_labels = 'auto'

plt.figure(figsize=(5,5))
sns.heatmap(results,annot=True,fmt="d",
            xticklabels=class_labels,
            yticklabels=class_labels,
            cmap="Blues")
plt.xlabel('Predicted')
plt.ylabel('truth')  # was a second plt.xlabel call, which overwrote 'Predicted'