# Import Libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Import the Dataset

In [2]:
# Imports dataset
traindataset = pd.read_parquet('training.parquet', engine='pyarrow')
testdataset = pd.read_parquet('testing.parquet', engine='pyarrow')
#dataset = pd.concat([traindataset, testdataset])
#Imports X and Y being the inputs and outputs of the data
# : is all of the rows, : is all of the columns
# by putting the -1 it removes the last column
Xtrain = traindataset.iloc[:, 1:-1].values
Ytrain = traindataset.iloc[:, -1:].values
Xtest = testdataset.iloc[:, 1:-1].values
Ytest = testdataset.iloc[:, -1:].values
#X = dataset.iloc[:, :-1].values
#Y = dataset.iloc[:, -1:].values

In [26]:
Ytrain

array([0, 1, 1, ..., 1, 0, 0])

In [4]:
Xtrain

array([[ 82.,  23.,   0., ...,   0.,   1.,   4.],
       [ 93.,  14.,   1., ...,   0.,   1.,   2.],
       [121.,  21.,   1., ...,   0.,   1.,   3.],
       ...,
       [ 67.,  15.,   0., ...,   0.,   1.,   0.],
       [ 47.,  20.,   0., ...,   0.,   0.,   6.],
       [ 24.,  15.,   0., ...,   0.,   0.,   3.]])

# Data Preprocessing

## Take care of Missing Data

In [5]:
#runs through Xtrain
#imports the imputer
from sklearn.impute import SimpleImputer
#tells the imputer that you're changing missing_values (which are represented in numpy as nan) to the mean of the column
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
#you're changing values in columns 1-3
imputer.fit(Xtrain[:,1:88])
#replaces X with the changed values
Xtrain[:, 1:88] = imputer.transform (Xtrain[:, 1:88])
print(Xtrain)

[[ 82.  23.   0. ...   0.   1.   4.]
 [ 93.  14.   1. ...   0.   1.   2.]
 [121.  21.   1. ...   0.   1.   3.]
 ...
 [ 67.  15.   0. ...   0.   1.   0.]
 [ 47.  20.   0. ...   0.   0.   6.]
 [ 24.  15.   0. ...   0.   0.   3.]]


In [6]:
#runs through Xtest
#imports the imputer
from sklearn.impute import SimpleImputer
#tells the imputer that you're changing missing_values (which are represented in numpy as nan) to the mean of the column
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean')
#you're changing values in columns 1-3
imputer.fit(Xtest[:,1:88])
#replaces X with the changed values
Xtest[:, 1:88] = imputer.transform (Xtest[:, 1:88])
print(Xtest)

[[ 36.  19.   0. ...   0.   1.   0.]
 [ 51.  24.   0. ...   0.   1.   6.]
 [ 46.  16.   0. ...   0.   0.   7.]
 ...
 [ 25.  16.   0. ...   0.   0.   3.]
 [550.  25.   1. ...   0.   1.   4.]
 [ 77.  23.   1. ...   0.   1.   2.]]


In [7]:
## Encode Categorical Data

In [8]:
Ytest

array([['phishing'],
       ['legitimate'],
       ['legitimate'],
       ...,
       ['legitimate'],
       ['legitimate'],
       ['phishing']], dtype=object)

In [9]:
Ytrain

array([['legitimate'],
       ['phishing'],
       ['phishing'],
       ...,
       ['phishing'],
       ['legitimate'],
       ['legitimate']], dtype=object)

In [10]:
#Encoding Ytest
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Ytest = le.fit_transform(Ytest)

print(Ytest)

[1 0 0 ... 0 0 1]


  y = column_or_1d(y, warn=True)


In [11]:
#Encoding Ytrain
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Ytrain = le.fit_transform(Ytrain)

print(Ytrain)

[0 1 1 ... 1 0 0]


  y = column_or_1d(y, warn=True)


## Feature Scaling

In [12]:
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)

#from sklearn.compose import ColumnTransformer
#ct = ColumnTransformer(transformers = [('skip', 'passthrough', [0,1,2]),
#                                       ('encoder', StandardScaler(), [3,4]),], remainder = 'passthrough')

Xtrain = sc.fit_transform(Xtrain)
Xtest = sc.transform(Xtest)

# Training Models

## Training KNN

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(Xtrain, Ytrain)

## Training Logistic Regression

In [14]:
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(Xtrain,Ytrain)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [15]:
## Decision Tree

In [16]:
from sklearn.tree import DecisionTreeClassifier
clf_gini = DecisionTreeClassifier(criterion="gini",random_state=42, max_depth=3, min_samples_leaf=5)
clf_gini.fit(Xtrain, Ytrain)

In [17]:
## Random Forest

In [18]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators = 100)
rf.fit(Xtrain, Ytrain)

In [19]:
## Naive Bayes

In [20]:
from sklearn.naive_bayes import GaussianNB
NB = GaussianNB()
NB.fit(Xtrain, Ytrain)

In [21]:
## Support Vector Machine

In [22]:
from sklearn import svm 
clf = svm.SVC(kernel='linear') 
clf.fit(Xtrain, Ytrain)

# Predicting New Data

In [23]:
#KNN
knnYpred = knn.predict(Xtest)
#Logistic Regression
LogRegYpred = classifier.predict(Xtest)
#Decision Tree
DecTreeYpred = clf_gini.predict(Xtest)
#Random Forest
RForestYpred = rf.predict(Xtest)
#Naive Bayes
NBYpred = NB.predict(Xtest)
#Support Vector Machine
SVMYpred = clf.predict(Xtest)

Ypreds = [knnYpred, LogRegYpred, DecTreeYpred, RForestYpred, NBYpred, SVMYpred]
Names = ["KNN", "Logistic Regression", "Decision Tree", "Random Forest", "Naive Bayes", "Support Vector Machine"]

# Visualizing

# checking accuracy

In [24]:
from sklearn import metrics

knnaccuracy = metrics.accuracy_score(Ytest, knnYpred)
print("KNN Accuracy:", knnaccuracy)
LogRegaccuracy = metrics.accuracy_score(Ytest, LogRegYpred)
print("Logistic Regression Accuracy:", LogRegaccuracy)
DecTreeaccuracy = metrics.accuracy_score(Ytest, DecTreeYpred)
print("Decision Tree Accuracy:", DecTreeaccuracy)
RForestaccuracy = metrics.accuracy_score(Ytest, RForestYpred)
print("Random Forest Accuracy:", RForestaccuracy)
NBaccuracy = metrics.accuracy_score(Ytest, NBYpred)
print("Naive Bayes Accuracy:", NBaccuracy)
SVMaccuracy = metrics.accuracy_score(Ytest, SVMYpred)
print("Support Vector Machine Accuracy:", SVMaccuracy)

KNN Accuracy: 0.9382290562036055
Logistic Regression Accuracy: 0.9422057264050901
Decision Tree Accuracy: 0.9088016967126193
Random Forest Accuracy: 0.9634146341463414
Naive Bayes Accuracy: 0.6853128313891834
Support Vector Machine Accuracy: 0.9435312831389183


## Confusion Matrices

In [25]:
from sklearn.metrics import (
    confusion_matrix,
    ConfusionMatrixDisplay
)


for Ypred, name in zip(Ypreds, Names):
    cm = confusion_matrix(Ytest, Ypred)
    print(name,":  \n", cm)

KNN :  
 [[1805   81]
 [ 152 1734]]
Logistic Regression :  
 [[1781  105]
 [ 113 1773]]
Decision Tree :  
 [[1737  149]
 [ 195 1691]]
Random Forest :  
 [[1812   74]
 [  64 1822]]
Naive Bayes :  
 [[1839   47]
 [1140  746]]
Support Vector Machine :  
 [[1786  100]
 [ 113 1773]]
