# Loading Data

In [1]:
import pandas as pd
import numpy as np

data = pd.read_csv('dataset.csv')

X_data = data.iloc[:,:-1]
Y_data = data.iloc[:,-1:]

#print(X_data)
#print(Y_data)


# Section 1: Data Preprocessing
Our selection for the preprocessing of the "Malicious and Benign Websites" dataset is OneHotEncoding. The dataset contains mainy columns that contain categorical data, such as "CHARSET", which contains data like "ascii" or "UTF-8". OrdinalEncoding is another option for categorical data; however, it is a better fit when there is some sort of ordering to the data -- such as the categories being "small", "medium", "large" -- and is not as good of a fit for data with no ordering.

In [2]:
from sklearn.preprocessing import OneHotEncoder

enc = OneHotEncoder(handle_unknown='ignore')

enc.fit(X_data)

X_encoded = enc.transform(X_data)

print(X_encoded)


  (0, 1574)	1.0
  (0, 1781)	1.0
  (0, 1925)	1.0
  (0, 1958)	1.0
  (0, 2163)	1.0
  (0, 2263)	1.0
  (0, 2870)	1.0
  (0, 2988)	1.0
  (0, 3131)	1.0
  (0, 4556)	1.0
  (0, 4564)	1.0
  (0, 4660)	1.0
  (0, 4728)	1.0
  (0, 4808)	1.0
  (0, 5578)	1.0
  (0, 5692)	1.0
  (0, 5954)	1.0
  (0, 6759)	1.0
  (0, 7514)	1.0
  (0, 7619)	1.0
  (1, 796)	1.0
  (1, 1781)	1.0
  (1, 1924)	1.0
  (1, 1957)	1.0
  (1, 2024)	1.0
  :	:
  (1779, 5682)	1.0
  (1779, 5798)	1.0
  (1779, 6683)	1.0
  (1779, 7505)	1.0
  (1779, 7618)	1.0
  (1780, 1246)	1.0
  (1780, 1922)	1.0
  (1780, 1952)	1.0
  (1780, 1960)	1.0
  (1780, 2084)	1.0
  (1780, 2731)	1.0
  (1780, 2883)	1.0
  (1780, 3051)	1.0
  (1780, 3232)	1.0
  (1780, 4220)	1.0
  (1780, 4576)	1.0
  (1780, 4666)	1.0
  (1780, 4737)	1.0
  (1780, 5146)	1.0
  (1780, 5594)	1.0
  (1780, 5710)	1.0
  (1780, 6110)	1.0
  (1780, 7109)	1.0
  (1780, 7530)	1.0
  (1780, 7621)	1.0


# Section 2: Data Splits

In [9]:
from sklearn.model_selection import train_test_split

x_trn, x_part, y_trn, y_part = train_test_split(X_encoded, Y_data,train_size=0.8,random_state=71)

print(x_trn,y_trn)

x_test, x_val, y_test, y_val = train_test_split(x_part,y_part,train_size=0.5, random_state=71)

#print(x_test,y_test)
#print(x_val,y_val)


  (0, 1036)	1.0
  (0, 1802)	1.0
  (0, 1926)	1.0
  (0, 1957)	1.0
  (0, 2163)	1.0
  (0, 2840)	1.0
  (0, 2883)	1.0
  (0, 2903)	1.0
  (0, 3335)	1.0
  (0, 4167)	1.0
  (0, 4601)	1.0
  (0, 4660)	1.0
  (0, 4730)	1.0
  (0, 5340)	1.0
  (0, 5617)	1.0
  (0, 5730)	1.0
  (0, 6649)	1.0
  (0, 7269)	1.0
  (0, 7553)	1.0
  (0, 7620)	1.0
  (1, 90)	1.0
  (1, 1813)	1.0
  (1, 1929)	1.0
  (1, 1957)	1.0
  (1, 2163)	1.0
  :	:
  (1422, 5714)	1.0
  (1422, 6341)	1.0
  (1422, 7250)	1.0
  (1422, 7539)	1.0
  (1422, 7622)	1.0
  (1423, 108)	1.0
  (1423, 1823)	1.0
  (1423, 1926)	1.0
  (1423, 1957)	1.0
  (1423, 1970)	1.0
  (1423, 2213)	1.0
  (1423, 2883)	1.0
  (1423, 2948)	1.0
  (1423, 3171)	1.0
  (1423, 4274)	1.0
  (1423, 4557)	1.0
  (1423, 4660)	1.0
  (1423, 4726)	1.0
  (1423, 4744)	1.0
  (1423, 5569)	1.0
  (1423, 5682)	1.0
  (1423, 5798)	1.0
  (1423, 6683)	1.0
  (1423, 7505)	1.0
  (1423, 7618)	1.0       Type
365      0
828      0
1547     0
1237     0
1137     0
...    ...
1320     0
1576     0
1079     0
1035     1
1

# Section 3: Build Classifiers
The classifiers we have chosen to use for this project are SVM and Feed-forward Neural Network.

In [11]:
from sklearn.svm import SVC


SVM_clf = SVC()

SVM_clf.fit(x_trn,y_trn)

svm_acc = SVM_clf.score(x_val,y_val)

print("SVM accurary with defualt parameters:",svm_acc)

SVM accurary with defualt parameters: 0.9497206703910615


  y = column_or_1d(y, warn=True)


In [13]:
from sklearn.neural_network import MLPClassifier

Neural_clf = MLPClassifier()

Neural_clf.fit(x_trn,y_trn)

neural_acc = Neural_clf.score(x_val,y_val)

print("Neural Network accuracy with default paramters:",neural_acc)

Neural Network accuracy with default paramters: 0.9497206703910615


# Section 4: Hyperparameter Tuning
For the SVM, the hyperparameters that we will be tuning are the C value, the kernel, and the Gamma value.
For the Neural Network, the hyperameters that will be tuning are the alpha value, the learning rate, and the hidden_layer_sizes.

For both classifiers, we will be using a grid search to tune the hyperparameters.

In [12]:
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore', message='A column-vector y was passed when a 1d array was expected.*')
param_grid = {'C': [0.1, 1, 10, 100],
              'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto'] + list(np.logspace(-3, 3, 7))}

grid_search = GridSearchCV(SVC(),param_grid,cv=5)
grid_search.fit(x_trn,y_trn)

print(grid_search.best_params_)


{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}


In [15]:

params = {'alpha': [0.01, 0.1, 1],
          'learning_rate': ['constant', 'adaptive'],
          'hidden_layer_sizes': [(10,), (50,), (100,), (10,10), (50,50), (100,100)]}

grid_search = GridSearchCV(MLPClassifier(),params,cv=5)
grid_search.fit(x_trn,y_trn)

print(grid_search.best_params_)



{'alpha': 1, 'hidden_layer_sizes': (100, 100), 'learning_rate': 'adaptive'}
