In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense


In [2]:
# Reading in dataset from CSV file. This dataset is an updated version of the original Kaggle dataset including 
urldata = pd.read_csv("./Url_Processed.csv")

In [3]:
# Clean up dataset and remove unecessary columns
urldata.drop("Unnamed: 0",axis=1,inplace=True)
urldata.drop(["url","label"],axis=1,inplace=True)

In [4]:
# Configure dependent variables (values used to inform prediction)
x = urldata[['hostname_length',
       'path_length', 'fd_length', 'count-', 'count@', 'count?',
       'count%', 'count.', 'count=', 'count-http','count-https', 'count-www', 'count-digits',
       'count-letters', 'count_dir', 'use_of_ip']]

In [5]:
# Configure independent variable (value to verify prediction)
y = urldata['result']

In [6]:
x_sample, y_sample = SMOTE().fit_resample(x, y.values.ravel())

In [7]:
x_sample = pd.DataFrame(x_sample)
y_sample = pd.DataFrame(y_sample)

In [8]:
# Seperate data into training and testing sets using the 80:20 ratio
x_train, x_test, y_train, y_test = train_test_split(x_sample, y_sample, test_size = 0.2)

In [9]:
model = Sequential()

In [10]:
# first layer of the model. It is a dense layer (fully connected layer) with 32 neurons. It utilizes ReLU (Rectified Linear Activation) which introduces non-linearity and takes in 16 input features
model.add(Dense(32, activation = 'relu', input_shape = (16, )))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [11]:
# the final layer is an output layer with one neuron which is utilized for binary classification with sigmoid classification that outputs a probability score between 0 and 1 (0 = no probability and 1 = full chance).
model.add(Dense(1, activation='sigmoid'))

In [12]:

model.summary()

In [14]:
# Define an Optimizer
# the following line defines an Adam Optimization algorithm with a learning rate of 0.0001 which defines the step size during optimization of a model's parameters such as weights and biases
opt = keras.optimizers.Adam(learning_rate=0.0001)

In [15]:
model.compile(optimizer= opt ,loss='binary_crossentropy',metrics=['acc'])

In [16]:
class ModelCallback(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs={}):
        # checks if the validation loss is less than 0.1
        if(logs.get('val_loss')<0.1):
            print("\nReached 0.1 val_loss! Halting training!")
            self.model.stop_training = True

In [17]:
callback = ModelCallback()


In [18]:
history = model.fit(x_train, y_train, epochs=10,batch_size=256, callbacks=[callback],validation_data=(x_test,y_test),verbose=1)

Epoch 1/10
[1m2161/2161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 2ms/step - acc: 0.7378 - loss: 0.6011 - val_acc: 0.9636 - val_loss: 0.1618
Epoch 2/10
[1m2137/2161[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 2ms/step - acc: 0.9768 - loss: 0.1090
Reached 0.1 val_loss! Halting training!
[1m2161/2161[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 2ms/step - acc: 0.9768 - loss: 0.1087 - val_acc: 0.9927 - val_loss: 0.0407


In [19]:
# list all data in history
print(history.history.keys())

dict_keys(['acc', 'loss', 'val_acc', 'val_loss'])


In [20]:
# TEST SUITE
pred_test = model.predict(x_test)
for i in range (len(pred_test)):
    if (pred_test[i] < 0.5):
        pred_test[i] = 0
    else:
        pred_test[i] = 1
pred_test = pred_test.astype(int)

[1m4322/4322[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 1ms/step


In [21]:
def view_result(array):
    array = np.array(array)
    for i in range(len(array)):
        if array[i] == 0:
            print("Safe")
        else:
            print("Malicious")


In [23]:
print("PREDICTED RESULTS: ")
view_result(pred_test[:10])
print("\n")
print("ACTUAL RESULTS: ")
view_result(y_test[:20])

PREDICTED RESULTS: 
Malicious
Safe
Malicious
Safe
Malicious
Safe
Safe
Malicious
Malicious
Malicious


ACTUAL RESULTS: 
Malicious
Safe
Malicious
Safe
Malicious
Safe
Safe
Malicious
Malicious
Malicious
Malicious
Malicious
Malicious
Malicious
Safe
Malicious
Safe
Safe
Safe
Malicious


In [24]:
# SAVE MODEL
model.save("Malicious_URL_Prediction.h5")

