In [7]:
# Load Libraries - Make sure to run this cell!
import pandas as pd
import numpy as np
import re, os
from string import printable
from sklearn import model_selection

#import gensim
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, model_from_json, load_model
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Dropout, Activation, Lambda, Flatten
from tensorflow.keras.layers import Input, ELU, LSTM, Embedding, Conv2D, MaxPooling2D, \
BatchNormalization, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
# from keras.utils import np_utils
from tensorflow.keras import backend as K
import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from pathlib import Path
import json
from itertools import groupby
import warnings
warnings.filterwarnings("ignore")

## Preprocess raw URLs

In [2]:
## Load data URL
# DATA_HOME = 'data/'
# df = pd.read_csv(DATA_HOME + 'url_data_mega_deep_learning.csv')
DATA_HOME = '../URLphishing/data/'
df = pd.read_csv(DATA_HOME + 'URLdatasetX2_1.csv')
labels = df.iloc[:,-1].values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
df = pd.DataFrame({'url': df['url'], 'isMalicious': labels})
# df = df.sample(n=1000)
df.sample(n=25).head(2) 

Unnamed: 0,url,isMalicious
2499,https://www.rams.com.au/,0
1576,https://www.mcmakler.de/,0


In [3]:
# Initial Data Preparation URL

# Step 1: Convert raw URL string in list of lists where characters that are contained in "printable" are stored encoded as integer 
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable] for url in df.url]

# Step 2: Cut URL string at max_len or pad with zeros if shorter
max_len=75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)
 
# Step 3: Extract labels form df to numpy array
target = np.array(df.isMalicious)

print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)

Matrix dimensions of X:  (2802, 75) Vector dimension of target:  (2802,)


In [4]:
# X

In [5]:
# Simple Cross-Validation: Split the data set into training and test data
# X_train, X_test, target_train, target_test = model_selection.train_test_split(X, target, test_size=0.25, random_state=33)
n_samples = len(df.index)
np.random.seed(0)
train_idx = list(np.random.choice(list(range(n_samples)), int(0.8*n_samples), replace=False))
test_idx = list(set(list(range(n_samples))).difference(set(train_idx)))
X_train = X[train_idx]; X_test = X[test_idx]; target_train = target[train_idx]; target_test = target[test_idx]

In [8]:
print("Train freq: ", [len(list(group)) for key, group in groupby(sorted(target_train))])

Train freq:  [1622, 619]


## Architecture 1 - Simple LSTM

In [9]:
## Deep Learning model Definition --- A --- (Simple LSTM)


def simple_lstm(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len, embeddings_regularizer=W_reg)(main_input) 
    emb = Dropout(0.2)(emb)
    # LSTM layer
    lstm = LSTM(lstm_output_size)(emb)
    lstm = Dropout(0.5)(lstm)
    
    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer
    model = Model(inputs=main_input, outputs=output)
    # adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
    
    return model

In [10]:
# Fit model and Cross-Validation, ARCHITECTURE 1 SIMPLE LSTM
epochs = 3
batch_size = 32

model = simple_lstm()
model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
# print_layers_dims(model)

Epoch 1/3
Epoch 2/3
Epoch 3/3

Final Cross-Validation Accuracy 0.8003565073013306 



In [11]:
y_pred = tf.argmax(model.predict(X_test), axis=-1)
print("F1 score: ", f1_score(y_pred, target_test))

F1 score:  0.0


In [12]:
model_name = "deeplearning_LSTM"
# save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
# model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
model.save(DATA_HOME + model_name + ".keras")
model = keras.models.load_model(DATA_HOME + model_name + ".keras")

## Architecture 2 - 1D Convolution and LSTM

In [15]:
## Deep Learning model Definition --- B --- (1D Convolution and LSTM)

def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                embeddings_regularizer=W_reg)(main_input) 
    emb = Dropout(0.25)(emb)

    # Conv layer
    conv = Conv1D(kernel_size=5, filters=256, \
                     padding ='same')(emb)
    conv = ELU()(conv)

    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    # LSTM layer
    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)
    
    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer
    model = Model(inputs=main_input, outputs=output)
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [16]:
# Fit model and Cross-Validation, ARCHITECTURE 2 CONV + LSTM
epochs = 5
batch_size = 32

model = lstm_conv()
model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
# print_layers_dims(model)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Final Cross-Validation Accuracy 0.7807486653327942 



In [17]:
y_pred = tf.argmax(model.predict(X_test), axis=-1)
print("F1 score: ", f1_score(y_pred, target_test))

F1 score:  0.0


In [18]:
model_name = "deeplearning_1DConvLSTM"
# save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
# model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
# #print_layers_dims(model)
model.save(DATA_HOME + model_name + ".keras")
model = keras.models.load_model(DATA_HOME + model_name + ".keras")

## Architecture 3 - 1D Convolutions and Fully Connected Layers

In [19]:
## Deep Learning model Definition --- C --- (1D Convolutions and Fully Connected Layers)

def conv_fully(max_len=75, emb_dim=32, max_vocab_len=100, W_reg=regularizers.l2(1e-4)):
    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                embeddings_regularizer=W_reg)(main_input) 
    emb = Dropout(0.25)(emb)

    
    def sum_1d(X):
        return K.sum(X, axis=1)
    
    def get_conv_layer(emb, kernel_size=5, filters=256):
        # Conv layer
        conv = Conv1D(kernel_size=kernel_size, filters=filters, \
                     padding='same')(emb)
        conv = ELU()(conv)

        conv = Lambda(sum_1d, output_shape=(filters,))(conv)
        #conv = BatchNormalization(mode=0)(conv)
        conv = Dropout(0.5)(conv)
        return conv
        
    # Multiple Conv Layers
    
    # calling custom conv function from above
    conv1 = get_conv_layer(emb, kernel_size=2, filters=256)
    conv2 = get_conv_layer(emb, kernel_size=3, filters=256)
    conv3 = get_conv_layer(emb, kernel_size=4, filters=256)
    conv4 = get_conv_layer(emb, kernel_size=5, filters=256)

    # Fully Connected Layers
    merged = concatenate([conv1,conv2,conv3,conv4], axis=1)

    hidden1 = Dense(1024)(merged)
    hidden1 = ELU()(hidden1)
    # hidden1 = BatchNormalization()(hidden1)
    hidden1 = Dropout(0.5)(hidden1)

    hidden2 = Dense(1024)(hidden1)
    hidden2 = ELU()(hidden2)
    # hidden2 = BatchNormalization()(hidden2)
    hidden2 = Dropout(0.5)(hidden2)
    
    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(hidden2)

    # Compile model and define optimizer
    model = Model(inputs=main_input, outputs=output)
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [20]:
# Fit model and Cross-Validation, ARCHITECTURE 3 CONV + FULLY CONNECTED
epochs = 1
batch_size = 64

model = conv_fully()
model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
# print_layers_dims(model)


Final Cross-Validation Accuracy 0.7843137383460999 



In [21]:
y_pred = tf.argmax(model.predict(X_test), axis=-1)
print("F1 score: ", f1_score(y_pred, target_test))

F1 score:  0.0


In [24]:
model_name = "deeplearning_1DConv"
# save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
# model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
#print_layers_dims(model)
# model.save(DATA_HOME + model_name + ".keras")
# model = keras.models.load_model(DATA_HOME + model_name + ".keras")

## Making a new prediction

In [26]:
test_url_mal = "naureen.net/etisalat.ae/index2.php"
test_url_benign = "sixt.com/php/reservation?language=en_US"

url = test_url_mal

In [27]:
# Step 1: Convert raw URL string in list of lists where characters that are contained in "printable" are stored encoded as integer 
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable]]

# Step 2: Cut URL string at max_len or pad with zeros if shorter
max_len=75
XY = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

In [28]:
target_proba = model.predict(XY, batch_size=1)
def print_result(proba):
    if proba > 0.5:
        return "malicious"
    else:
        return "benign"
print("Test URL:", url, "is", print_result(target_proba[0]))

Test URL: naureen.net/etisalat.ae/index2.php is benign
