In [7]:
# Load Libraries - Make sure to run this cell!
import pandas as pd
import numpy as np
import re, os
from string import printable
from sklearn import model_selection

#import gensim
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, model_from_json, load_model
from tensorflow.keras import regularizers
from tensorflow.keras.layers import Dense, Dropout, Activation, Lambda, Flatten
from tensorflow.keras.layers import Input, ELU, LSTM, Embedding, Conv2D, MaxPooling2D, \
BatchNormalization, Conv1D, MaxPooling1D, concatenate
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.optimizers import SGD, Adam, RMSprop
# from keras.utils import np_utils
from tensorflow.keras import backend as K
import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
from pathlib import Path
import json
from itertools import groupby
import warnings
warnings.filterwarnings("ignore")

## Preprocess raw URLs

In [29]:
## Load data URL
# DATA_HOME = 'data/'
# df = pd.read_csv(DATA_HOME + 'url_data_mega_deep_learning.csv')
#DATA_HOME = '../URLphishing/data/'
df = pd.read_csv('URLdatasetX2_1sub5.csv')
labels = df.iloc[:,-1].values
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)
df = pd.DataFrame({'url': df['url'], 'isMalicious': labels})
# df = df.sample(n=1000)
#df.sample(n=25).head(2)
print(len(df))

2135


In [None]:
df.head(5)

Unnamed: 0,url,isMalicious
0,http://www.crestonwood.com/router.php,0
1,http://vamoaestudiarmedicina.blogspot.com/,0
2,https://www.astrologyonline.eu/Astro_MemoNew/P...,0
3,https://www.lifewire.com/tcp-port-21-818146,0
4,https://technofizi.net/top-best-mp3-downloader...,0


In [None]:
print(f"Số lượng mẫu ban đầu: {len(df)}")
print(f"Số lượng nhãn 0: {sum(df['isMalicious'] == 0)}")
print(f"Số lượng nhãn 1: {sum(df['isMalicious'] == 1)}")
print(f"Số lượng nhãn 2: {sum(df['isMalicious'] == 2)}")

Số lượng mẫu ban đầu: 9586
Số lượng nhãn 0: 7346
Số lượng nhãn 1: 1397
Số lượng nhãn 2: 426


In [30]:
# Initial Data Preparation URL

# Step 1: Convert raw URL string in list of lists where characters that are contained in "printable" are stored encoded as integer
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable] for url in df.url]

# Step 2: Cut URL string at max_len or pad with zeros if shorter
max_len=75
X = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

# Step 3: Extract labels form df to numpy array
target = np.array(df.isMalicious)

print('Matrix dimensions of X: ', X.shape, 'Vector dimension of target: ', target.shape)

Matrix dimensions of X:  (2135, 75) Vector dimension of target:  (2135,)


In [22]:
X

array([[ 0,  0,  0, ..., 26, 18, 26],
       [ 0,  0,  0, ..., 25, 23, 77],
       [ 0,  0,  0, ..., 11, 29, 26],
       ...,
       [ 0,  0,  0, ..., 30, 23, 22],
       [77, 18, 25, ..., 24, 30, 77],
       [11, 13, 30, ..., 24, 11, 22]], dtype=int32)

In [None]:
# Simple Cross-Validation: Split the data set into training and test data
# X_train, X_test, target_train, target_test = model_selection.train_test_split(X, target, test_size=0.25, random_state=33)
n_samples = len(df.index)
np.random.seed(0)
train_idx = list(np.random.choice(list(range(n_samples)), int(0.8*n_samples), replace=False))
test_idx = list(set(list(range(n_samples))).difference(set(train_idx)))
X_train = X[train_idx]; X_test = X[test_idx]; target_train = target[train_idx]; target_test = target[test_idx]

In [None]:
print("Train freq: ", [len(list(group)) for key, group in groupby(sorted(target_train))])

Train freq:  [325, 123]


## Architecture 1 - Simple LSTM

In [31]:
## Deep Learning model Definition --- A --- (Simple LSTM)


def simple_lstm(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len, embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.2)(emb)
    # LSTM layer
    lstm = LSTM(lstm_output_size)(emb)
    lstm = Dropout(0.5)(lstm)

    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer
    model = Model(inputs=main_input, outputs=output)
    # adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])

    return model

In [None]:
# Fit model and Cross-Validation, ARCHITECTURE 1 SIMPLE LSTM
epochs = 10
batch_size = 32

model = simple_lstm()
model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
# print_layers_dims(model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Final Cross-Validation Accuracy 0.7914110422134399 



In [32]:

from sklearn.model_selection import KFold
base_dir = 'results'
n_loops = 1
n_folds = 5
n_samples = X.shape[0]
for i in range(n_loops):
  cv = KFold(n_splits=n_folds, shuffle=True, random_state = i)
  for fold, (train_idx, test_idx) in enumerate(cv.split(X,target)):
    path_dir = base_dir +'/' + "URLdatasetX2_1sub5" + '_run_'+str(i)+'_'+ 'fold_'+str(fold)+'_'
    print('Run: ', i, ', fold: ', fold)
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = target[train_idx]
    y_test = target[test_idx]
    model = simple_lstm()
    model.fit(X_train, y_train)
    y_pred_score = model.predict(X_test).flatten()

    # Chia giá trị từ min đến max thành 20 đoạn
    thresholds = np.linspace(y_pred_score.min(), y_pred_score.max(), 20)

    best_threshold = 0
    best_f1 = 0

    # Duyệt qua các threshold để tìm threshold có f1_score tốt nhất
    for threshold in thresholds:
        y_predict = (y_pred_score > threshold).astype(int)
        score = f1_score(y_test, y_predict, pos_label=1, zero_division=0.0)
        if score > best_f1:
            best_f1 = score
            best_threshold = threshold

    print("Best threshold: ", best_threshold)
    print("Best F1 score: ", best_f1)
    y_predict = (y_pred_score > best_threshold).astype(int)
    print(y_predict)

    df = pd.DataFrame({'Actual': y_test, 'Predicted': y_predict, 'y_score': y_pred_score})
    df.to_csv(path_dir + "LSTM_labels.csv", index=False)



Run:  0 , fold:  0
Best threshold:  0.0503397005561151
Best F1 score:  0.16666666666666666
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0]
Run:  0 , fold:  1
Best threshold:  0.0284

In [None]:
def save_predictions_to_csv(true_labels, predicted_labels, filename):
    # Tạo dataframe từ danh sách các nhãn
    df = pd.DataFrame({
        'True Label': true_labels,
        'Predicted Label': predicted_labels
    })

    # Lưu dataframe vào file CSV
    df.to_csv(filename, index=False)



In [None]:
y_pred = tf.argmax(model.predict(X_test), axis=-1)
print("F1 score: ", f1_score(y_pred, target_test, average='weighted'))
# Kiểm tra định dạng của target_test
if target_test.ndim > 1 and target_test.shape[1] > 1:
    target_test = tf.argmax(target_test, axis=-1)
    print("true")

save_predictions_to_csv(target_test, y_pred, 'PredictionsSimpleLSTM.csv')

F1 score:  0.8409090909090908


In [None]:
print(len(target_test))
print(len(target_train))
print(target_test[1])

561
2241
0


In [None]:
model_name = "deeplearning_LSTM"
# save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
# model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
model.save(DATA_HOME + model_name + ".keras")
model = keras.models.load_model(DATA_HOME + model_name + ".keras")

NameError: name 'DATA_HOME' is not defined

## Architecture 2 - 1D Convolution and LSTM

In [33]:
## Deep Learning model Definition --- B --- (1D Convolution and LSTM)

def lstm_conv(max_len=75, emb_dim=32, max_vocab_len=100, lstm_output_size=32, W_reg=regularizers.l2(1e-4)):
    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)

    # Conv layer
    conv = Conv1D(kernel_size=5, filters=256, \
                     padding ='same')(emb)
    conv = ELU()(conv)

    conv = MaxPooling1D(pool_size=4)(conv)
    conv = Dropout(0.5)(conv)

    # LSTM layer
    lstm = LSTM(lstm_output_size)(conv)
    lstm = Dropout(0.5)(lstm)

    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(lstm)

    # Compile model and define optimizer
    model = Model(inputs=main_input, outputs=output)
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Fit model and Cross-Validation, ARCHITECTURE 2 CONV + LSTM
epochs = 5
batch_size = 32

model2 = lstm_conv()
model2.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
# print_layers_dims(model)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5

Final Cross-Validation Accuracy 0.9196428656578064 



In [34]:
from sklearn.model_selection import KFold
base_dir = 'results'
n_loops = 1
n_folds = 5
n_samples = X.shape[0]
for i in range(n_loops):
  cv = KFold(n_splits=n_folds, shuffle=True, random_state = i)
  for fold, (train_idx, test_idx) in enumerate(cv.split(X,target)):
    path_dir = base_dir +'/' + "URLdatasetX2_1sub5" + '_run_'+str(i)+'_'+ 'fold_'+str(fold)+'_'
    print('Run: ', i, ', fold: ', fold)
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = target[train_idx]
    y_test = target[test_idx]
    model2 = lstm_conv()
    model2.fit(X_train, y_train)
    y_pred_score = model.predict(X_test).flatten()


    # Chia giá trị từ min đến max thành 20 đoạn
    thresholds = np.linspace(y_pred_score.min(), y_pred_score.max(), 20)

    best_threshold = 0
    best_f1 = 0

    # Duyệt qua các threshold để tìm threshold có f1_score tốt nhất
    for threshold in thresholds:
        y_predict = (y_pred_score > threshold).astype(int)
        score = f1_score(y_test, y_predict, pos_label=1, zero_division=0.0)
        if score > best_f1:
            best_f1 = score
            best_threshold = threshold

    print("Best threshold: ", best_threshold)
    print("Best F1 score: ", best_f1)
    y_predict = (y_pred_score > best_threshold).astype(int)
    print(y_predict)


    df = pd.DataFrame({'Actual': y_test, 'Predicted': y_predict, 'y_score': y_pred_score})
    df.to_csv(path_dir + "LSTMConv_labels.csv", index=False)

Run:  0 , fold:  0
Best threshold:  0.0353375628198448
Best F1 score:  0.20512820512820512
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0]
Run:  0 , fold:  1
Best threshold:  0.0351

In [None]:
y_pred = tf.argmax(model.predict(X_test), axis=-1)
#y_pred = model.predict(X_test)
print("F1 score: ", f1_score(y_pred, target_test))
save_predictions_to_csv(target_test, y_pred, 'PredictionsSimpleLSTM2.csv')



ValueError: Per-column arrays must each be 1-dimensional

In [None]:
model_name = "deeplearning_1DConvLSTM"
# save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
# model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
# #print_layers_dims(model)
model.save(DATA_HOME + model_name + ".keras")
model = keras.models.load_model(DATA_HOME + model_name + ".keras")

NameError: name 'DATA_HOME' is not defined

## Architecture 3 - 1D Convolutions and Fully Connected Layers

In [35]:
## Deep Learning model Definition --- C --- (1D Convolutions and Fully Connected Layers)

def conv_fully(max_len=75, emb_dim=32, max_vocab_len=100, W_reg=regularizers.l2(1e-4)):
    # Input
    main_input = Input(shape=(max_len,), dtype='int32', name='main_input')
    # Embedding layer
    emb = Embedding(input_dim=max_vocab_len, output_dim=emb_dim, input_length=max_len,
                embeddings_regularizer=W_reg)(main_input)
    emb = Dropout(0.25)(emb)


    def sum_1d(X):
        return K.sum(X, axis=1)

    def get_conv_layer(emb, kernel_size=5, filters=256):
        # Conv layer
        conv = Conv1D(kernel_size=kernel_size, filters=filters, \
                     padding='same')(emb)
        conv = ELU()(conv)

        conv = Lambda(sum_1d, output_shape=(filters,))(conv)
        #conv = BatchNormalization(mode=0)(conv)
        conv = Dropout(0.5)(conv)
        return conv

    # Multiple Conv Layers

    # calling custom conv function from above
    conv1 = get_conv_layer(emb, kernel_size=2, filters=256)
    conv2 = get_conv_layer(emb, kernel_size=3, filters=256)
    conv3 = get_conv_layer(emb, kernel_size=4, filters=256)
    conv4 = get_conv_layer(emb, kernel_size=5, filters=256)

    # Fully Connected Layers
    merged = concatenate([conv1,conv2,conv3,conv4], axis=1)

    hidden1 = Dense(1024)(merged)
    hidden1 = ELU()(hidden1)
    # hidden1 = BatchNormalization()(hidden1)
    hidden1 = Dropout(0.5)(hidden1)

    hidden2 = Dense(1024)(hidden1)
    hidden2 = ELU()(hidden2)
    # hidden2 = BatchNormalization()(hidden2)
    hidden2 = Dropout(0.5)(hidden2)

    # Output layer (last fully connected layer)
    output = Dense(1, activation='sigmoid', name='output')(hidden2)

    # Compile model and define optimizer
    model = Model(inputs=main_input, outputs=output)
    adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [None]:
# Fit model and Cross-Validation, ARCHITECTURE 3 CONV + FULLY CONNECTED
epochs = 10
batch_size = 64

model = conv_fully()
model.fit(X_train, target_train, epochs=epochs, batch_size=batch_size)
loss, accuracy = model.evaluate(X_test, target_test, verbose=1)

print('\nFinal Cross-Validation Accuracy', accuracy, '\n')
# print_layers_dims(model)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

Final Cross-Validation Accuracy 0.8482142686843872 



In [36]:
from sklearn.model_selection import KFold
base_dir = 'results'
n_loops = 1
n_folds = 5
n_samples = X.shape[0]
for i in range(n_loops):
  cv = KFold(n_splits=n_folds, shuffle=True, random_state = i)
  for fold, (train_idx, test_idx) in enumerate(cv.split(X,target)):
    path_dir = base_dir +'/' + "URLdatasetX2_1sub5" + '_run_'+str(i)+'_'+ 'fold_'+str(fold)+'_'
    print('Run: ', i, ', fold: ', fold)
    X_train = X[train_idx]
    X_test = X[test_idx]
    y_train = target[train_idx]
    y_test = target[test_idx]
    model = conv_fully()
    model.fit(X_train, y_train)
    #y_predict = tf.argmax(model.predict(X_test), axis=-1)
    y_pred_score = model.predict(X_test).flatten()


    # Chia giá trị từ min đến max thành 20 đoạn
    thresholds = np.linspace(y_pred_score.min(), y_pred_score.max(), 20)

    best_threshold = 0
    best_f1 = 0

    # Duyệt qua các threshold để tìm threshold có f1_score tốt nhất
    for threshold in thresholds:
        y_predict = (y_pred_score > threshold).astype(int)
        score = f1_score(y_test, y_predict, pos_label=1, zero_division=0.0)
        if score > best_f1:
            best_f1 = score
            best_threshold = threshold

    print("Best threshold: ", best_threshold)
    print("Best F1 score: ", best_f1)
    y_predict = (y_pred_score > best_threshold).astype(int)
    print(y_predict)

    df = pd.DataFrame({'Actual': y_test, 'Predicted': y_predict, 'y_score': y_pred_score})
    df.to_csv(path_dir + "LSTMConvFully_labels.csv", index=False)


Run:  0 , fold:  0
Best threshold:  0.1772897768961756
Best F1 score:  0.1818181818181818
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
Run:  0 , fold:  1
Best threshold:  0.10744

In [None]:
y_pred = tf.argmax(model.predict(X_test), axis=-1)
print("F1 score: ", f1_score(y_pred, target_test))
save_predictions_to_csv(target_test, y_pred, 'PredictionsSimpleLSTM3.csv')

F1 score:  0.0


In [None]:
model_name = "deeplearning_1DConv"
# save_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
# model = load_model(DATA_HOME + model_name + ".json", DATA_HOME + model_name + ".h5")
#print_layers_dims(model)
# model.save(DATA_HOME + model_name + ".keras")
# model = keras.models.load_model(DATA_HOME + model_name + ".keras")

## Making a new prediction

In [None]:
test_url_mal = "naureen.net/etisalat.ae/index2.php"
test_url_benign = "sixt.com/php/reservation?language=en_US"

url = test_url_mal

In [None]:
# Step 1: Convert raw URL string in list of lists where characters that are contained in "printable" are stored encoded as integer
url_int_tokens = [[printable.index(x) + 1 for x in url if x in printable]]

# Step 2: Cut URL string at max_len or pad with zeros if shorter
max_len=75
XY = sequence.pad_sequences(url_int_tokens, maxlen=max_len)

In [None]:
target_proba = model.predict(XY, batch_size=1)
def print_result(proba):
    if proba > 0.5:
        return "malicious"
    else:
        return "benign"
print("Test URL:", url, "is", print_result(target_proba[0]))

Test URL: naureen.net/etisalat.ae/index2.php is benign


In [None]:
import shutil

# Nén thư mục /content/results thành tệp results.zip
shutil.make_archive('/content/LSTMConv', 'zip', '/content/LSTMConv')

# Sử dụng hàm download của Google Colab để tải tệp .zip về máy
from google.colab import files
files.download('/content/LSTMConv.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>