In [12]:
# Importing necessary libraries
import numpy as np
import pandas as pd
import urllib.parse
import tensorflow as tf
from tensorflow.keras import regularizers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Embedding, Dense, Dropout, Reshape
from tensorflow.keras.callbacks import EarlyStopping

# Previous pre-processing unit 
def convert_url(raw_url):
    # Tokenize the URL
    tokenized_url = urllib.parse.quote(raw_url)

    # Standardize the URL
    standardized_url = urllib.parse.urlsplit(tokenized_url).geturl()

    # Truncate or pad the URL
    max_url_length = 200  # Maximum length of the padded URL

    if len(standardized_url) > max_url_length:
        # Truncate the URL if it is longer than the maximum length
        truncated_url = standardized_url[-max_url_length:]
        padded_url = [ord(char) for char in truncated_url]
    else:
        # Pad the URL with zeros if it is shorter than the maximum length
        padded_url = [0] * (max_url_length - len(standardized_url)) + [ord(char) for char in standardized_url]
    
    return padded_url



# Example usage
raw_url = "https://example.com/path?param1=value1&param2=value2"
padded_url = convert_url(raw_url)
print(padded_url)

# Reading the training data
urldata = pd.read_csv('C:\\Users\\karan\\Downloads\\urldata.csv', encoding = "ISO-8859-1")

# taking only required columns into a new Dataframe
urldata = urldata.filter(['domain','label'],axis=1)

# Shuffle input randomly and then take ratio 0.7, 0.15, 0.15
data = urldata.sample(frac = 1, random_state = 9)

# Split the data into training, cross-validation and test datasets
train_ratio = 0.7
cv_ratio = 0.15
test_ratio = 0.15

# Compute number of each type of data 
total = len(data)
num_train = int(total * train_ratio)
num_cv = int(total * cv_ratio)
num_test = int(total * test_ratio)

# Split data into three
train_data = data[:num_train]
val_data = data[num_train:num_train + num_cv]
test_data = data[num_train + num_cv:]

print("num_train: ", num_train)
print("num_cv: ", num_cv)
print("num_test: ", num_test)

#print(train_data)

# Get the X_train, y_train, X_cv, y_cv, X_test, y_test
X_train = train_data['domain'].values
y_train = train_data['label'].values
X_train = list(X_train)
y_train = list(y_train)
X_cv = val_data['domain'].values
y_cv = val_data['label'].values
X_cv = list(X_cv)
y_cv = list(y_cv)
X_test = test_data['domain'].values
y_test = test_data['label'].values
X_test = list(X_test)
y_test = list(y_test)
#print(X_train[:5])
#print(y_train[:5])

# Convert the URLs into padded URLs using a function(to be vectorized)
for i in range(len(X_train)):
    X_train[i] = convert_url(X_train[i])
for i in range(len(X_cv)):
    X_cv[i] = convert_url(X_cv[i])
for i in range(len(X_test)):
    X_test[i] = convert_url(X_test[i])
print(X_train[:5])

for i in range(len(y_train)):
    y_train[i] = [y_train[i]]
for i in range(len(y_cv)):
    y_cv[i] = [y_cv[i]]
for i in range(len(y_test)):
    y_test[i] = [y_test[i]]

# Convert the inputs to numpy arrays
X_train = np.array(X_train)
y_train = np.array(y_train)
X_cv = np.array(X_cv)
y_cv = np.array(y_cv)
X_test = np.array(X_test)
y_test = np.array(y_test)
print("Shape of X_train: ", X_train.shape)
print("Shape of y_train: ", y_train.shape)

# Model making
inp_dim = 256
emb_dim = 32
seq_len = 200

model = Sequential([
    Embedding(input_dim = inp_dim, output_dim = emb_dim, input_length = seq_len),
    Flatten(),
    Dense(units = 512, activation = 'relu', kernel_regularizer=regularizers.l2(0.001)),
    Dense(units = 256, activation = 'relu', kernel_regularizer=regularizers.l2(0.001)),
    Dense(units = 128, activation = 'relu', kernel_regularizer=regularizers.l2(0.001)),
    Dense(units = 1, activation = 'linear', kernel_regularizer=regularizers.l2(0.001))
])

model.compile(
    loss = tf.keras.losses.BinaryCrossentropy(from_logits = True),
    optimizer = tf.keras.optimizers.Adam(0.001)
)

# Define early stopping callback
# patience - number of epochs to wait for a significant improvement in the loss
early_stop = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    X_train, y_train, validation_data=(X_cv, y_cv), epochs = 10, callbacks=[early_stop]
)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 104, 116, 116, 112, 115, 37, 51, 65, 47, 47, 101, 120, 97, 109, 112, 108, 101, 46, 99, 111, 109, 47, 112, 97, 116, 104, 37, 51, 70, 112, 97, 114, 97, 109, 49, 37, 51, 68, 118, 97, 108, 117, 101, 49, 37, 50, 54, 112, 97, 114, 97, 109, 50, 37, 51, 68, 118, 97, 108, 117, 101, 50]
num_train:  67139
num_cv:  14386
num_test:  14386


  urldata = pd.read_csv('C:\\Users\\karan\\Downloads\\urldata.csv', encoding = "ISO-8859-1")


[[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 119, 119, 119, 46, 115, 101, 114, 118, 101, 114, 98, 114, 115, 112, 46, 99, 111, 109, 46, 98, 114, 47, 115, 101, 110, 100, 101, 114, 47, 114, 101, 109, 46, 112, 104, 112], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [13]:
# Accuracy obtained on training and cross-validation set
output = model.predict(X_train)
for i in range(len(output)):
    if output[i] >= 0.5:
        output[i] = 1
    else:
        output[i] = 0
output = np.array(output)
res = sum(y_train == output)/len(y_train)
print("Accuracy on training set: ", res)
print(len(y_cv))

output = model.predict(X_cv)
for i in range(len(output)):
    if output[i] >= 0.5:
        output[i] = 1
    else:
        output[i] = 0
output = np.array(output)
res = sum(y_cv == output)/len(y_cv)
print("Accuracy on cross-validation set: ", res)

Accuracy on training set:  [0.94760125]
14386
Accuracy on cross-validation set:  [0.92026971]
