In [6]:
# build character map function for encoding URL string

import string
# Alphabet used to encode URLs. Index 0 is reserved for padding, every known
# character gets a 1-based index, and anything else maps to UNKNOWN_CHAR.
ascii_letters = string.ascii_letters  # indices 1-52
digits = string.digits  # indices 53-62
punctuation = string.punctuation  # indices 63-94
total_char = "".join((ascii_letters, digits, punctuation))

UNKNOWN_CHAR = len(total_char) + 1
TOTAL_FEATURES = UNKNOWN_CHAR + 1  # one extra slot for the padding integer 0
charmap = dict(zip(total_char, range(1, UNKNOWN_CHAR)))


def encodeChar(c):
    """Return the integer index of character `c`.

    Characters present in `charmap` map to their 1-based position in
    `total_char`; any other character maps to `UNKNOWN_CHAR`.
    """
    if c in charmap:
        return charmap[c]
    return UNKNOWN_CHAR


# Quick sanity check: a letter, the first letter, and an out-of-alphabet character.
encodeChar("x"), encodeChar("a"), encodeChar("我")

(24, 1, 95)

In [7]:
# load dataset

import pandas
import statistics
# Read the dataset and record the character length of every URL in a
# new "len" column.
df = pandas.read_csv("data.csv")

df["len"] = df["url"].map(len)

In [8]:
# Summary statistics (count, mean, quartiles, max) of the URL lengths.
df["len"].describe()

count    420464.000000
mean         48.342005
std          35.021279
min           1.000000
25%          29.000000
50%          41.000000
75%          58.000000
max        2307.000000
Name: len, dtype: float64

In [9]:
# Find a sequence length that covers most sample URLs.
# For each candidate length, print the percentage of URLs that are LONGER than
# it (i.e. that would not fit without truncation). According to the printed
# figures, only ~0.07% of URLs exceed 400 characters, so Length = 400 covers
# ~99.9% of the samples.
for t in [200, 300, 400, 500, 600, 700, 800, 900, 1000]:
    # Vectorized comparison + sum instead of a Python-level apply/sum per row.
    print("x={} {:.5f}%".format(t, 100 * (df.len > t).sum() / len(df.len)))

x=200 0.60172%
x=300 0.12034%
x=400 0.07420%
x=500 0.06469%
x=600 0.04614%
x=700 0.02545%
x=800 0.01784%
x=900 0.00880%
x=1000 0.00476%


In [29]:
# sampling train/test dataset

from sklearn.model_selection import train_test_split
from keras.utils import np_utils
from sklearn.preprocessing import LabelEncoder
# Integer-encode the labels on a NEW frame. Plain `df1 = df` would only alias
# the original, so assigning the encoded labels would silently overwrite the
# raw `df.label` column loaded earlier.
le = LabelEncoder()
df1 = df.assign(label=le.fit_transform(df.label))

# Hold out 20% of all rows ("preserved") that never take part in training,
# validation, or testing; the remaining 80% is the working subset.
sub_df, preserved_df = train_test_split(df1, test_size=0.2, random_state=1)
print(len(sub_df), len(preserved_df), len(df1))
categorical_label = np_utils.to_categorical(sub_df.label)

# Working subset -> 80% (train + val) / 20% test.
url_train, url_test, y_train, y_test \
    = train_test_split(sub_df.url, categorical_label, test_size=0.2, random_state=1)

# (train + val) -> 75% train / 25% val, i.e. val is 0.25 x 0.8 = 0.2 of the
# working subset, the same share as the test split.
url_train, url_val, y_train, y_val \
    = train_test_split(url_train, y_train, test_size=0.25, random_state=1)

print('Loading data...')
print(len(url_train), 'train sequences')
print(len(url_test), 'test sequences')
print(len(url_val), 'val sequences')

336371 84093 420464
Loading data...
201822 train sequences
67275 test sequences
67274 val sequences


In [30]:
# Some configurations 

# Embedding
# Vocabulary size seen by the Embedding layer (TOTAL_FEATURES is defined in the
# character-map cell and includes the padding index 0 and the unknown index).
max_features = TOTAL_FEATURES
# Fixed sequence length every URL is padded to. Per the length statistics
# printed earlier, only ~0.07% of URLs are longer than 400 characters
# (~99.9% coverage); original note: paper uses 96% coverage.
maxlen = 400 # ~99.9% coverage per the stats above, paper uses 96% coverage
embedding_size = 128

# Training
batch_size = 64 # paper param
# NOTE(review): the training cell in this notebook calls fit() with a literal
# epochs=3, not this constant.
epochs = 20 # paper param

# Convolution
# NOTE(review): the model-building cell in this notebook uses literal values
# (Conv1D(62, 3), MaxPooling1D(pool_size=2)) instead of these constants.
kernel_size = 5
filters = 64
pool_size = 2

# LSTM
# NOTE(review): the model-building cell uses a literal LSTM(64), not this value.
lstm_output_size = 70

# Dropout ratio
# Rate passed to the Dropout layer placed after the LSTM.
Dropout_ratio = 0.25

In [31]:
# encode each URL as a sequence of integer character indices (not one-hot vectors) and left-pad ('pre') every sequence to maxlen

from keras.preprocessing.sequence import pad_sequences
import numpy

def encode_urls(urls):
    """Integer-encode URLs character by character and pad them to `maxlen`.

    Parameters
    ----------
    urls : pandas.Series of str
        The URLs to encode.

    Returns
    -------
    The array returned by `pad_sequences`, one row per URL and `maxlen`
    columns. Shorter URLs are left-padded ('pre') with 0; longer URLs are
    cut according to `pad_sequences`' default `truncating` setting.
    """
    encoded = urls.apply(lambda url: numpy.array([encodeChar(c) for c in url]))
    return pad_sequences(encoded, maxlen=maxlen, padding='pre')


print('Pad sequences (samples x time)')

# One shared helper so the three splits are guaranteed to be encoded identically.
x_train = encode_urls(url_train)
x_test = encode_urls(url_test)
x_val = encode_urls(url_val)

print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)
print('x_val shape:', x_val.shape)

Pad sequences (samples x time)
x_train shape: (201822, 400)
x_test shape: (67275, 400)
x_val shape: (67274, 400)


In [39]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Conv1D, MaxPooling1D, GlobalMaxPool1D
from keras.optimizers import SGD

print('Build model...')

# Character-level CNN + LSTM classifier: embedding -> 1D convolution ->
# max-pooling -> LSTM -> dropout -> 2-way softmax.
model = Sequential(
    [
        Embedding(max_features, embedding_size, input_length=maxlen, trainable=True),
        Conv1D(62, 3, padding='valid', activation='relu', strides=1),
        MaxPooling1D(pool_size=2),
        LSTM(64),
        Dropout(Dropout_ratio),
        Dense(2, activation='softmax'),
    ],
    name="CNN-LSTM",
)

# Labels are one-hot encoded, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Build model...
Model: "CNN-LSTM"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 400, 128)          12288     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 398, 62)           23870     
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 199, 62)           0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                32512     
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 130       
Total params: 68,800
Trainable params: 68,800
Non-trainable params: 0
_______________________________________

In [41]:
print('Train...')

# Fit for 3 epochs while monitoring the validation split, then report loss and
# accuracy on the test split.
model.fit(
    x=x_train,
    y=y_train,
    batch_size=batch_size,
    epochs=3,
    validation_data=(x_val, y_val),
)
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)

print('Test score:', score)
print('Test accuracy:', acc)

Train...
Epoch 1/3
Epoch 2/3
Epoch 3/3
Test score: 0.0648501068353653
Test accuracy: 0.9780601859092712


In [42]:
# Loss / accuracy on the training split. Uses the shared `batch_size` like the
# other evaluate calls; a batch size of 2 only makes evaluation far slower.
model.evaluate(x_train, y_train, batch_size=batch_size)



[0.05257173255085945, 0.9825341105461121]

In [43]:
# Loss / accuracy on the validation split.
model.evaluate(x=x_val, y=y_val, batch_size=batch_size)



[0.06458394974470139, 0.9777477383613586]

In [44]:
# Evaluate on preserved_df, the rows set aside before the train/val/test split.
# The URLs are encoded and padded the same way as the other splits.
preserved_x = pad_sequences(
    [[encodeChar(c) for c in url] for url in preserved_df.url],
    maxlen=maxlen,
    padding='pre',
)

preserved_y = np_utils.to_categorical(preserved_df.label)
model.evaluate(x=preserved_x, y=preserved_y, batch_size=batch_size)



[0.06678904592990875, 0.9772513508796692]