In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report,confusion_matrix

import tldextract


def url2domain(url, exclude_subdomains=False):
    """Return the host name of `url` as 'subdomain.domain.suffix'.

    The URL is split with `tldextract.extract`; empty parts are left out of
    the result.

    `exclude_subdomains` controls whether the subdomain part is kept:
      * a list  -> the subdomain is dropped when it is an element of the list
      * truthy  -> the subdomain is always dropped
      * falsy   -> the subdomain is always kept (default)
    """
    parts = tldextract.extract(url)
    subdomain = parts.subdomain

    keep_subdomain = False
    if subdomain != '':
        if isinstance(exclude_subdomains, list):
            # The whole subdomain string is compared against the list entries.
            keep_subdomain = subdomain not in exclude_subdomains
        else:
            keep_subdomain = not exclude_subdomains

    labels = [subdomain] if keep_subdomain else []
    labels.append(parts.domain)
    if parts.suffix != '':
        labels.append(parts.suffix)
    return '.'.join(labels)


# Character n-gram size: used for the vectorizer's ngram_range and when
# encoding each domain with find_ngrams.
NGRAMS = 2
# Length passed as `maxlen` to pad_sequences for the train and test sequences.
FEATURE_LEN = 128
# Number of epochs passed to model.fit.
EPOCHS = 15
# Number of domains taken from each source (ranked list and phishing list).
SAMPLES = 50000

# Load the PhishTank 2017 dump, discard rows that have no `domain` value and
# normalise the remaining values with url2domain (the subdomain is dropped
# only when it is exactly 'www').
df = pd.read_csv('./data/phishtank_2017.csv.bz2')
df.dropna(subset=['domain'], inplace=True)
df['domain'] = df['domain'].apply(
    lambda raw: url2domain(raw, exclude_subdomains=['www']))
df

Unnamed: 0,ID,Phish URL,Submitted,Valid?,Online?,url,date,domain
0,4717300,http://g28.ycxafz.biz/added on Jan 1st 2017 6:...,by cleanmx,VALID PHISH,Offline,http://g28.ycxafz.biz/,2017-01-01 06:45:00,g28.ycxafz.biz
1,4717281,http://sistemas.miranda.gob.ve/1/Sign%20on/add...,by cleanmx,VALID PHISH,Offline,http://sistemas.miranda.gob.ve/1/Sign%20on/,2017-01-01 05:46:00,sistemas.miranda.gob.ve
2,4717246,http://sheehydaringproject.com/PDFILES/added o...,by cleanmx,VALID PHISH,Offline,http://sheehydaringproject.com/PDFILES/,2017-01-01 04:15:00,sheehydaringproject.com
3,4717232,http://monar-kielce.pl/templates/beez/gdoc/ind...,by cleanmx,VALID PHISH,Offline,http://monar-kielce.pl/templates/beez/gdoc/ind...,2017-01-01 03:15:00,monar-kielce.pl
4,4717222,http://www.w-reia.com/bremer/new.php?cmd=login...,by cleanmx,VALID PHISH,Offline,http://www.w-reia.com/bremer/new.php?cmd=login...,2017-01-01 02:46:00,w-reia.com
5,4717219,http://fm.registrovotorantim.com.br/.connect.h...,by cleanmx,VALID PHISH,ONLINE,http://fm.registrovotorantim.com.br/.connect.html,2017-01-01 02:46:00,fm.registrovotorantim.com.br
6,4717215,http://etiissallat.bugs3.com/Alibaba/alibaba/a...,by cleanmx,VALID PHISH,Offline,http://etiissallat.bugs3.com/Alibaba/alibaba/a...,2017-01-01 02:46:00,etiissallat.bugs3.com
7,4717170,http://iniciasecion4.webcindario.com/app/faceb...,by prebytes,VALID PHISH,Offline,http://iniciasecion4.webcindario.com/app/faceb...,2017-01-01 01:18:00,iniciasecion4.webcindario.com
8,4717169,http://collectsuccess.com.au/language/CD/CD/in...,by prebytes,VALID PHISH,Offline,http://collectsuccess.com.au/language/CD/CD/in...,2017-01-01 01:18:00,collectsuccess.com.au
9,4717161,http://www.vaastumahajeevan.com/TT/Submit/Offi...,by prebytes,VALID PHISH,Offline,http://www.vaastumahajeevan.com/TT/Submit/Offi...,2017-01-01 01:17:00,vaastumahajeevan.com


In [2]:
# Keep a single row per normalised domain.
sdf = df.drop_duplicates(subset=['domain'])
sdf

Unnamed: 0,ID,Phish URL,Submitted,Valid?,Online?,url,date,domain
0,4717300,http://g28.ycxafz.biz/added on Jan 1st 2017 6:...,by cleanmx,VALID PHISH,Offline,http://g28.ycxafz.biz/,2017-01-01 06:45:00,g28.ycxafz.biz
1,4717281,http://sistemas.miranda.gob.ve/1/Sign%20on/add...,by cleanmx,VALID PHISH,Offline,http://sistemas.miranda.gob.ve/1/Sign%20on/,2017-01-01 05:46:00,sistemas.miranda.gob.ve
2,4717246,http://sheehydaringproject.com/PDFILES/added o...,by cleanmx,VALID PHISH,Offline,http://sheehydaringproject.com/PDFILES/,2017-01-01 04:15:00,sheehydaringproject.com
3,4717232,http://monar-kielce.pl/templates/beez/gdoc/ind...,by cleanmx,VALID PHISH,Offline,http://monar-kielce.pl/templates/beez/gdoc/ind...,2017-01-01 03:15:00,monar-kielce.pl
4,4717222,http://www.w-reia.com/bremer/new.php?cmd=login...,by cleanmx,VALID PHISH,Offline,http://www.w-reia.com/bremer/new.php?cmd=login...,2017-01-01 02:46:00,w-reia.com
5,4717219,http://fm.registrovotorantim.com.br/.connect.h...,by cleanmx,VALID PHISH,ONLINE,http://fm.registrovotorantim.com.br/.connect.html,2017-01-01 02:46:00,fm.registrovotorantim.com.br
6,4717215,http://etiissallat.bugs3.com/Alibaba/alibaba/a...,by cleanmx,VALID PHISH,Offline,http://etiissallat.bugs3.com/Alibaba/alibaba/a...,2017-01-01 02:46:00,etiissallat.bugs3.com
7,4717170,http://iniciasecion4.webcindario.com/app/faceb...,by prebytes,VALID PHISH,Offline,http://iniciasecion4.webcindario.com/app/faceb...,2017-01-01 01:18:00,iniciasecion4.webcindario.com
8,4717169,http://collectsuccess.com.au/language/CD/CD/in...,by prebytes,VALID PHISH,Offline,http://collectsuccess.com.au/language/CD/CD/in...,2017-01-01 01:18:00,collectsuccess.com.au
9,4717161,http://www.vaastumahajeevan.com/TT/Submit/Offi...,by prebytes,VALID PHISH,Offline,http://www.vaastumahajeevan.com/TT/Submit/Offi...,2017-01-01 01:17:00,vaastumahajeevan.com


In [3]:
# Per-class domain counts. The frame displayed above has no 'target' column,
# in which case the group-by raises KeyError and this cell does nothing.
# Only that expected error is ignored, so any other failure still surfaces
# instead of being silently swallowed by a bare `except`.
try:
    sdf.groupby('target').agg({'domain': 'count'})
except KeyError:
    pass

In [4]:
# Ranked domain list; the file has no header row, so the two column names
# are supplied while reading.
adf = pd.read_csv('./data/top-1m.csv.zip', header=None, names=['rank', 'domain'])
adf

Unnamed: 0,rank,domain
0,1,google.com
1,2,youtube.com
2,3,facebook.com
3,4,baidu.com
4,5,wikipedia.org
5,6,yahoo.com
6,7,google.co.in
7,8,reddit.com
8,9,qq.com
9,10,taobao.com


In [5]:
# Same number of rows from each source: the first SAMPLES rows of the ranked
# list, and SAMPLES de-duplicated phishing domains drawn with a fixed seed.
ldf = adf.head(SAMPLES)[['domain']]
pdf = sdf[['domain']].sample(n=SAMPLES, random_state=21)

In [6]:
# Label both samples and stack them into a single frame.
# `assign` returns new frames instead of writing a column into `ldf` / `pdf`,
# which were derived from `adf` / `sdf`; the in-place assignment can trigger
# pandas' SettingWithCopyWarning.
ldf = ldf.assign(phishing=False)
pdf = pdf.assign(phishing=True)
tdf = pd.concat([ldf, pdf])
tdf

Unnamed: 0,domain,phishing
0,google.com,False
1,youtube.com,False
2,facebook.com,False
3,baidu.com,False
4,wikipedia.org,False
5,yahoo.com,False
6,google.co.in,False
7,reddit.com,False
8,qq.com,False
9,taobao.com,False


## Preprocessing the input data

In [7]:
# Build the character n-gram vocabulary over all domains (case preserved).
# max_df / min_df prune n-grams by document frequency.
vect = CountVectorizer(analyzer='char', max_df=0.3, min_df=3, ngram_range=(NGRAMS, NGRAMS), lowercase=False)
a = vect.fit_transform(tdf.domain)
vocab = vect.vocabulary_

# Total count of every n-gram, obtained with one column-wise sum of the
# document-term matrix instead of slicing a sparse column per n-gram.
ngram_totals = np.asarray(a.sum(axis=0)).ravel()

# Sort n-grams by frequency (highest -> lowest); ties are ordered by the
# n-gram string, also descending. The position of an n-gram in `words_list`
# is the id used when domains are encoded.
words = sorted(((ngram_totals[col], ngram) for ngram, col in vocab.items()),
               reverse=True)
words_list = [ngram for _, ngram in words]
num_words = len(words_list)
print("num_words = %d" % num_words)


# n-gram -> position in `words_list`, built once so that encoding a domain
# does not rescan the whole list for every n-gram. `setdefault` keeps the
# first position of an entry, which is what `list.index` would return.
ngram_index = {}
for _pos, _ngram in enumerate(words_list):
    ngram_index.setdefault(_ngram, _pos)


def find_ngrams(text, n):
    """Encode `text` as the ids of its consecutive character n-grams.

    Each n-gram is mapped to its position in `words_list`; n-grams that are
    not in the vocabulary are mapped to 0. A text shorter than `n` yields an
    empty list.

    NOTE(review): 0 is also the id of the first (most frequent) vocabulary
    entry, so unknown n-grams cannot be told apart from it - and presumably
    not from the zero padding added later by pad_sequences either; confirm
    whether ids should be shifted by one.
    """
    shifted = [text[i:] for i in range(n)]
    return [ngram_index.get(''.join(chars), 0) for chars in zip(*shifted)]

# Encode every domain as its sequence of n-gram ids; sequences differ in
# length, so the result is an object array holding one list per domain.
ngram_sequences = tdf.domain.apply(lambda domain: find_ngrams(domain, NGRAMS))
X = np.array(ngram_sequences)
X

num_words = 1464


array([list([124, 100, 129, 312, 38, 6, 0, 0, 0]),
       list([290, 66, 140, 165, 214, 122, 6, 0, 0, 0]),
       list([186, 37, 62, 56, 159, 100, 215, 139, 0, 0, 0]), ...,
       list([82, 7, 18, 15, 7, 76, 32, 2, 60, 144, 103, 176, 180, 198, 57, 385, 108, 210, 107, 9, 1, 82, 13, 31, 80, 14, 61, 144, 406]),
       list([304, 207, 14, 21, 28, 188, 811, 1169, 58, 1, 11, 23, 0, 0, 0]),
       list([288, 45, 53, 8, 4, 1, 344, 88, 331, 127, 229, 207, 27, 0, 0, 0, 19, 73, 96])],
      dtype=object)

In [8]:
# Longest and mean (truncated to int) number of n-gram ids per domain.
X_len = [len(seq) for seq in X]

max_feature_len = max(X_len)
avg_feature_len = int(np.mean(X_len))

print("Max feature len = %d, Avg. feature len = %d" % (max_feature_len, avg_feature_len))

Max feature len = 62, Avg. feature len = 15


In [9]:
# Integer class labels: the category codes of the `phishing` column.
label_codes = tdf.phishing.astype('category').cat.codes
y = np.array(label_codes)

# Stratified 80/20 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=21)

## Train an LSTM model

In [10]:
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding, Dropout, Activation
from keras.layers import LSTM
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.models import load_model

# Settings for sequence preparation and training.
max_features = num_words    # size of the n-gram vocabulary
feature_len = FEATURE_LEN   # `maxlen` every sequence is padded/truncated to
batch_size = 32

print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train, X_test = [sequence.pad_sequences(part, maxlen=feature_len)
                   for part in (X_train, X_test)]
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

# Labels are integer codes starting at 0, so the class count is max + 1.
num_classes = 1 + np.max(y_train)
print(num_classes, 'classes')

print('Convert class vector to binary class matrix '
      '(for use with categorical_crossentropy)')
y_train, y_test = [keras.utils.to_categorical(labels, num_classes)
                   for labels in (y_train, y_test)]
print('y_train shape:', y_train.shape)
print('y_test shape:', y_test.shape)

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


80000 train sequences
20000 test sequences
Pad sequences (samples x time)
X_train shape: (80000, 128)
X_test shape: (20000, 128)
2 classes
Convert class vector to binary class matrix (for use with categorical_crossentropy)
y_train shape: (80000, 2)
y_test shape: (20000, 2)


In [11]:
print('Build model...')

# Embedding of the n-gram ids -> one LSTM layer -> softmax over the classes.
model = Sequential()
model.add(Embedding(num_words, 32, input_length=feature_len))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(num_classes, activation='softmax'))

# try using different optimizers and different optimizer configs
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the cell output.
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 32)           46848     
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               82432     
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 258       
Total params: 129,538
Trainable params: 129,538
Non-trainable params: 0
_________________________________________________________________
None


In [12]:
print('Train...')
# The last 10% of the training data is held out for per-epoch validation.
model.fit(
    X_train,
    y_train,
    epochs=EPOCHS,
    batch_size=batch_size,
    validation_split=0.1,
    verbose=2)

# evaluate() returns the loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test, y_test, batch_size=batch_size, verbose=2)
score, acc = test_metrics
print('Test score:', score)
print('Test accuracy:', acc)

Train...
Train on 72000 samples, validate on 8000 samples
Epoch 1/15
 - 565s - loss: 0.4667 - acc: 0.7718 - val_loss: 0.4319 - val_acc: 0.7883
Epoch 2/15
 - 550s - loss: 0.4311 - acc: 0.7917 - val_loss: 0.4207 - val_acc: 0.7989
Epoch 3/15
 - 539s - loss: 0.4201 - acc: 0.7970 - val_loss: 0.4203 - val_acc: 0.7910
Epoch 4/15
 - 557s - loss: 0.4112 - acc: 0.8028 - val_loss: 0.4156 - val_acc: 0.8009
Epoch 5/15
 - 546s - loss: 0.4044 - acc: 0.8054 - val_loss: 0.4235 - val_acc: 0.7963
Epoch 6/15
 - 542s - loss: 0.3974 - acc: 0.8083 - val_loss: 0.4121 - val_acc: 0.8039
Epoch 7/15
 - 560s - loss: 0.3917 - acc: 0.8115 - val_loss: 0.4199 - val_acc: 0.7990
Epoch 8/15
 - 549s - loss: 0.3865 - acc: 0.8151 - val_loss: 0.4119 - val_acc: 0.8011
Epoch 9/15
 - 553s - loss: 0.3823 - acc: 0.8166 - val_loss: 0.4138 - val_acc: 0.8003
Epoch 10/15
 - 552s - loss: 0.3761 - acc: 0.8203 - val_loss: 0.4086 - val_acc: 0.8115
Epoch 11/15
 - 526s - loss: 0.3721 - acc: 0.8229 - val_loss: 0.4139 - val_acc: 0.8107
Epoch

## Confusion Matrix

In [13]:
# Class probabilities for the test set (one column per class, softmax output).
p = model.predict(X_test, verbose=2)
# Predicted class = index of the most probable column. `predict_classes` and
# `predict_proba` no longer exist in current Keras releases; `predict` plus
# argmax gives the same values and runs inference once instead of twice.
y_pred = np.argmax(p, axis=-1)
target_names = list(tdf.phishing.astype('category').cat.categories)

In [14]:
# The report needs string class names and integer true labels; y_test is
# one-hot encoded, so the labels are recovered with argmax.
target_names = list(map(str, target_names))
y_true = np.argmax(y_test, axis=1)
print(classification_report(y_true, y_pred, target_names=target_names))
print(confusion_matrix(y_true, y_pred))


             precision    recall  f1-score   support

      False       0.78      0.84      0.81     10000
       True       0.83      0.76      0.79     10000

avg / total       0.80      0.80      0.80     20000

[[8429 1571]
 [2432 7568]]


## Save model

In [15]:
# Persist the trained model to an HDF5 file under ./models.
model.save('./models/phish_cat_lstm_2017.h5')

In [16]:
# Store the vocabulary in id order (row position == n-gram id) next to the
# model so that new domains can be encoded the same way.
words_df = pd.DataFrame({'vocab': words_list})
words_df.to_csv('./models/phish_cat_vocab_2017.csv', encoding='utf-8', index=False)