In [None]:
# Thank you Vincent!

# Using Deep Learning and Machine Learning to predict malicious urls
# https://github.com/joshsaxe/eXposeDeepNeuralNetwork

In [1]:
# Load the data
from urlparse import urlparse
import pandas as pd
# Load the labelled URL sample (path is relative to the notebook's working directory).
urls = pd.read_json("data/url_nn.json")
# Parenthesised form prints the same on Python 2 and also runs on Python 3.
print(urls.shape)
# The stored strings come without a scheme; prefix one (presumably so that
# urlparse() reports the host as the netloc rather than as part of the path).
urls['string'] = "http://" + urls['string']

(5000, 3)


In [146]:
urls.head(10)  # preview the first ten rows

Unnamed: 0,pred,string,truth
0,1.574204e-05,http://startbuyingstocks.com/,0
1,1.840909e-05,http://qqcvk.com/,0
2,1.84208e-05,http://432parkavenue.com/,0
3,7.954729e-07,http://gamefoliant.ru/,0
4,3.239338e-06,http://orka.cn/,0
5,0.0003043137,http://media2.mercola.com/,0
6,4.107331e-37,http://ping.chartbeat.net/ping?h=sltrib.com&p=...,0
7,1.664497e-07,http://stephensteels.com/,0
8,1.400715e-05,http://kbd-eko.pl/,0
9,2.273991e-05,http://ceskaposta.cz/,0


In [3]:
# Features are the raw URL strings; the target is the `truth` label column.
X = urls['string']
y = urls['truth']

In [203]:
X.head()  # preview the feature column (URL strings)

0    http://startbuyingstocks.com/
1                http://qqcvk.com/
2        http://432parkavenue.com/
3           http://gamefoliant.ru/
4                  http://orka.cn/
Name: string, dtype: object

In [204]:
y.head()  # preview the target labels

0    0
1    0
2    0
3    0
4    0
Name: truth, dtype: int64

In [4]:
y.value_counts(normalize=True)  # class proportions; the majority-class share is the null accuracy a model has to beat

0    0.9694
1    0.0306
Name: truth, dtype: float64

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

In [206]:
# EXERCISE

# Create a function called custom_tokenizer that takes in a string and outputs a list of tokens of the string.
# You may choose how few or how many tokens to return.

# eg.

# NOTE: custom_tokenizer must already be defined in the kernel for these calls to run.
print(custom_tokenizer('https://google.com'))

print(custom_tokenizer('https://google-so-not-fake.com?fake=False&seriously=True'))



['https', 'google', 'com']
['https', 'google', 'so', 'not', 'fake', 'com', 'fake=False&seriously=True']


In [172]:
import re

def custom_tokenizer(string):
    """Split a URL into a flat list of tokens.

    The URL is first broken into its ``urlparse`` components (scheme, netloc,
    path, params, query, fragment); empty components are skipped.  Each
    remaining component is then split on ``.`` and ``-``.  Other separators
    such as ``/``, ``=`` and ``&`` are left inside their tokens.
    """
    separator = re.compile("[.-]")
    return [
        piece
        for component in urlparse(string)
        if component
        for piece in separator.split(component)
    ]

In [173]:
custom_tokenizer('google.com')  # no scheme: the string is still split on the dot

['google', 'com']

In [174]:
custom_tokenizer('https://google.com')  # the scheme becomes its own token

['https', 'google', 'com']

In [175]:
custom_tokenizer('https://google-so-not-fake.com?fake=False&seriously=True')  # hyphens split the host; the query string stays one token

['https', 'google', 'so', 'not', 'fake', 'com', 'fake=False&seriously=True']

In [176]:
# Token-count features, with tokenisation delegated to custom_tokenizer.
vect = CountVectorizer(tokenizer=custom_tokenizer)

In [177]:
vect.fit_transform(X)  # learn the vocabulary on all URLs and display the resulting sparse matrix

<5000x11502 sparse matrix of type '<type 'numpy.int64'>'
	with 31293 stored elements in Compressed Sparse Row format>

In [178]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Baseline: token counts fed into a default logistic regression.
lr = LogisticRegression()
lr_pipe = Pipeline(steps=[
    ('vect', vect),
    ('model', lr),
])

In [179]:
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split

In [180]:
# 5-fold cross-validated score of the logistic-regression pipeline.
scores = cross_val_score(lr_pipe, X, y, cv=5)

scores.mean()  # not good enough!! (compare against the null accuracy computed above)

0.98000238400238404

In [200]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

# Same vectoriser with a 500-tree random forest; seeded so the forest (and
# therefore the cross-validated score) is repeatable between runs.
rf_pipe = Pipeline([('vect', vect), ('model', RandomForestClassifier(n_estimators=500, random_state=42))])
scores = cross_val_score(rf_pipe, X, y, cv=5)

scores.mean()  # roughly on par with logistic regression

0.98100278480278469

In [183]:
from sklearn.metrics import confusion_matrix

# Fixed seed so the hold-out split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

rf_pipe.fit(X_train, y_train)

preds = rf_pipe.predict(X_test)
# Rows are true classes, columns are predicted classes.
print(confusion_matrix(y_test, preds))  # hmmmm

[[1214    0]
 [  25   11]]


In [184]:
# Keep only the second probability column (presumably class 1, given the 0/1 labels).
probs = rf_pipe.predict_proba(X_test)[:,1]

In [185]:
import numpy as np
# Sweep the decision threshold to expose the false-positive / false-negative trade-off.
for thresh in [.1, .2, .3, .4, .5, .6, .7, .8, .9]:
    preds = np.where(probs >= thresh, 1, 0)
    print(thresh)
    print(confusion_matrix(y_test, preds))
    # Explicit empty string: a bare `print` is a silent no-op on Python 3.
    print("")

0.1
[[1185   29]
 [  15   21]]

0.2
[[1206    8]
 [  20   16]]

0.3
[[1210    4]
 [  21   15]]

0.4
[[1213    1]
 [  22   14]]

0.5
[[1213    1]
 [  24   12]]

0.6
[[1214    0]
 [  25   11]]

0.7
[[1214    0]
 [  25   11]]

0.8
[[1214    0]
 [  27    9]]

0.9
[[1214    0]
 [  27    9]]



In [186]:
from keras.utils.np_utils import to_categorical
# One-hot encode the labels for the softmax output layer.
y_train_dummy = to_categorical(y_train)
num_classes = y_train_dummy.shape[1]
# Pin the width to the training encoding so both matrices always have the same
# number of columns, even if the test split happens to miss a class.
y_test_dummy = to_categorical(y_test, num_classes)

In [187]:
from keras.models import Model
from keras.layers import Dense, Dropout, Activation
from keras.models import Sequential
from keras.models import Model
from keras.optimizers import SGD

def mlp_model(input_layer, num_classes, optimizer="adam", epochs=25):
    """Build and compile a multi-layer perceptron classifier.

    Architecture: three blocks of Dense(1024) -> ReLU -> Dropout(0.25),
    followed by a softmax layer with ``num_classes`` units.

    Parameters
    ----------
    input_layer : int
        Number of input features (used as ``input_dim`` of the first layer).
    num_classes : int
        Number of units in the softmax output layer.
    optimizer : str, optional
        Accepted for interface compatibility but NOT used: the model is
        always compiled with the SGD optimizer configured below.
    epochs : int, optional
        Planned number of training epochs.  Only used to derive the
        learning-rate decay (``lrate / epochs``); must be positive.

    Returns
    -------
    The compiled ``Sequential`` model (categorical cross-entropy loss,
    accuracy metric).

    Raises
    ------
    ValueError
        If ``epochs`` is not positive (previously ``epochs=0`` surfaced as a
        ZeroDivisionError after the layers had already been built).
    """
    if epochs <= 0:
        raise ValueError("epochs must be positive, got %r" % (epochs,))

    hidden_units = 1024
    dropout_rate = 0.25

    model = Sequential()

    # Only the first layer needs the input dimension; later layers take their
    # input size from the preceding layer.
    model.add(Dense(hidden_units, input_dim=input_layer))
    model.add(Activation('relu'))
    model.add(Dropout(dropout_rate))

    # Two further identical hidden blocks.
    for _ in range(2):
        model.add(Dense(hidden_units))
        model.add(Activation('relu'))
        model.add(Dropout(dropout_rate))

    model.add(Dense(num_classes, activation='softmax'))

    # SGD with momentum; the decay is scaled by the planned number of epochs.
    lrate = 0.01
    decay = lrate / epochs
    sgd = SGD(lr=lrate, momentum=0.9, decay=decay, nesterov=False)

    model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])
    return model

In [188]:
num_epochs = 25

# Fresh vectoriser whose vocabulary is learned from the training split only.
vect = CountVectorizer(tokenizer=custom_tokenizer)

# Convert the sparse document-term matrices to dense arrays before handing them to Keras.
X_train_dtm = vect.fit_transform(X_train).toarray()
X_test_dtm = vect.transform(X_test).toarray()
# Input width = number of columns of the document-term matrix (the vocabulary size).
model = mlp_model(X_train_dtm.shape[1], num_classes, epochs=num_epochs)

# summary() prints the table itself and returns None, so wrapping it in print
# would emit a stray "None".
model.summary()

# Fit the model. NOTE: the held-out test split doubles as validation data here,
# so the per-epoch validation figures describe the same rows evaluated afterwards.
# `epochs` is the Keras 2 name of the legacy `nb_epoch` argument.
model.fit(X_train_dtm, y_train_dummy, validation_data=(X_test_dtm, y_test_dummy), batch_size=32, epochs=num_epochs)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 1024)              9228288   
_________________________________________________________________
activation_7 (Activation)    (None, 1024)              0         
_________________________________________________________________
dropout_7 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_10 (Dense)             (None, 1024)              1049600   
_________________________________________________________________
activation_8 (Activation)    (None, 1024)              0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 1024)              0         
_________________________________________________________________
dense_11 (Dense)             (None, 1024)              1049600   
__________

<keras.callbacks.History at 0x12351f850>

In [189]:
# Evaluate on the held-out split; index 1 is read as the accuracy metric the
# model was compiled with (index 0 presumably being the loss).
scores = model.evaluate(X_test_dtm, y_test_dummy, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 98.40%


In [190]:
# Spot-check two URLs; each output row holds one softmax probability per class.
model.predict(vect.transform(['http://creditscore.com/', 'http://acharefacil.com.br/app/webroot/js/latestfnb/']).toarray())

array([[  9.99973536e-01,   2.64957744e-05],
       [  3.43389392e-01,   6.56610668e-01]], dtype=float32)

In [191]:
# Confusion matrix of the Keras MLP at a 0.5 threshold on the class-1 probability.
probs = model.predict(vect.transform(X_test).toarray())[:,1]
preds = probs >= .5
print(confusion_matrix(y_test, preds))

[[1213    1]
 [  19   17]]


In [192]:
# Same 0.5-threshold confusion matrix for the random-forest pipeline, for comparison.
probs = rf_pipe.predict_proba(X_test)[:,1]
preds = probs >= .5
print(confusion_matrix(y_test, preds))

[[1213    1]
 [  24   12]]


In [193]:
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier

In [194]:
# scikit-learn's MLP with default settings; seeded so weight initialisation
# (and therefore the cross-validated score) is repeatable between runs.
nn_pipe = Pipeline([('vect', vect), ('model', MLPClassifier(random_state=42))])
scores = cross_val_score(nn_pipe, X, y, cv=5)

scores.mean()  # better I guess..

0.98420158720158724

In [195]:
# Refit on the training split and score the hold-out split at a 0.5 threshold.
nn_pipe.fit(X_train, y_train)
probs = nn_pipe.predict_proba(X_test)[:,1]
preds = probs >= .5
print(confusion_matrix(y_test, preds))

[[1213    1]
 [  20   16]]


In [196]:
false_negatives = X_test[(preds==0) & (y_test==1)]  # the false negatives: truth == 1 but predicted 0, i.e. the urls that "got through"

In [202]:
list(false_negatives)  # show the missed URLs as a plain list

[u'http://instalacionesmarmenor.com/instt/',
 u'http://datatech-ma.com/imagens/autorizados/wwww_dotz.com/acumulo_de_pontos/area_do_cliente/public_html/',
 u'http://220.243.235.13/6843319.s21d-6.faiusrd.com/0/abuiabblgaag9qgfrwuoxeygnqe?f=a&%5cu0005&wsiphost=local',
 u'http://get.ddlmedia1012.info/downloadmanager/get?clickid=y810ifrcxc',
 u'http://shatteredrealities.net/support/activate/information/11d37cf69e7e159ddfc549a51ca901c8/login.php',
 u'http://scholigoneon.org.cy/dropboxdocument/b642ffe1dd813bc5d6590a66ad16927c/',
 u'http://acharefacil.com.br/app/webroot/js/latestfnb/',
 u'http://usherandson.com/news/wp-content/plugins/jetpack/views/hotmail2/security.html',
 u'http://paypal1.com.case.507-917-425.jjfqmrbi5n76yw.pwx4qurtj2t.com.ve/cgi-bin/webscr/',
 u'http://malkarafestival.com/images/bottom.gif/?184ff=398332',
 u'http://purvanchalseva.in/wp-content/upgrade/images/applen/apple-en/en/978913f1b016db9fe72a4dcf841a98f4/',
 u'http://www.marcinderegowski.pl/content/galeria/11/0/min/d72