In [None]:
!pip install tldextract

In [None]:
import pandas as pd
from tldextract import extract
from google.colab import drive
# Mount Google Drive at /content/drive; the CSV files read below live under "My Drive".
drive.mount('/content/drive')

**Malicious URLs**

In [None]:
# Load the verified-phishing feed and preview its first rows.
malicious_df = pd.read_csv(
    filepath_or_buffer="drive/My Drive/Colab Notebooks/Phishing_URL/verified_online.csv"
)
malicious_df.head(5)

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,6471412,https://www.accesodigitalfirma.com//bancos.php,http://www.phishtank.com/phish_detail.php?phis...,2020-03-27T02:32:06+00:00,yes,2020-03-27T02:34:33+00:00,yes,Other
1,6471402,https://iticadregisterbonus.ga/,http://www.phishtank.com/phish_detail.php?phis...,2020-03-27T02:00:23+00:00,yes,2020-03-27T02:01:46+00:00,yes,Other
2,6471389,https://www.mibcp.idlemovil.com/iniciar-sesion,http://www.phishtank.com/phish_detail.php?phis...,2020-03-27T01:45:10+00:00,yes,2020-03-27T01:45:24+00:00,yes,Other
3,6471388,https://jhtys.net/index/login/index.html,http://www.phishtank.com/phish_detail.php?phis...,2020-03-27T01:39:35+00:00,yes,2020-03-27T01:40:24+00:00,yes,Other
4,6471386,https://sicurezza-covid19.com/,http://www.phishtank.com/phish_detail.php?phis...,2020-03-27T01:36:00+00:00,yes,2020-03-27T01:39:08+00:00,yes,Other


In [None]:
# Keep only the raw URL and tag every row as phishing (label 1).
# The column is selected by name rather than by dropping positions 0 and 2-7,
# so the cell stays correct if the CSV gains or reorders columns.
new_mal = malicious_df[["url"]].copy()
new_mal["label"] = 1
new_mal.head()

Unnamed: 0,url,label
0,https://www.accesodigitalfirma.com//bancos.php,1
1,https://iticadregisterbonus.ga/,1
2,https://www.mibcp.idlemovil.com/iniciar-sesion,1
3,https://jhtys.net/index/login/index.html,1
4,https://sicurezza-covid19.com/,1


In [None]:
# Size of the positive (phishing) class.
print(f"{len(new_mal)} phishing URLs")

13098 phishing URLs


**Handling Benign URLs**

In [None]:
# Load the benign host list (names= supplies the column header) and tag every row as benign (label 0).
benign_df = pd.read_csv(
    "drive/My Drive/Colab Notebooks/Phishing_URL/top-1m.csv",
    names=["url"],
).assign(label=0)
benign_df.head(5)

Unnamed: 0,url,label
0,google.com,0
1,microsoft.com,0
2,www.google.com,0
3,windowsupdate.com,0
4,netflix.com,0


In [None]:
# Down-sample the benign list to the size of the phishing set so both classes have equal weight.
# random_state pins the draw; without it every run trains on a different benign subset and
# none of the numbers shown further down can be reproduced.
new_ben = benign_df.sample(n=new_mal.shape[0], random_state=28)
print(str(new_ben.shape[0]) + " normal URLs")

13098 normal URLs


In [None]:
# Stack the phishing rows on top of the benign sample.
# ignore_index=True gives the result one fresh 0..n-1 index: the sampled benign rows would
# otherwise keep their original row labels, which can collide with the phishing rows' labels
# and make any later label-based lookup or alignment ambiguous.
dataframes = [new_mal, new_ben]
dataset = pd.concat(dataframes, ignore_index=True)
dataset.tail()

Unnamed: 0,url,label
945097,depkominfo.go.id,0
396710,realdmp.realclick.co.kr,0
607968,storage-ec2-924.sharefile.com,0
215241,autoimg.cn.ccgslb.com.cn,0
825069,www.hoopsrumors.com,0


**Extract the domain name**

In [None]:
# Collapse each URL / hostname to the `domain` part reported by tldextract
# (scheme, subdomain, suffix and path are discarded).
dataset["url"] = dataset["url"].apply(lambda raw_url: extract(raw_url).domain)
dataset.head(5)

Unnamed: 0,url,label
0,accesodigitalfirma,1
1,iticadregisterbonus,1
2,idlemovil,1
3,jhtys,1
4,sicurezza-covid19,1


In [None]:
# Character vocabulary used to encode domains: position in this list + 1 is the token id
# (0 is left free for padding). Order matters: lowercase, uppercase, digits, then "-", "_", ".".
domain_char_list = list(
    "abcdefghijklmnopqrstuvwxyz"
    "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    "0123456789"
    "-_."
)
len(domain_char_list)


65

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Row count before any filtering.
len(dataset)

26196

In [None]:
# Filtering out IP addresses (any extracted value that still contains a literal dot).
# regex=False matches the dot as plain text instead of relying on an escaped pattern, and
# .copy() makes the result an independent frame so that later column assignments on
# `dataset` do not raise SettingWithCopyWarning.
dataset = dataset[~dataset['url'].str.contains(".", regex=False)].copy()
dataset.shape[0]

25888

In [None]:
# 1-based token id for every vocabulary character (0 stays reserved for padding).
# A dict lookup replaces the linear list.index() scan that ran once per character.
_char_to_index = {ch: pos for pos, ch in enumerate(domain_char_list, start=1)}


def _encode_domain(domain):
  """Turn a domain string into its list of 1-based character ids.

  Raises:
    ValueError: if the domain contains a character missing from domain_char_list
      (same exception type as list.index(), but naming the offending character and domain).
  """
  try:
    return [_char_to_index[ch] for ch in domain]
  except KeyError as err:
    raise ValueError(
        "character %r in domain %r is not in domain_char_list" % (err.args[0], domain)
    ) from None


# assign() builds a new frame instead of writing into the filtered one, which avoids
# SettingWithCopyWarning and keeps the cell safe to reason about.
dataset = dataset.assign(url=dataset['url'].map(_encode_domain))
dataset

Unnamed: 0,url,label
0,"[1, 3, 3, 5, 19, 15, 4, 9, 7, 9, 20, 1, 12, 6,...",1
1,"[9, 20, 9, 3, 1, 4, 18, 5, 7, 9, 19, 20, 5, 18...",1
2,"[9, 4, 12, 5, 13, 15, 22, 9, 12]",1
3,"[10, 8, 20, 25, 19]",1
4,"[19, 9, 3, 21, 18, 5, 26, 26, 1, 63, 3, 15, 22...",1
...,...,...
945097,"[4, 5, 16, 11, 15, 13, 9, 14, 6, 15]",0
396710,"[18, 5, 1, 12, 3, 12, 9, 3, 11]",0
607968,"[19, 8, 1, 18, 5, 6, 9, 12, 5]",0
215241,"[3, 3, 7, 19, 12, 2]",0


In [None]:
# Targets: the 0/1 class column, taken as-is.
labels = dataset['label']
# Inputs: right-pad every encoded domain with zeros up to the longest one.
features = pad_sequences(dataset['url'].tolist(), padding='post')

In [None]:
# Only the last expression of a cell is displayed, so the shape has to be printed explicitly;
# as a bare expression on the first line it was evaluated and silently discarded.
print(features.shape)
labels.head()

0    1
1    1
2    1
3    1
4    1
Name: label, dtype: int64

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

def build_model(embedding_dim=256, lstm_units=128, dropout=0.2):
  """Build and compile the character-level LSTM classifier.

  Architecture: Embedding -> LSTM -> Dense(1, sigmoid), compiled with Adam and
  binary cross-entropy, reporting accuracy.

  Args:
    embedding_dim: size of each character embedding vector.
    lstm_units: number of units in the LSTM layer.
    dropout: dropout rate applied to the LSTM inputs.

  Returns:
    A compiled Sequential model. Calling build_model() with no arguments
    produces the same network as before these parameters existed.
  """
  # Token ids run from 1 to len(domain_char_list); id 0 is the padding value,
  # hence the +1 on the vocabulary size and mask_zero=True so padding is skipped.
  num_chars = len(domain_char_list)
  model = Sequential()
  model.add(Embedding(input_dim=num_chars+1, output_dim=embedding_dim, mask_zero=True))
  # return_sequences=False: only the final LSTM output feeds the classifier head.
  model.add(LSTM(units=lstm_units, dropout=dropout, recurrent_dropout=0.0, return_sequences=False))
  model.add(Dense(units=1, activation='sigmoid'))
  model.compile(optimizer='adam', loss='binary_crossentropy', metrics=["accuracy"])
  return model

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X = features
y = labels
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=28,
)

In [None]:
# Train a freshly built network; the held-out split doubles as the validation set.
model = build_model()
model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=16,
    validation_data=(X_test, y_test),
)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16


<tensorflow.python.keras.callbacks.History at 0x7f4c48835198>

In [None]:
# Spot-check the trained model on a few hand-picked URLs.
test_domains = ["https://www.google.com", "https://www.quora.com/How-do-I-calculate-the-entropy-of-words", "https://www.sdjfsid983r8ff328h.com/"]

# Same preprocessing as the training data: domain label -> 1-based character ids -> zero post-padding.
test_domain_labels = [extract(url).domain for url in test_domains]
test_domains_encoded = [
    [domain_char_list.index(ch) + 1 for ch in label] for label in test_domain_labels
]
test_domains_X = pad_sequences(test_domains_encoded, padding='post')

# predict() returns one row per input with a single sigmoid output; ravel() flattens it to 1-D
# so it is a valid column. The frame is built once, so the "domain" column always holds the
# original URLs instead of being overwritten with encodings and restored afterwards.
# Training labels are 1 = phishing and 0 = benign, so scores near 1 mean "predicted phishing".
test_domains_df = pd.DataFrame({
    "domain": test_domains,
    "predictions": model.predict(test_domains_X).ravel(),
})
test_domains_df.head()

Unnamed: 0,domain,predictions
0,https://www.google.com,0.954598
1,https://www.quora.com/How-do-I-calculate-the-e...,0.936769
2,https://www.sdjfsid983r8ff328h.com/,0.008544
