# In-built Library imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
import math
from sklearn.model_selection import train_test_split
import csv

# importing required packages for this URL Feature Extraction
from urllib.parse import urlparse,urlencode
import ipaddress
import re

# Import the Dataset

In [2]:
phishing_urls = pd.read_csv('./ISCXURL2016/FinalDataset/URL/phishing_dataset.csv', names=['URL'])
print (phishing_urls.shape)

benign_urls = pd.read_csv('./ISCXURL2016/FinalDataset/URL/Benign_list_big_final.csv', names=['URL'])
print (benign_urls.shape)

(9965, 1)
(35378, 1)


# Randomly sample 10000 benign and 5000 phishing URLs with some state to re-produce the sampling

In [3]:
phishing_urls = phishing_urls.sample(n=5000, replace = False, random_state= 53, ignore_index=True)
benign_urls = benign_urls.sample(n=10000, replace = False, random_state= 67, ignore_index=True)

print (phishing_urls.shape)
print (benign_urls.shape)

(5000, 1)
(10000, 1)


# Add The Y_label to the data

In [4]:
benign_labels = np.array([0]*benign_urls.shape[0])
benign_urls['isPhishing'] = benign_labels
display (benign_urls.head(5))

phishing_labels = np.array([1]*phishing_urls.shape[0])
phishing_urls['isPhishing'] = phishing_labels
display (phishing_urls.head(5))

Unnamed: 0,URL,isPhishing
0,http://yourlust.com/videos/leggy-blonde-terry-...,0
1,http://olx.ro/i2/electronice-si-electrocasnice...,0
2,http://variety.com/2015/film/news/broken-holly...,0
3,http://mylust.com/videos/227407/bbw-mature-slu...,0
4,http://olx.in/hi/item/aditya-raicall-center-jo...,0


Unnamed: 0,URL,isPhishing
0,http://www.gtbusiness.com.au/wp-content/plugin...,1
1,http://localartgallery.ca/wp-content/remax/ind...,1
2,http://www.agrimilk.com.br/verificar/dados1/ca...,1
3,http://galdemchoice.com/css/secure.visa.dk/upd...,1
4,http://www.ltunes-updates.com/b01a5440d7ac077d...,1


# Combine and randomize the the data

In [5]:
data = pd.concat([benign_urls,phishing_urls], ignore_index = True)
data = data.sample(frac=1).reset_index(drop=True)
data

Unnamed: 0,URL,isPhishing
0,http://superuser.com/questions/630156/how-do-i...,0
1,http://olx.ua/uk/i2/disable/m/?ref%5B0%5D%5Baj...,0
2,http://hiroba.dqx.jp.su.qqibhd.com/account/app...,1
3,http://techcrunch.com/video/windows-10-will-be...,0
4,http://torcache.net/torrent/6BEC67D4279BDEF07B...,0
...,...,...
14995,https://archive.org/search.php?query=mediatype...,0
14996,http://appleid.apple.co.uk.cgi-bin.webobjects....,1
14997,http://www.czmdzx.cn/Img/index.htm?us.battle.n...,1
14998,http://85.125.204.59/jod-fcc/fcc-authenticatio...,1


# Generate Features from URLS

### URL Length

In [6]:
url_length = []

for i in range(data.shape[0]):
    url = data.iloc[i].URL
    url_length.append(len(url))

data['url_length'] = url_length
display (data.head(5))

Unnamed: 0,URL,isPhishing,url_length
0,http://superuser.com/questions/630156/how-do-i...,0,115
1,http://olx.ua/uk/i2/disable/m/?ref%5B0%5D%5Baj...,0,111
2,http://hiroba.dqx.jp.su.qqibhd.com/account/app...,1,61
3,http://techcrunch.com/video/windows-10-will-be...,0,92
4,http://torcache.net/torrent/6BEC67D4279BDEF07B...,0,140


### Get domain name from URL

In [7]:
domain_names = []
ctr = 0
benign_ctr = 0

for i in range(data.shape[0]):
    url = data.iloc[i].URL
    domain = urlparse(url).netloc
    if re.match(r"^www.",domain):
        domain = domain.replace("www.","")
        ctr += 1
        if data.iloc[i].isPhishing == 0:
            benign_ctr += 1
#             print (urlparse(url))
    domain_names.append(domain)
    
# print (ctr, benign_ctr)
data['domain_names'] = domain_names
display (data.head(5))

Unnamed: 0,URL,isPhishing,url_length,domain_names
0,http://superuser.com/questions/630156/how-do-i...,0,115,superuser.com
1,http://olx.ua/uk/i2/disable/m/?ref%5B0%5D%5Baj...,0,111,olx.ua
2,http://hiroba.dqx.jp.su.qqibhd.com/account/app...,1,61,hiroba.dqx.jp.su.qqibhd.com
3,http://techcrunch.com/video/windows-10-will-be...,0,92,techcrunch.com
4,http://torcache.net/torrent/6BEC67D4279BDEF07B...,0,140,torcache.net


### Number of @ symbols

In [8]:
count_of_at_symbols = []

for i in range(data.shape[0]):
    url = data.iloc[i].URL
    count_at = url.count('@')
    count_of_at_symbols.append(count_at)

data['count_of_at_symbols'] = count_of_at_symbols
display (data.head(5))

Unnamed: 0,URL,isPhishing,url_length,domain_names,count_of_at_symbols
0,http://superuser.com/questions/630156/how-do-i...,0,115,superuser.com,0
1,http://olx.ua/uk/i2/disable/m/?ref%5B0%5D%5Baj...,0,111,olx.ua,0
2,http://hiroba.dqx.jp.su.qqibhd.com/account/app...,1,61,hiroba.dqx.jp.su.qqibhd.com,0
3,http://techcrunch.com/video/windows-10-will-be...,0,92,techcrunch.com,0
4,http://torcache.net/torrent/6BEC67D4279BDEF07B...,0,140,torcache.net,0


### number of / symbols in path of url

In [9]:
count_of_fwd_slash_symbols = []

for i in range(data.shape[0]):
    url = data.iloc[i].URL
    
    splits = urlparse(url).path.split('/')
    depth = len(splits)
    count_of_fwd_slash_symbols.append(depth)

data['count_of_fwd_slash_symbols'] = count_of_fwd_slash_symbols
display (data.head(5))

Unnamed: 0,URL,isPhishing,url_length,domain_names,count_of_at_symbols,count_of_fwd_slash_symbols
0,http://superuser.com/questions/630156/how-do-i...,0,115,superuser.com,0,4
1,http://olx.ua/uk/i2/disable/m/?ref%5B0%5D%5Baj...,0,111,olx.ua,0,6
2,http://hiroba.dqx.jp.su.qqibhd.com/account/app...,1,61,hiroba.dqx.jp.su.qqibhd.com,0,5
3,http://techcrunch.com/video/windows-10-will-be...,0,92,techcrunch.com,0,5
4,http://torcache.net/torrent/6BEC67D4279BDEF07B...,0,140,torcache.net,0,3


In [10]:
# describe the list
print (
    min(count_of_fwd_slash_symbols), 
    max(count_of_fwd_slash_symbols), 
    sum(count_of_fwd_slash_symbols)/len(count_of_fwd_slash_symbols))

1 18 4.478666666666666
