In [1]:
# ! pip install bs4
# ! pip install beautifulsoup4
# ! pip install lxml
# ! pip install python-whois
# ! pip install googlesearch-python
# ! pip install pandas

In [2]:
#importing required packages for this module
import pandas as pd

In [3]:
#loading the phishing URLs data to dataframe
phishurl = pd.read_csv("online-valid.csv")
phishurl.head()

Unnamed: 0,url
0,http://u1047531.cp.regruhosting.ru/acces-inges...
1,http://hoysalacreations.com/wp-content/plugins...
2,http://www.accsystemprblemhelp.site/checkpoint...
3,http://www.accsystemprblemhelp.site/login_atte...
4,https://firebasestorage.googleapis.com/v0/b/so...


In [4]:
phishurl.shape

(5000, 1)

In [5]:
#Loading legitimate files 
legiurl = pd.read_csv("Benign_list_big_final.csv")
legiurl.columns = ['URLs']
legiurl.head()

Unnamed: 0,URLs
0,http://1337x.to/torrent/1048648/American-Snipe...
1,http://1337x.to/torrent/1110018/Blackhat-2015-...
2,http://1337x.to/torrent/1122940/Blackhat-2015-...
3,http://1337x.to/torrent/1124395/Fast-and-Furio...
4,http://1337x.to/torrent/1145504/Avengers-Age-o...


In [6]:
legiurl.shape

(5000, 1)

Legit -> 0
Phish -> 1

In [7]:
# importing required packages for this section
from urllib.parse import urlparse,urlencode
import ipaddress
import re

In [8]:
# 1.Domain of the URL (Domain) 
def getDomain(url):  
  domain = urlparse(url).netloc
  if re.match(r"^www.",domain):
    domain = domain.replace("www.","")
  return domain

In [9]:
# 2.Checks for IP address in URL (Have_IP)
def havingIP(url):
  try:
    ipaddress.ip_address(url)
    ip = 1
  except:
    ip = 0
  return ip


In [10]:
# 3.Checks the presence of @ in URL (Have_At)
def haveAtSign(url):
  if "@" in url:
    at = 1    
  else:
    at = 0    
  return at

In [11]:
# 4.Finding the length of URL and categorizing (URL_Length)
def getLength(url):
  if len(url) < 54:
    length = 0            
  else:
    length = 1            
  return length

In [12]:
# 5.Gives number of '/' in URL (URL_Depth)
def getDepth(url):
  s = urlparse(url).path.split('/')
  depth = 0
  for j in range(len(s)):
    if len(s[j]) != 0:
      depth = depth+1
  return depth

In [13]:
# 6.Checking for redirection '//' in the url (Redirection)
def redirection(url):
  pos = url.rfind('//')
  if pos > 6:
    if pos > 7:
      return 1
    else:
      return 0
  else:
    return 0

In [14]:
# 7.Existence of “HTTPS” Token in the Domain Part of the URL (https_Domain)
def httpDomain(url):
  domain = urlparse(url).netloc
  if 'https' in domain:
    return 1
  else:
    return 0

In [15]:
#listing shortening services
shortening_services = r"bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|" \
                      r"yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|" \
                      r"short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|" \
                      r"doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|db\.tt|" \
                      r"qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|q\.gs|is\.gd|" \
                      r"po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|x\.co|" \
                      r"prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|" \
                      r"tr\.im|link\.zip\.net"

In [16]:
# 8. Checking for Shortening Services in URL (Tiny_URL)
def tinyURL(url):
    match=re.search(shortening_services,url)
    if match:
        return 1
    else:
        return 0

In [17]:
# 9.Checking for Prefix or Suffix Separated by (-) in the Domain (Prefix/Suffix)
def prefixSuffix(url):
    if '-' in urlparse(url).netloc:
        return 1            # phishing
    else:
        return 0            # legitimate

In [18]:
# importing required packages for this section
import re
from bs4 import BeautifulSoup
import whois
import urllib
import urllib.request
from datetime import datetime

In [19]:
# 11.DNS Record availability (DNS_Record)
# obtained in the featureExtraction function itself

In [20]:
# 12.Web traffic (Web_Traffic)
def web_traffic(url):
  try:
    #Filling the whitespaces in the URL if any
    url = urllib.parse.quote(url)
    rank = BeautifulSoup(urllib.request.urlopen("http://data.alexa.com/data?cli=10&dat=s&url=" + url).read(), "xml").find(
        "REACH")['RANK']
    rank = int(rank)
  except :
        return 1
  if rank <100000:
    return 1
  else:
    return 0

In [21]:
# 13.Survival time of domain: The difference between termination time and creation time (Domain_Age)  
def domainAge(domain_name):
  creation_date = domain_name.creation_date
  expiration_date = domain_name.expiration_date
  if (isinstance(creation_date,str) or isinstance(expiration_date,str)):
    try:
      creation_date = datetime.strptime(creation_date,'%Y-%m-%d')
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if ((expiration_date is None) or (creation_date is None)):
      return 1
  elif ((type(expiration_date) is list) or (type(creation_date) is list)):
      return 1
  else:
    ageofdomain = abs((expiration_date - creation_date).days)
    if ((ageofdomain/30) < 6):
      age = 1
    else:
      age = 0
  return age

In [22]:
# 14.End time of domain: The difference between termination time and current time (Domain_End) 
def domainEnd(domain_name):
  expiration_date = domain_name.expiration_date
  if isinstance(expiration_date,str):
    try:
      expiration_date = datetime.strptime(expiration_date,"%Y-%m-%d")
    except:
      return 1
  if (expiration_date is None):
      return 1
  elif (type(expiration_date) is list):
      return 1
  else:
    today = datetime.now()
    end = abs((expiration_date - today).days)
    if ((end/30) < 6):
      end = 0
    else:
      end = 1
  return end

In [23]:
# importing required packages for this section
import requests

In [24]:
# 15. IFrame Redirection (iFrame)
def iframe(response):
  if response == "":
      return 1
  else:
      if re.findall(r"[<iframe>|<frameBorder>]", response.text):
          return 0
      else:
          return 1

In [25]:
# 16.Checks the effect of mouse over on status bar (Mouse_Over)
def mouseOver(response): 
  if response == "" :
    return 1
  else:
    if re.findall("<script>.+onmouseover.+</script>", response.text):
      return 1
    else:
      return 0

In [26]:
# 17.Checks the status of the right click attribute (Right_Click)
def rightClick(response):
  if response == "":
    return 1
  else:
    if re.findall(r"event.button ?== ?2", response.text):
      return 0
    else:
      return 1

In [27]:
# 18.Checks the number of forwardings (Web_Forwards)    
def forwarding(response):
  if response == "":
    return 1
  else:
    if len(response.history) <= 2:
      return 0
    else:
      return 1

In [28]:
# ! pip install googlesearch-python
# ! pip install google
# ! pip install google-search
# ! pip install google-cloud

In [29]:
from googlesearch import search
def google_index(url):
    site = search(url, 5)
    return 1 if site else 0

In [30]:
def count_dot(url):
    count_dot = url.count('.')
    return count_dot

def count_www(url):
    url.count('www')
    return url.count('www')

def count_per(url):
    return url.count('%')

def count_ques(url):
    return url.count('?')

def count_hyphen(url):
    return url.count('-')

def count_equal(url):
    return url.count('=')

In [31]:
#Function to extract features
def featureExtraction(url,label):

  features = []
  #Address bar based features (10)
  features.append(getDomain(url))
  features.append(havingIP(url))
  features.append(haveAtSign(url))
  features.append(getLength(url))
  features.append(getDepth(url))
  features.append(redirection(url))
  features.append(httpDomain(url))
  features.append(tinyURL(url))
  features.append(prefixSuffix(url))
  
  #Domain based features (4)
  dns = 0
  try:
    domain_name = whois.whois(urlparse(url).netloc)
  except:
    dns = 1

  features.append(dns)
  features.append(web_traffic(url))
  features.append(1 if dns == 1 else domainAge(domain_name))
  features.append(1 if dns == 1 else domainEnd(domain_name))
  
  # HTML & Javascript based features (4)
  try:
    response = requests.get(url)
  except:
    response = ""
  features.append(iframe(response))
  features.append(mouseOver(response))
  features.append(rightClick(response))
  features.append(forwarding(response))
  features.append(google_index(url))
  
  features.append(count_dot(url))
  features.append(count_www(url))
  features.append(count_per(url))
  features.append(count_ques(url))
  features.append(count_hyphen(url))
  features.append(count_equal(url))

  features.append(label)
  
  return features

In [32]:
legiurl.shape

(5000, 1)

In [33]:
#Extracting the feautres & storing them in a list
legi_features = []
label = 0
for i in range(0, 5000):
  url = legiurl['URLs'][i]
  legi_features.append(featureExtraction(url,label))

Error trying to connect to socket: closing socket - [WinError 10054] An existing connection was forcibly closed by the remote host
Error trying to connect to socket: closing socket - [WinError 10054] An existing connection was forcibly closed by the remote host


In [34]:
#converting the list to dataframe
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                 'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                 'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards','Google_Index',
                 'count_dot','count_www', 'count_per','count_ques','count_hyphen','count_equal', 'Label']

legitimate = pd.DataFrame(legi_features, columns= feature_names)
legitimate.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,...,Right_Click,Web_Forwards,Google_Index,count_dot,count_www,count_per,count_ques,count_hyphen,count_equal,Label
0,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,8,0,0
1,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,9,0,0
2,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,9,0,0
3,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,11,0,0
4,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,9,0,0


In [35]:
# Storing the extracted legitimate URLs fatures to csv file
legitimate.to_csv('urldata_legit.csv', index= False)

In [36]:
phishurl.shape

(5000, 1)

In [37]:
#Extracting the feautres & storing them in a list
phish_features = []
label = 1
for i in range(0, 5000):
  url = phishurl['url'][i]
  phish_features.append(featureExtraction(url,label))

Error trying to connect to socket: closing socket - timed out
Error trying to connect to socket: closing socket - timed out


In [38]:
#converting the list to dataframe
feature_names = ['Domain', 'Have_IP', 'Have_At', 'URL_Length', 'URL_Depth','Redirection', 
                      'https_Domain', 'TinyURL', 'Prefix/Suffix', 'DNS_Record', 'Web_Traffic', 
                      'Domain_Age', 'Domain_End', 'iFrame', 'Mouse_Over','Right_Click', 'Web_Forwards','Google_Index',                 
                 'count_dot','count_www', 'count_per','count_ques','count_hyphen','count_equal', 'Label']

phishing = pd.DataFrame(phish_features, columns= feature_names)
phishing.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,...,Right_Click,Web_Forwards,Google_Index,count_dot,count_www,count_per,count_ques,count_hyphen,count_equal,Label
0,u1047531.cp.regruhosting.ru,0,0,1,2,0,0,0,0,0,...,1,1,1,3,0,0,0,3,0,1
1,hoysalacreations.com,0,0,1,6,0,0,0,0,0,...,1,0,1,1,0,0,0,4,0,1
2,accsystemprblemhelp.site,0,0,0,1,0,0,0,0,0,...,1,1,1,3,1,0,0,0,0,1
3,accsystemprblemhelp.site,0,0,1,1,0,0,0,0,0,...,1,1,1,3,1,0,0,0,2,1
4,firebasestorage.googleapis.com,0,0,1,5,0,0,1,0,1,...,1,0,1,5,0,0,1,5,2,1


In [39]:
# Storing the extracted legitimate URLs fatures to csv file
phishing.to_csv('urldata_phish.csv', index= False)

In [40]:
#Concatenating the dataframes into one 
urldata = pd.concat([legitimate, phishing]).reset_index(drop=True)
urldata.head()

Unnamed: 0,Domain,Have_IP,Have_At,URL_Length,URL_Depth,Redirection,https_Domain,TinyURL,Prefix/Suffix,DNS_Record,...,Right_Click,Web_Forwards,Google_Index,count_dot,count_www,count_per,count_ques,count_hyphen,count_equal,Label
0,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,8,0,0
1,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,9,0,0
2,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,9,0,0
3,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,11,0,0
4,1337x.to,0,0,1,3,0,0,0,0,0,...,1,0,1,1,0,0,0,9,0,0


In [41]:
# Storing the data in CSV file
urldata.to_csv('urldata.csv', index=False)