# Phishing Website Detection: Feature Extraction

## 1.0 :- Phishing URLs

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the phishing URL list from CSV into a DataFrame and preview the first rows
dataset = pd.read_csv('2.online-valid.csv')
dataset.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,6557033,http://u1047531.cp.regruhosting.ru/acces-inges...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T22:01:43+00:00,yes,2020-05-09T22:03:07+00:00,yes,Other
1,6557032,http://hoysalacreations.com/wp-content/plugins...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T22:01:37+00:00,yes,2020-05-09T22:03:07+00:00,yes,Other
2,6557011,http://www.accsystemprblemhelp.site/checkpoint...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:54:31+00:00,yes,2020-05-09T21:55:38+00:00,yes,Facebook
3,6557010,http://www.accsystemprblemhelp.site/login_atte...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:53:48+00:00,yes,2020-05-09T21:54:34+00:00,yes,Facebook
4,6557009,https://firebasestorage.googleapis.com/v0/b/so...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-09T21:49:27+00:00,yes,2020-05-09T21:51:24+00:00,yes,Microsoft


In [None]:
# Total number of phishing URLs, as (rows, columns)
dataset.shape

(14858, 8)

In [None]:
# Draw a reproducible random sample of 5000 phishing URLs (fixed seed)
# and renumber the sampled rows from 0.
randomURL = (
    dataset
    .sample(n=5000, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Preview the first rows of the sampled phishing URLs
randomURL.head()

Unnamed: 0,phish_id,url,phish_detail_url,submission_time,verified,verification_time,online,target
0,6549743,https://iptf.ir/.well-known/acme-challenge/cha...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-06T08:00:53+00:00,yes,2020-05-06T08:02:24+00:00,yes,Other
1,6524799,https://lynshirt.com/wp-admin/PayPal/customer_...,http://www.phishtank.com/phish_detail.php?phis...,2020-04-23T19:00:26+00:00,yes,2020-04-23T19:01:57+00:00,yes,Other
2,6509811,https://hotdealsaz.com/Secure/inline.php,http://www.phishtank.com/phish_detail.php?phis...,2020-04-16T16:08:38+00:00,yes,2020-05-03T04:04:08+00:00,yes,PayPal
3,6546380,http://lz5.1ee.myftpupload.com/mvc/b105e5a192f...,http://www.phishtank.com/phish_detail.php?phis...,2020-05-04T16:06:41+00:00,yes,2020-05-04T16:51:52+00:00,yes,Other
4,4495683,http://claassistencia.com.br/wp-admin/includes...,http://www.phishtank.com/phish_detail.php?phis...,2016-09-28T17:14:00+00:00,yes,2016-09-28T22:16:18+00:00,yes,Other


## 2.0 :-Legitimate Source URLs

In [None]:
# Load the legitimate (benign) URL list and name its single column 'URLs'.
# NOTE(review): read_csv treats the first line of the file as a header by
# default; if this CSV has no header row, the first URL is consumed as the
# column name and dropped from the data -- confirm against the file.
dataset1 = pd.read_csv('1.Benign_list_big_final.csv')
dataset1.columns = ['URLs']
dataset1.head()

Unnamed: 0,URLs
0,http://1337x.to/torrent/1110018/Blackhat-2015-...
1,http://1337x.to/torrent/1122940/Blackhat-2015-...
2,http://1337x.to/torrent/1124395/Fast-and-Furio...
3,http://1337x.to/torrent/1145504/Avengers-Age-o...
4,http://1337x.to/torrent/1160078/Avengers-age-o...


In [None]:
# Total number of legitimate URLs, as (rows, columns)
dataset1.shape

(35377, 1)

In [None]:
# Draw a reproducible random sample of 5000 legitimate URLs (fixed seed),
# renumber the sampled rows from 0, and preview the result.
randomURL1 = (
    dataset1
    .sample(n=5000, random_state=42)
    .reset_index(drop=True)
)
randomURL1.head()

Unnamed: 0,URLs
0,http://correios.com.br/Para-governo/tribunais-...
1,http://caixa.gov.br/voce/habitacao/financiamen...
2,http://olx.ua/uk/list/q-%D0%BF%D0%BB%D0%B0%D1%...
3,http://emgn.com/entertainment/10-films-that-en...
4,http://metro.co.uk/2015/04/11/one-direction-re...


## 3.0 :-Feature Extraction

It is categorized into
1. Address Bar Based Features
2. HTML & JS Based Features

#### Address Bar Based Features

We have
1. Domain of URL
2. IP Address in URL
3. "@" Symbol in URL
4. Length of URL
5. Depth of URL
6. Redirection "//" in URL
7. "http/https" in Domain name
8. Using URL Shortening Services “TinyURL”
9. Prefix or Suffix "-" in Domain

In [None]:
from urllib.parse import urlparse,urlencode
import ipaddress
import re

In [None]:
# Domain of URL
def getDomain(url):
    """Return the network location of *url* without a leading ``www.``.

    Args:
        url: URL string to parse.

    Returns:
        The ``netloc`` component of the URL with one leading ``"www."``
        removed. Returns an empty string when the URL has no netloc.
    """
    domain = urlparse(url).netloc
    # Strip only a literal "www." prefix. A regex with an unescaped dot plus
    # str.replace would also match "wwwX" and delete every later "www."
    # occurrence (e.g. "www.mywww.com" -> "mycom").
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain

In [None]:
# IP Address in URL
def hasIP(url):
    """Report whether the host part of *url* is a literal IP address.

    Args:
        url: URL string (or a bare host string) to inspect.

    Returns:
        1 if the host is a valid IPv4/IPv6 address (treated as phishing),
        0 otherwise (treated as legitimate).
    """
    # ipaddress.ip_address() only accepts a bare address, so it must be given
    # the hostname; passing the whole URL ("http://1.2.3.4/x") always fails.
    try:
        host = urlparse(url).hostname
    except ValueError:
        # urlparse rejects malformed bracketed hosts; fall back to raw input.
        host = None
    if not host:
        # No scheme/netloc present: keep accepting a bare address string.
        host = url
    try:
        ipaddress.ip_address(host)
    except ValueError:
        return 0
    return 1

In [None]:
def hasSym(url):
    """Flag URLs that contain an '@' character.

    Returns 1 when '@' occurs anywhere in *url* (treated as phishing) and
    0 when it does not (treated as legitimate).
    """
    return int("@" in url)


In [None]:
# Length of URL
def getLen(url):
    """Classify *url* by its length.

    Returns 0 (legitimate) for URLs shorter than 54 characters and
    1 (phishing) for URLs of 54 characters or more, since long URLs can
    hide the suspicious part.
    """
    return 0 if len(url) < 54 else 1

In [None]:
# Depth of URL (To get the no of '/')
def getDepth(url):
    """Count the non-empty '/'-separated segments in the path of *url*."""
    segments = urlparse(url).path.split('/')
    return sum(1 for segment in segments if segment)

In [None]:
# Redirection in URL: a "//" whose last occurrence starts after index 7,
# i.e. beyond the "//" that follows the scheme
def redirect(url):
    """Return 1 if the last '//' in *url* starts at an index above 7, else 0."""
    if url.rfind('//') > 7:
        return 1
    return 0

In [None]:
# HTTP/HTTPs in URL
def checkHTTP(url):
    """Flag URLs whose network location contains the text 'https'.

    Returns 1 (phishing) when 'https' appears inside the netloc part of
    *url*, and 0 (legitimate) otherwise. The scheme itself is not part of
    the netloc, so a plain "https://..." URL is not flagged.
    """
    return int('https' in urlparse(url).netloc)

In [None]:
# URL Shortening
# Hostnames of known URL shortening services.
shortening_services = [
    "bit.ly", "tinyurl.com", "goo.gl", "ow.ly", "t.co", "is.gd", "buff.ly",
    "adf.ly", "bit.do", "shorte.st", "clk.sh", "x.co", "tr.im", "goo.su"
]

def urlShortened(url):
    """Report whether *url* points at a known URL shortening service.

    Args:
        url: URL string to inspect.

    Returns:
        1 if the host is listed in ``shortening_services`` (phishing),
        0 otherwise (legitimate).
    """
    # Compare the bare hostname, not the raw netloc: ``hostname`` is
    # lower-cased and excludes any port or userinfo, so "bit.ly:80" and
    # "user@bit.ly" are still recognised.
    host = urlparse(url).hostname or ""
    # A leading "www." does not change which service is being used.
    if host.startswith("www."):
        host = host[len("www."):]
    return 1 if host in shortening_services else 0

In [None]:
# "-" in Domain
def checkPrefSuff(url):
    """Flag URLs whose network location contains a '-' character.

    Returns 1 (phishing) when the netloc part of *url* contains '-', and
    0 (legitimate) otherwise.
    """
    return int('-' in urlparse(url).netloc)

#### HTML and JS Based Features

We have
1. IFrame Redirection
2. Status Bar Customization
3. Disabling Right Click
4. Website Forwarding

In [None]:
import requests

In [None]:
# IFrame Redirection
def iframe(response):
    """Score the page for iframe markup.

    Args:
        response: The fetched page (an object with a ``text`` attribute),
            or the empty string ``""`` when the page could not be fetched.

    Returns:
        1 (phishing) when there is no response or no iframe markup is
        found in the page text; 0 (legitimate) when an ``<iframe`` tag or a
        ``frameBorder`` attribute is found.
    """
    if response == "":
        return 1
    # The previous pattern "[|]" only matched a literal pipe character, so
    # the result depended on unrelated "|" characters in the page rather
    # than on iframe markup.
    # NOTE(review): "found -> 0" follows the rule stated for this feature in
    # the notebook; confirm that is the intended polarity.
    if re.search(r"<iframe\b|frameborder", response.text, re.IGNORECASE):
        return 0
    return 1

In [None]:
# Status Bar Customization
def mouseOver(response):
    """Score the page for ``onmouseover`` handlers.

    Args:
        response: The fetched page (an object with a ``text`` attribute),
            or the empty string ``""`` when the page could not be fetched.

    Returns:
        1 (phishing) when there is no response or the page text contains
        an ``onmouseover`` handler; 0 (legitimate) otherwise.
    """
    if response == "":
        return 1
    # The previous pattern was the empty string, which matches any text, so
    # every fetched page scored 1.
    # NOTE(review): matching any "onmouseover" is an assumption about the
    # intended rule -- confirm whether it should be limited to handlers that
    # modify the status bar.
    if re.search(r"onmouseover", response.text, re.IGNORECASE):
        return 1
    return 0

In [None]:
# Disable Right Click
def rightClick(response):
    """Score the page for script that intercepts the right mouse button.

    Args:
        response: The fetched page (an object with a ``text`` attribute),
            or the empty string ``""`` when the page could not be fetched.

    Returns:
        1 (phishing) when there is no response or the page text contains an
        ``event.button == 2`` check; 0 (legitimate) otherwise.
    """
    if response == "":
        return 1
    # A page that tests for the right button is the suspicious case, so a
    # match must score 1; the previous code returned 0 on a match. The dot is
    # escaped so that only a literal "event.button" matches.
    if re.search(r"event\.button ?== ?2", response.text):
        return 1
    return 0

In [None]:
# Website Forwarding
# Pages reached through at most two redirects are treated as legitimate
def forwarding(response):
    """Score the page by the number of redirects followed to reach it.

    Returns 1 (phishing) when *response* is the empty string (no page was
    fetched) or when ``response.history`` holds more than two entries;
    returns 0 (legitimate) otherwise.
    """
    if response == "":
        return 1
    return 0 if len(response.history) <= 2 else 1

## 4.0 :-Making URL Features

In [None]:
def featureExtraction(url, label, timeout=10):
    """Build the feature row for a single URL.

    The row holds, in order: the domain, the eight address-bar features
    (IP address, '@' symbol, length, depth, redirection, 'https' in domain,
    shortening service, '-' in domain), the four HTML/JS features (iframe,
    mouse-over, right-click, forwarding) and finally ``label``.

    Args:
        url: URL string to analyse.
        label: Class label, appended unchanged as the last element.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A list of 14 values in the order described above.
    """
    # Address bar based features
    features = [
        getDomain(url),
        hasIP(url),
        hasSym(url),
        getLen(url),
        getDepth(url),
        redirect(url),
        checkHTTP(url),
        urlShortened(url),
        checkPrefSuff(url),
    ]

    # HTML and JS based features. A failed fetch is represented by "", which
    # the feature functions below treat as "no response".
    try:
        # Without a timeout a single unresponsive host can block the whole
        # extraction run indefinitely.
        response = requests.get(url, timeout=timeout)
    except Exception:
        # Best effort: any fetch failure falls back to "no response".
        # Catching Exception (not a bare except) lets KeyboardInterrupt
        # through, so a long run can still be stopped by the user.
        response = ""
    features.extend([
        iframe(response),
        mouseOver(response),
        rightClick(response),
        forwarding(response),
    ])
    features.append(label)

    return features

#### 4.1 Legitimate URLs

In [None]:
# Size of the legitimate URL sample, as (rows, columns)
randomURL1.shape

(5000, 1)

In [None]:
# Extract a feature row for each of the first 500 sampled legitimate URLs
legi_features = []
label = 0  # 0 marks a legitimate URL

for i in range(500):
    url = randomURL1.loc[i, 'URLs']
    legi_features.append(featureExtraction(url, label))

In [None]:
# Turn the extracted rows into a DataFrame; the column names follow the
# order in which featureExtraction appends its values.
feature_names = [
    'Domain',
    'Have_IP',
    'Have_At',
    'URL_Length',
    'URL_Depth',
    'Redirection',
    'https_Domain',
    'TinyURL',
    'Prefix/Suffix',
    'iFrame',
    'Mouse_Over',
    'Right_Click',
    'Web_Forwards',
    'Label',
]

legitimate = pd.DataFrame(data=legi_features, columns=feature_names)
legitimate.head()

In [None]:
# Save the legitimate-URL features to 'legitimate.csv' without the index column
legitimate.to_csv('legitimate.csv', index= False)

#### 4.2 Phishing URLs:

In [None]:
# Size of the phishing URL sample, as (rows, columns)
randomURL.shape

In [None]:
# Extract a feature row for each of the first 500 sampled phishing URLs
phish_features = []
label = 1  # 1 marks a phishing URL

for i in range(500):
    url = randomURL.loc[i, 'url']
    phish_features.append(featureExtraction(url, label))

In [None]:
# Turn the extracted rows into a DataFrame; the column names follow the
# order in which featureExtraction appends its values.
feature_names = [
    'Domain',
    'Have_IP',
    'Have_At',
    'URL_Length',
    'URL_Depth',
    'Redirection',
    'https_Domain',
    'TinyURL',
    'Prefix/Suffix',
    'iFrame',
    'Mouse_Over',
    'Right_Click',
    'Web_Forwards',
    'Label',
]

phishing = pd.DataFrame(data=phish_features, columns=feature_names)
phishing.head()

In [None]:
# Save the phishing-URL features to 'phishing.csv' without the index column
phishing.to_csv('phishing.csv', index= False)

## 5.0 Final Dataset

In [None]:
# Stack the legitimate rows on top of the phishing rows and renumber the
# combined rows from 0, then preview the result.
finalData = pd.concat([legitimate, phishing], ignore_index=True)
finalData.head()

In [None]:
# Size of the combined dataset, as (rows, columns)
finalData.shape

In [None]:
# Save the combined feature dataset to 'urldata.csv' without the index column
finalData.to_csv('urldata.csv',index=False)