**Data Pre-processing**

In [77]:
import pandas as pd 
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import re

In [78]:
def getDomainName(s):
  temp = s.split("/")[2]
  return temp
def getFeatures(url) :
  featureList = []
  domainName = getDomainName(url)
  all_digits = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']
  # 1) Length of URL
  featureList.append(len(url))
  # 2) Length of Domain Name
  featureList.append(len(domainName))
  # 3) ip
  pattern1 = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')
  if pattern1.search(url) != None:
    featureList.append(1)
  else:
    featureList.append(0)
  # 4) nb_dots
  featureList.append(len(domainName.split(".")) - 1)
  # 5) nb_hyphens
  featureList.append(len(domainName.split("-")) - 1)
  # 6) nb_at
  featureList.append(len(url.split("@")) - 1)
  # 7) nb_slash
  featureList.append(len(url.split("/")) - 1)
  # 8) nb_www
  featureList.append(len(url.split("www")) - 1)
  # 9) nb_dslash
  count = len(url.split("//")) - 2
  if count > 0:
    featureList.append(1)
  else:
    featureList.append(0)
  # 10) http_in_path
  featureList.append(len(url.split("http")) - 2)
  # 11) https_token
  scheme = url.split("/")[0]
  if scheme == "http:":
    featureList.append(1)
  else:
    featureList.append(0)
  # 12) ratio_digits_url
  total_digits_url = 0
  for s in url:
    if s in all_digits:
        total_digits_url += 1
  featureList.append(round(total_digits_url / len(url), 9))
  # 13) ratio_digits_host
  total_digits_domain = 0
  for s in domainName:
    if s in all_digits:
        total_digits_domain += 1
  featureList.append(round(total_digits_domain / len(domainName), 9))
  return featureList

In [79]:
df = pd.read_csv('FinalPhishing.csv')

In [80]:
tempdf = df[["length_url", 'length_hostname', 'ip', 'nb_dots', 'nb_hyphens', 'nb_at', 'nb_slash', 'nb_www', 'nb_dslash', 
             'http_in_path', 'https_token', 'ratio_digits_url', 'ratio_digits_host',# 'nb_hyperlinks', 'page_rank', 
             'status']]

# Model Training

In [81]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_recall_fscore_support

In [82]:
Y = tempdf['status']
tempdf.drop('status', inplace = True, axis = 1)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [83]:
Y

0        0
1        1
2        1
3        0
4        0
        ..
11425    0
11426    1
11427    0
11428    0
11429    1
Name: status, Length: 11430, dtype: int64

In [84]:
X_train, X_test, y_train, y_test = train_test_split(tempdf, Y, test_size = 0.2, random_state = 42)

In [85]:
clf = RandomForestClassifier(n_estimators = 2000, bootstrap = True, random_state = 42).fit(X_train, y_train)
predictions = clf.predict(X_test)
accuracy_score(y_test, predictions)

0.8517060367454068

In [86]:
precision_recall_fscore_support(y_test, predictions)

(array([0.84254606, 0.86172161]),
 array([0.86949006, 0.83348096]),
 array([0.85580604, 0.84736605]),
 array([1157, 1129], dtype=int64))

In [97]:
import pickle
filename = 'finalized_model.sav'
pickle.dump(clf, open(filename, 'wb'))

In [98]:
loaded_model = pickle.load(open(filename, 'rb'))

## Predicting for a given URL


In [99]:
url = """90% daily data quota used as on 08-Dec-21 14:02.
Jio Number : 8696810062 मेरा अगला रिचार्ज कब है? क्लिक करे और जाने wa.me/919654297000?text=Hi
For tips on how to manage data quota effectively, click https://youtu.be/ZFUDydctV78
Time to get expert care from the safety & comfort of your home! Consult Apollo Doctors online today at Apollo24|7. klr.pw/NKwxv/vPORQj
If you do not have any other data plan, your internet speed will be reduced on consumption of 1.50 GB.
To continue enjoying high speed internet, click https://www.jio.com/dl/data_voucher and recharge.
Dial 1991, to know your current balance, validity, plan details and for exciting recharge plans."""

In [100]:
URL_REGEX = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]?\([^\s()]+\)[^\s()]?\)|\([^\s]+?\))+(?:\([^\s()]?\([^\s()]+\)[^\s()]?\)|\([^\s]+?\)|[^\s`!()\[\]{};:\'\".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""

In [101]:
import re
urls_list = re.findall(URL_REGEX,"some text " + url + " more text")
urls_list

['wa.me/919654297000?text=Hi',
 'https://youtu.be/ZFUDydctV78',
 'klr.pw/NKwxv/vPORQj',
 'https://www.jio.com/dl/data_voucher']

In [110]:
url = urls_list[3]

In [111]:
testURL =  np.array(getFeatures(url))

In [112]:
testURL

array([35., 11.,  0.,  2.,  0.,  0.,  4.,  1.,  0.,  0.,  0.,  0.,  0.])

In [113]:
loaded_model.predict(testURL.reshape(1,-1))

array([0], dtype=int64)

Future Scope

Although the use of URL lexical features alone has been shown to result in high accuracy (97%), phishers have learned how to make predicting a URL destination difficult by carefully manipulating the URL to evade detection. Therefore, combining these features with others, such as host, is the most effective approach .

For future enhancements, we intend to build the phishing detection system as a scalable web service which will incorporate online learning so that new phishing attack patterns can easily be learned and improve the accuracy of our models with better feature extraction.


In [94]:
X_test
# tempList = []
# for row in df.index:
#   if df["status"][row] == "phishing":
#     tempList.append(1)
#   else:
#     tempList.append(0)
# df["status"] = tempList

Unnamed: 0,length_url,length_hostname,ip,nb_dots,nb_hyphens,nb_at,nb_slash,nb_www,nb_dslash,http_in_path,https_token,ratio_digits_url,ratio_digits_host
7529,35,11,0,2,0,0,4,1,0,0,0,0.000000,0.000000
11221,22,15,0,2,0,0,2,0,0,0,1,0.000000,0.000000
4889,43,15,0,2,0,0,3,1,0,0,0,0.000000,0.000000
8962,26,18,0,2,0,0,3,1,0,0,1,0.153846,0.222222
4004,62,13,0,1,0,0,6,0,0,0,1,0.016129,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10243,128,50,0,4,1,0,5,0,0,0,0,0.140625,0.000000
6013,48,39,0,2,1,0,3,1,0,0,0,0.000000,0.000000
1224,33,14,0,2,0,0,3,0,0,0,0,0.090909,0.071429
5157,67,15,0,2,0,0,3,1,0,0,1,0.074627,0.000000
