# Phishing Detector

### Install Dependencies

In [None]:
!pip install numpy
!pip install pandas
!pip install scikit-learn
!pip install tld

In [154]:
import numpy as np
import pandas as pd
import pickle

### Read DataFrame

In [155]:
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0,url,type
0,br-icloud.com.br,phishing
1,mp3raid.com/music/krizz_kaliko.html,benign
2,bopsecrets.org/rexroth/cr/1.htm,benign
3,http://www.garage-pirenne.be/index.php?option=...,defacement
4,http://adventure-nicaragua.net/index.php?optio...,defacement


## Features

In [3]:
from tld import get_tld
from urllib.parse import urlparse
import re

def abnormal_url(url):
    hostname = str(urlparse(url).hostname)
    match = re.search(hostname, url)
    return 0 if match else 1

def count_digit(url):
    digits = 0
    for i in url:
        if i.isnumeric(): digits = digits + 1
    return digits

def count_letter(url):
    letters = 0
    for i in url:
        if i.isalpha(): letters = letters + 1
    return letters

def is_https(url):
    return int(url.startswith("https://"))

def is_http(url):
    return int(url.startswith("http://"))

def count_dot(url):
    return url.count(".")

def count_www(url):
    return url.count("www")

def count_at(url):
    return url.count("@")

def count_dir(url):
    return urlparse(url).path.count("/")

def count_embed(url):
    return urlparse(url).path.count("//")

def count_percent(url):
    return url.count("%")

def count_ques(url):
    return url.count("?")

def count_dash(url):
    return url.count("-")

def count_equal(url):
    return url.count("=")

def url_length(url):
    return len(str(url))

def hostname_length(url):
    return len(urlparse(url).netloc)

def suspicious_words(url):
    match = re.search("PayPal|login|signin|bank|account|update|free|lucky|service|bonus|ebayisapi|webscr", url)
    return 1 if match else 0

# First Directory Length
def fd_length(url):
    urlpath = urlparse(url).path
    paths = urlpath.split("/")
    return len(paths[1]) if len(paths) > 1 else 0

def tld_length(url):
    domain = get_tld(url, fail_silently=True)
    return len(str(domain)) if domain else 0

Set Features.

In [157]:
df["abnormal_url"] = df["url"].apply(lambda x: abnormal_url(x))
df["count_digit"] = df["url"].apply(lambda x: count_digit(x))
df["count_letter"] = df["url"].apply(lambda x: count_letter(x))
df["is_https"] = df["url"].apply(lambda x: is_https(x))
df["is_http"] = df["url"].apply(lambda x: is_http(x))
df["count_dot"] = df["url"].apply(lambda x: count_dot(x))
df["count_www"] = df["url"].apply(lambda x: count_www(x))
df["count_at"] = df["url"].apply(lambda x: count_at(x))
df["count_dir"] = df["url"].apply(lambda x: count_dir(x))
df["count_embed"] = df["url"].apply(lambda x: count_embed(x))
df["count_percent"] = df["url"].apply(lambda x: count_percent(x))
df["count_ques"] = df["url"].apply(lambda x: count_ques(x))
df["count_dash"] = df["url"].apply(lambda x: count_dash(x))
df["count_equal"] = df["url"].apply(lambda x: count_equal(x))
df["url_length"] = df["url"].apply(lambda x: url_length(x))
df["hostname_length"] = df["url"].apply(lambda x: hostname_length(x))
df["suspicious_words"] = df["url"].apply(lambda x: suspicious_words(x))
df["fd_length"] = df["url"].apply(lambda x: fd_length(x))
df["tld_length"] = df["url"].apply(lambda x: tld_length(x))

df.head()

Unnamed: 0,url,type,abnormal_url,count_digit,count_letter,is_https,is_http,count_dot,count_www,count_at,...,count_embed,count_percent,count_ques,count_dash,count_equal,url_length,hostname_length,suspicious_words,fd_length,tld_length
0,br-icloud.com.br,phishing,1,0,13,0,0,2,0,0,...,0,0,0,1,0,16,0,0,0,0
1,mp3raid.com/music/krizz_kaliko.html,benign,1,1,29,0,0,2,0,0,...,0,0,0,0,0,35,0,0,5,0
2,bopsecrets.org/rexroth/cr/1.htm,benign,1,1,25,0,0,2,0,0,...,0,0,0,0,0,31,0,0,7,0
3,http://www.garage-pirenne.be/index.php?option=...,defacement,0,7,63,0,1,3,1,0,...,0,0,1,1,4,88,21,0,9,2
4,http://adventure-nicaragua.net/index.php?optio...,defacement,0,22,199,0,1,2,0,0,...,0,0,1,1,3,235,23,0,9,3


### Set Target Label

In [158]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["type_label"] = le.fit_transform(df["type"])
df["type_label"].value_counts()

type_label
0    428103
1     96457
3     94111
2     32520
Name: count, dtype: int64

## Train the Model

Select 80% of data to train the model, 20% to test the accuracy.

In [159]:
from sklearn.model_selection import train_test_split

X = df[["abnormal_url", "count_digit", "count_letter", "is_https", "is_http", "count_dot", "count_www",
        "count_at", "count_dir", "count_embed", "count_percent", "count_ques", "count_dash", "count_equal",
        "url_length", "hostname_length", "suspicious_words", "fd_length", "tld_length"]].values
y = df["type_label"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, shuffle=True, random_state=5)

Random Forest Cassification.

In [160]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=100, max_features="sqrt")
clf.fit(X_train, y_train)
preds = clf.predict(X_test)

Metrics.

In [161]:
from sklearn.metrics import classification_report, accuracy_score

print(classification_report(y_test, preds, target_names=["benign", "defacement", "phishing", "malware"]))
print("accuracy:   %0.3f" % accuracy_score(y_test, preds))

              precision    recall  f1-score   support

      benign       0.97      0.98      0.98     85621
  defacement       0.98      0.99      0.99     19292
    phishing       0.99      0.94      0.96      6504
     malware       0.91      0.86      0.88     18822

    accuracy                           0.97    130239
   macro avg       0.96      0.95      0.95    130239
weighted avg       0.97      0.97      0.97    130239

accuracy:   0.966


Write model to pickle file.

In [162]:
pickle.dump(clf, open("model", "wb"))

### Predict URL Type

Read the model from the pickle file and predict.

In [163]:
def get_features(url):
    values = []
    for f in [abnormal_url, count_digit, count_letter, is_https, is_http, count_dot, count_www,
              count_at, count_dir, count_embed, count_percent, count_ques, count_dash, count_equal,
              url_length, hostname_length, suspicious_words, fd_length, tld_length]:
        values.append(f(url))
    return values

def get_prediction_from_url(url):
    model = pickle.load(open("model", "rb"))
    
    features_test = get_features(url)
    features_test = np.array(features_test).reshape((1, -1))
    pred = model.predict(features_test)
    return ["benign", "defacement", "phishing", "malware"][pred[0]]

In [164]:
get_prediction_from_url("http://pikabu.ru/tag/%D0%BC%D0%B8%D0%BD%D0%B8%D0%BC%D0%B0%D0%BB%D0%B8%D0%B7%D0%BC/hot")

'benign'