In [71]:
import pandas as pd
import numpy as np
import os
import re
import tldextract
from urllib.parse import urlparse
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from tqdm import tqdm

In [99]:
def get_feature(url):
    try:
        f1 = len(url)

        symbols = 0
        alpha_num = 0
        for i in range(len(url)):
            if(url[i].isalnum()):
                alpha_num += 1
            else: 
                symbols += 1

        f2 = symbols/alpha_num

        sus_symbol_count = 0
        sus_symbols = ['`','%','#','^','$','&','-','*',':']
        for i in range(len(url)):
            if(url[i] in sus_symbols):
                sus_symbol_count += 1

        f3 = sus_symbol_count

        f4 = len(urlparse(url).path)

        sus_keywords = [ 'submit', 'secure', 'suspend','confirm', 'webscr',
                        'account','login', 'signin', 'logon', 
                        'cmd', 'update', 'wp', 'index', 'payment',
                        'home', 'paypal', 'webhostapp', 'dropbox']

        all_str = ''
        for s in range(len(url)):
            if url[s].isalpha():
                all_str += url[s]

        sus_keyword_count = 0
        for i in range(len(url)):
            for j in range(i+1, len(url)+1):
                sub_str= url[i: j]
                if sub_str in sus_keywords:
                    sus_keyword_count += 1

        f5 = sus_keyword_count

        protocol_used = urlparse(url).scheme # finds https/http/ftp etc.
        f6 = 0 if(protocol_used == '') else 1 

        f7 = url.count('-')

        f8 = 0 if(url[-1].isalpha()) else 1 #last character is a symbol

        redirection_count = 0
        for i in range(1, len(url)-1):
            if(url[i] == '/'):
                if(url[i-1] == ':' and url[i+1] == '/'):
                    continue
                elif(url[i+1] == '/'):
                    redirection_count += 1

        f9 = 0 if(redirection_count == 0) else 1   

        f10 = 0 if(url.count('@') == 0) else 1 #presence of @

        f11 = url.count('/') - 2*url.count('//') # not counting the // slashes

        # regex to check if url contains IP address
        f12 = 0 if(re.match(r'http://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}/.*', url) is None) else 1 

        f13 = url.count('?')

        f14 = len(tldextract.extract(url).subdomain.split('.')) # NO. of subdomains

        if (url.find('www') == -1):
            f15 = 1 # no www found means phish
        else:
            f15 = 0

        if (url.find('http') == -1 or url.find('https') == -1):
            f16 = 0 # not found means phish
        else:
            f16 = 1

        f17 = 1 if(urlparse(url).port is not None) else 0

        f18 = 0 if(len(url) == len(url.encode())) else 1 #If not ascii, then other unicode symbols are present

        features = list([f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15, f16, f17, f18])
    #     print(len(features))
    #     print(features)
        return(features)
    
    except Exception as e:
        return list([])


In [102]:
def feature_extraction(path_to_csv):
    data_list=[]

    df_url = pd.read_csv(path_to_csv)
    # phishing urls are only 104438 in number.
    df_equal = df_url.groupby('label').sample(n=104438, random_state=1) # getting equal no. of both classes
#     print(df_equal.shape)
    
    for index, row in tqdm(df_equal.iterrows()):
        final_feature = get_feature(row['url'])
        if(len(final_feature) == 0): # If there's error and the features returned are 0, then continue
            continue
        else:
            class_label = row['result']

            final_feature.insert(0,row['url'])
            final_feature.insert(1,class_label)

            data_list.append(final_feature)
        
       
    return(data_list)

In [103]:
csv_path = 'YOUR_PATH_TO_CSV/urldata.csv'

data_list1 = feature_extraction(csv_path)

df = pd.DataFrame(data_list1)
# #  --------------------------------------------------------------------------------------
                                
df.rename(columns = {0: "url", 1: "label"}, inplace = True)

208876it [04:04, 853.16it/s] 


In [104]:
df.head()

Unnamed: 0,url,label,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,https://www.amazon.co.uk/find-me-olwen-wymark-...,0,151,0.237705,17,29,0,1,4,1,0,0,2,0,1,1,0,1,0,0
1,https://www.marcelmusic.com/,0,28,0.272727,1,1,0,1,0,1,0,0,1,0,0,1,0,1,0,0
2,https://www.mediafire.com/?j4ymvncnobm,0,38,0.225806,1,1,0,1,0,0,0,0,1,0,1,1,0,1,0,0
3,https://gray-seddon-tea.com/,0,28,0.333333,3,1,0,1,2,1,0,0,1,0,0,1,1,1,0,0
4,https://www.wn.com/Quebec_St_Malo_Race,0,38,0.310345,1,20,0,1,0,0,0,0,1,0,0,1,0,1,0,0


In [105]:
df.shape

(208875, 20)

In [106]:
df.to_csv('url_features.csv', index=False)

### SVM Classification

In [None]:
# df=pd.read_csv('url_features.csv')

In [107]:
array=df.values
x_feature=array[:,2:]
y_label=array[:,1].astype('int')
print(x_feature.shape)
print(y_label.shape)

(208875, 18)
(208875,)


In [108]:
X_train,X_test,Y_train,Y_test=train_test_split(x_feature,y_label,test_size=0.10,random_state=7)

In [109]:
# Normalise the data after splitting to avoid information leak between train and test set.

scaler_norm = MinMaxScaler()

X_train = scaler_norm.fit_transform(X_train)
X_test = scaler_norm.fit_transform(X_test)

In [110]:
# Random check on 10 folds cross-validation
model_SVC=SVC(kernel='rbf',C=100,gamma=0.001)

kfold=KFold(n_splits=10, shuffle=True)
cv_results=cross_val_score(model_SVC,X_train,Y_train,cv=kfold,scoring='accuracy')
msg="%s %f (%f)" % ('Training Accuracy: ',cv_results.mean(),cv_results.std())
print(msg)

Training Accuracy:  0.992558 (0.000762)


In [112]:
model_SVC = SVC(C=100,gamma=0.001, kernel='rbf')
model_SVC.fit(X_train,Y_train) 

predictions=model_SVC.predict(X_test)

print(accuracy_score(Y_test,predictions))
print(confusion_matrix(Y_test,predictions))
print(classification_report(Y_test,predictions))

0.9924358483339717
[[10492    25]
 [  133 10238]]
              precision    recall  f1-score   support

           0       0.99      1.00      0.99     10517
           1       1.00      0.99      0.99     10371

    accuracy                           0.99     20888
   macro avg       0.99      0.99      0.99     20888
weighted avg       0.99      0.99      0.99     20888



### Conclusion
This is an implementation of the paper -**"Phishing URL detection system based on URL features using SVM"** (http://eses.net.in/documents/paper5.2.3.pdf). Please cite this for any usage.
The dataset used for this is taken from here - https://www.kaggle.com/siddharthkumar25/malicious-and-benign-urls

The dataset is different from the one mentioned in the paper. However, the feature extraction process and the svm classification is the same. We have used 104,437 phishing urls and 104,438 genuine urls, in total 208,875 urls. We have extracted 18 features f1 to f18, and used all for the classification while in the paper they only proceeded with using the first 15 features.
The slight difference in result accuracy(99.24%) is due to a much bigger, different dataset.