In [85]:
import numpy as np
import pandas as pd
from urllib.parse import urlparse
from tld import get_tld
import os.path
import scipy as sp
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,classification_report,accuracy_score

In [3]:
data = pd.read_csv("dataset/finalData.csv")


In [27]:
# data = data.drop("Unnamed: 0", axis=1)
data.tail()

Unnamed: 0,url,label,result,urlLength,hostLength,pathLength,dirLength,tld_length,num@,num-,num.,num?,num-www,num=,num%,num-digit,num-letter,num-dir,checkIp,isShorted
450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1,43,11,25,8,3,0,1,2,0,0,0,0,0,34,3,1,-1
450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,159,13,139,2,3,0,0,2,0,0,1,0,21,118,12,1,1
450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,147,13,127,2,3,0,0,1,0,0,1,0,20,109,12,1,1
450174,http://atualizapj.com/,malicious,1,22,14,1,0,3,0,0,1,0,0,0,0,0,17,1,1,1
450175,http://writeassociate.com/test/Portal/inicio/I...,malicious,1,143,18,118,4,3,0,1,4,0,1,0,0,9,118,7,1,1


In [94]:
print("Total URLs: {}".format(len(data)))

print("Benign URL: {}".format(len(data[data['result']==0])))

print("Malicious URL: {}".format(len(data[data['result']==1])))

Total URLs: 450176
Benign URL: 345738
Malicious URL: 104438


In [71]:
def processData(url):
    byToken = str(url.encode('utf-8')).split('/')
    total = []
    for t in byToken:
        dashToken = str(t).split('-')
        dotToken = []
        for i in range(0, len(dashToken)):
            temp = str(dashToken[i]).split('.')
            dotToken += temp
        total += dashToken + dotToken
    return list(set(total))

In [83]:
vectorizer = TfidfVectorizer(tokenizer=processData)
X = vectorizer.fit_transform(data['url'])
Y = data['label']

In [73]:
data.columns

Index(['url', 'label', 'result', 'urlLength', 'hostLength', 'pathLength',
       'dirLength', 'tld_length', 'num@', 'num-', 'num.', 'num?', 'num-www',
       'num=', 'num%', 'num-digit', 'num-letter', 'num-dir', 'checkIp',
       'isShorted'],
      dtype='object')

In [75]:
features = sp.sparse.csr_matrix(data[['urlLength', 'hostLength', 'pathLength',
       'dirLength', 'tld_length', 'num@', 'num-', 'num.', 'num?', 'num-www',
       'num=', 'num%', 'num-digit', 'num-letter', 'num-dir', 'checkIp',
       'isShorted']].values)

In [81]:
testing = hstack([X, features])

In [82]:
testing.shape

(450176, 780489)

In [86]:
rfc = RandomForestClassifier()
x_train, x_test, y_train, y_test = train_test_split(testing, Y, train_size=0.3, random_state=42)

In [87]:
rfc.fit(x_train,y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [89]:
result = rfc.predict(x_test)
accuracy_score(y_test, result)

0.9976771048856958

In [90]:
confusion_matrix(y_test, result)

array([[241779,    173],
       [   559,  72613]])

In [97]:
print(classification_report(y_test, result,digits=3))

              precision    recall  f1-score   support

      benign      0.998     0.999     0.998    241952
   malicious      0.998     0.992     0.995     73172

    accuracy                          0.998    315124
   macro avg      0.998     0.996     0.997    315124
weighted avg      0.998     0.998     0.998    315124

