In [27]:
import pandas as pd
from urllib.parse import urlparse
from tld import get_tld
import re

In [28]:
data  = pd.read_csv("dataset/urldata.csv")
data.head()

Unnamed: 0.1,Unnamed: 0,url,label,result
0,0,https://www.google.com,benign,0
1,1,https://www.youtube.com,benign,0
2,2,https://www.facebook.com,benign,0
3,3,https://www.baidu.com,benign,0
4,4,https://www.wikipedia.org,benign,0


In [29]:
#drop the unnamed column
data = data.drop("Unnamed: 0",1)

# Length Features

In [30]:
#Calculate Length
data['urlLength'] = data['url'].apply(lambda i: len(str(i)) )

In [31]:
#Calculate host and path length
def host_length(url):
    length = len(urlparse(url).netloc)
    return length

def path_length(url):
     return urlparse(url).path
     

data['hostLength'] = data['url'].apply(lambda i: host_length(i))

data['pathLength'] = data['url'].apply(lambda i: len(path_length(i)))

In [32]:
#length of first directory

def dir_length(url):
    dirl = urlparse(url).path
    try:
        return len(dirl.split('/')[1])
    except: 
        return 0

data['dirLength'] = data['url'].apply(lambda i: dir_length(i))

In [33]:
#length of Top-Level domain
data['tld'] = data['url'].apply(lambda i: get_tld(i,fail_silently=True))

def tld_length(tld):
    try:
        return len(tld)
    except:
        return -1

data['tld_length'] = data['tld'].apply(lambda i: tld_length(i))
data = data.drop('tld',1)

In [34]:
data.tail()

Unnamed: 0,url,label,result,urlLength,hostLength,pathLength,dirLength,tld_length
450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1,43,11,25,8,3
450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,159,13,139,2,3
450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,147,13,127,2,3
450174,http://atualizapj.com/,malicious,1,22,14,1,0,3
450175,http://writeassociate.com/test/Portal/inicio/I...,malicious,1,143,18,118,4,3


# Count Features 

In [35]:
data['num@'] = data['url'].apply(lambda i: i.count('@'))
data['num-'] = data['url'].apply(lambda i: i.count('-'))
data['num.'] = data['url'].apply(lambda i: i.count('.'))
data['num?'] = data['url'].apply(lambda i: i.count('?'))
data['num-www'] = data['url'].apply(lambda i: i.count('www'))
data['num='] = data['url'].apply(lambda i: i.count('='))
data['num%'] = data['url'].apply(lambda i: i.count('%'))

In [36]:
#Digit Count
def digit_count(url):
    count=0
    for c in url:
        if c.isnumeric():
            count+=1
    return count

#Letter Count
def letter_count(url):
    count=0
    for c in url:
        if c.isalpha():
            count+=1
    return count


In [37]:
data['num-digit'] = data['url'].apply(lambda i: digit_count(i))
data['num-letter'] = data['url'].apply(lambda i: letter_count(i))

#Directory Count
data['num-dir'] = data['url'].apply(lambda i: urlparse(i).path.count('/'))

In [38]:
data.tail()

Unnamed: 0,url,label,result,urlLength,hostLength,pathLength,dirLength,tld_length,num@,num-,num.,num?,num-www,num=,num%,num-digit,num-letter,num-dir
450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1,43,11,25,8,3,0,1,2,0,0,0,0,0,34,3
450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,159,13,139,2,3,0,0,2,0,0,1,0,21,118,12
450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,147,13,127,2,3,0,0,1,0,0,1,0,20,109,12
450174,http://atualizapj.com/,malicious,1,22,14,1,0,3,0,0,1,0,0,0,0,0,17,1
450175,http://writeassociate.com/test/Portal/inicio/I...,malicious,1,143,18,118,4,3,0,1,4,0,1,0,0,9,118,7


# Binary Features

In [39]:
#Check if url contains IPv4 / IPv6 / IPv4 in hex
def checkIP(url):
    check = re.search(
        '(([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\.'
        '([01]?\\d\\d?|2[0-4]\\d|25[0-5])\\/)|' 
        '((0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\.(0x[0-9a-fA-F]{1,2})\\/)'
        '(?:[a-fA-F0-9]{1,4}:){7}[a-fA-F0-9]{1,4}', url)
    if check:
        return -1
    else:
        return 1



#Check if the url is shorted or not    
def checkShorted(url):
    check = re.search('bit\.ly|goo\.gl|shorte\.st|go2l\.ink|x\.co|ow\.ly|t\.co|tinyurl|tr\.im|is\.gd|cli\.gs|'
                      'yfrog\.com|migre\.me|ff\.im|tiny\.cc|url4\.eu|twit\.ac|su\.pr|twurl\.nl|snipurl\.com|'
                      'short\.to|BudURL\.com|ping\.fm|post\.ly|Just\.as|bkite\.com|snipr\.com|fic\.kr|loopt\.us|'
                      'doiop\.com|short\.ie|kl\.am|wp\.me|rubyurl\.com|om\.ly|to\.ly|bit\.do|t\.co|lnkd\.in|'
                      'db\.tt|qr\.ae|adf\.ly|goo\.gl|bitly\.com|cur\.lv|tinyurl\.com|ow\.ly|bit\.ly|ity\.im|'
                      'q\.gs|is\.gd|po\.st|bc\.vc|twitthis\.com|u\.to|j\.mp|buzurl\.com|cutt\.us|u\.bb|yourls\.org|'
                      'x\.co|prettylinkpro\.com|scrnch\.me|filoops\.info|vzturl\.com|qr\.net|1url\.com|tweez\.me|v\.gd|'
                      'tr\.im|link\.zip\.net',
                      url)
    if check:
        return -1
    else:
        return 1

In [40]:
data['checkIp'] = data['url'].apply(lambda i : checkIP(i))
data['isShorted'] = data['url'].apply(lambda i : checkShorted(i))

In [41]:
data.tail()

Unnamed: 0,url,label,result,urlLength,hostLength,pathLength,dirLength,tld_length,num@,num-,num.,num?,num-www,num=,num%,num-digit,num-letter,num-dir,checkIp,isShorted
450171,http://ecct-it.com/docmmmnn/aptgd/index.php,malicious,1,43,11,25,8,3,0,1,2,0,0,0,0,0,34,3,1,-1
450172,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,159,13,139,2,3,0,0,2,0,0,1,0,21,118,12,1,1
450173,http://faboleena.com/js/infortis/jquery/plugin...,malicious,1,147,13,127,2,3,0,0,1,0,0,1,0,20,109,12,1,1
450174,http://atualizapj.com/,malicious,1,22,14,1,0,3,0,0,1,0,0,0,0,0,17,1,1,1
450175,http://writeassociate.com/test/Portal/inicio/I...,malicious,1,143,18,118,4,3,0,1,4,0,1,0,0,9,118,7,1,1


In [42]:
data.to_csv('dataset/finalData.csv')