# *특징추출 함수들* #

In [None]:
import math
import pandas as pd
import ssl
import socket
import whois
import requests
from tld import get_tld
from urllib.parse import urlparse
from dateutil.parser import parse

In [None]:
# Feature 1: Check_IPDomain    # 1: legitimate / 0: suspicious / -1: phishing

def Check_IPDomain(url):
    """Report whether the URL's host is written as a numeric (IP-style) address.

    The host is split on '.' and every label is parsed with ``int(label, 0)``,
    so decimal and prefixed (e.g. ``0x..``) integer literals count as numeric.

    Args:
        url: URL string to inspect.

    Returns:
        -1 if every dot-separated host label parses as an integer,
        1 if at least one label does not (this includes a URL with no host),
        0 if the URL could not be processed at all.
    """
    try:
        # ``hostname`` drops any port and userinfo, which ``netloc`` keeps;
        # with ``netloc`` a host such as "192.168.0.1:8080" was never detected.
        host = urlparse(url).hostname or ''
        for label in host.split('.'):
            int(label, 0)
        return -1

    except ValueError:
        # A non-numeric label (or a URL that urlparse rejects as malformed).
        return 1

    except Exception:
        # Any other failure, e.g. a non-string input.
        return 0

In [None]:
# Feature 2: Check_URLLength     # 1: legitimate / 0: suspicious / -1: phishing

def Check_URLLength(url):
    """Classify a URL by its total character count.

    Args:
        url: URL string to measure.

    Returns:
        1 when the URL is shorter than 54 characters, 0 when it is 54 to 75
        characters long (inclusive), and -1 when it is longer than 75.
    """
    length = len(url)
    if length > 75:
        return -1
    if length >= 54:
        return 0
    return 1

In [None]:
# Feature 4: Check_Symbol   # 1: legitimate / -1: phishing

def Check_Symbol(url):
    """Flag URLs that contain an '@' character.

    Args:
        url: URL string to inspect.

    Returns:
        -1 if '@' occurs anywhere in the URL, otherwise 1.
    """
    return -1 if '@' in url else 1

In [None]:
# Feature 5: Check_SubDomain # 1: legitimate / -1: phishing

def Check_SubDomain(url):
    """Classify a URL by the number of dots in front of its first-level domain.

    Args:
        url: URL string to inspect.

    Returns:
        1 if the part of the host that precedes the first-level domain (as
        reported by ``get_tld``) contains exactly one dot, e.g. ``www.``;
        -1 otherwise, and also when parsing the URL or ``get_tld`` fails.
    """
    try:
        # ``hostname`` excludes any port/userinfo; slicing ``netloc`` by the
        # length of the first-level domain was off whenever those were present.
        host = urlparse(url).hostname or ''
        res = get_tld(url, as_object=True)
    except Exception:
        return -1

    fld = res.fld
    # Host prefix before the first-level domain, trailing dot included.
    sub = host[:-len(fld)]

    # NOTE(review): a bare domain (no sub-domain at all) has zero dots and is
    # therefore reported as -1 -- confirm this is the intended labelling.
    return 1 if sub.count('.') == 1 else -1

In [None]:
# Feature 10: Check_SpecialCharacters        # 1: legitimate / -1: phishing

def Check_SpecialCharacters(url):
    """Flag URLs that contain any of the characters '&', '!' or '='.

    Args:
        url: URL string to inspect.

    Returns:
        -1 if at least one of the three characters occurs in the URL,
        otherwise 1.
    """
    flagged = ('&', '!', '=')
    if any(symbol in url for symbol in flagged):
        return -1
    return 1

In [None]:
# Feature 11: Check_HostLength        # 1: legitimate / -1: phishing

def Check_HostLength(url):
    """Classify a URL by the length of its first-level domain.

    Args:
        url: URL string to inspect.

    Returns:
        -1 if the first-level domain reported by ``get_tld`` is longer than
        22 characters or if ``get_tld`` fails, otherwise 1.
    """
    try:
        res = get_tld(url, as_object=True)
        return -1 if len(res.fld) > 22 else 1
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C and
        # interpreter shutdown are not silently turned into a -1 result.
        return -1

In [None]:
# Feature 12: Check_SuspiciousWord        # 1: legitimate / -1: phishing

def Check_SuspiciousWord(url):
    """Flag URLs that contain one of a fixed set of suspicious substrings.

    The match is a plain, case-sensitive substring test against
    'signin', 'wp', 'update', 'login' and 'admin'.

    Args:
        url: URL string to inspect.

    Returns:
        -1 if any of the substrings occurs in the URL, otherwise 1.
    """
    suspicious = ('signin', 'wp', 'update', 'login', 'admin')
    if any(word in url for word in suspicious):
        return -1
    return 1

In [None]:
# Feature 13: Check_URLEntropy          # 1: legitimate / -1: phishing

def entropy_matrix(url):
    """Compute the Shannon entropy (natural log) of the URL's characters.

    The scheme name is stripped first by slicing off ``len(scheme)``
    characters, so the "://" separator remains part of the measured text.

    Args:
        url: URL string to measure.

    Returns:
        The entropy in nats as a float, or 0 when the URL cannot be parsed
        or nothing is left to measure.
    """
    from collections import Counter

    try:
        url_parse = urlparse(url)
        url_split = url[len(url_parse.scheme):]
    except Exception:
        return 0

    total = len(url_split)
    if total == 0:
        # Empty input used to raise ZeroDivisionError; an empty string
        # carries no information, so report zero entropy.
        return 0

    per = 1 / total
    # One pass to count characters instead of one ``str.count`` per symbol.
    percent = [count * per for count in Counter(url_split).values()]

    return -sum(p * math.log(p) for p in percent)

def Check_URLEntropy(url):
    """Classify a URL by the entropy of its characters.

    Args:
        url: URL string to inspect.

    Returns:
        1 if ``entropy_matrix(url)`` is below 3.2, otherwise -1.
    """
    return 1 if entropy_matrix(url) < 3.2 else -1

In [None]:
# Feature 14: Check_NumRatio           # 1: legitimate / -1: phishing

def NumRatio(url):
    """Compute the fraction of decimal-digit characters in the URL.

    The scheme name is stripped first by slicing off ``len(scheme)``
    characters, so the "://" separator still counts towards the length.

    Args:
        url: URL string to measure.

    Returns:
        digits / total characters as a float, or 0 when the URL cannot be
        parsed or nothing is left to measure.
    """
    try:
        url_parse = urlparse(url)
        url_split = url[len(url_parse.scheme):]
    except Exception:
        return 0

    if not url_split:
        # Empty input used to raise ZeroDivisionError.
        return 0

    # ``str.isdecimal`` accepts the same single characters that ``int(ch)``
    # does, without raising an exception for every non-digit.
    num = sum(1 for ch in url_split if ch.isdecimal())

    return num / len(url_split)

def Check_NumRatio(url):
    """Classify a URL by its share of digit characters.

    Args:
        url: URL string to inspect.

    Returns:
        -1 if ``NumRatio(url)`` exceeds 0.17, otherwise 1.
    """
    return -1 if NumRatio(url) > 0.17 else 1

In [None]:
# Feature 9: Check_RegiLength     # 1: legitimate / 0: suspicious / -1: phishing

def _first_whois_date(value):
    """Return the first entry when a WHOIS field is a list, else the value itself."""
    if isinstance(value, list):
        # An empty list is treated like a missing value instead of raising IndexError.
        return value[0] if value else None
    return value


def Get_RegiLength(url):
    """Return the number of days between a domain's update and expiration dates.

    Args:
        url: URL or domain passed to ``whois.whois``.

    Returns:
        ``(expiration_date - updated_date).days``, or 0 when either date is
        missing from the WHOIS record.

    Raises:
        Whatever ``whois.whois`` or the date subtraction raises; the caller
        is expected to handle it.
    """
    domain_info = whois.whois(url)
    expiration_date = _first_whois_date(domain_info.expiration_date)
    updated_date = _first_whois_date(domain_info.updated_date)

    if expiration_date is None or updated_date is None:
        return 0

    # NOTE(review): this measures from the last update, not from creation --
    # confirm that is the intended definition of "registration length".
    return (expiration_date - updated_date).days

def Check_RegiLength(url):
    """Classify a URL by the registration length reported by ``Get_RegiLength``.

    Args:
        url: URL or domain to look up.

    Returns:
        -1 if the length is 365 days or less, or if the WHOIS lookup raises
        ``whois.parser.PywhoisError``; 1 if it is longer than 365 days;
        0 if any other error occurs.
    """
    try:
        RegiLength = Get_RegiLength(url)
        return -1 if RegiLength <= 365 else 1
    except whois.parser.PywhoisError:
        return -1
    except Exception:
        # ``Exception`` instead of a bare ``except`` so that Ctrl-C can still
        # stop a long-running batch of WHOIS lookups.
        return 0

In [None]:
# Feature 6: Check_SSLnOrg   # 1: legitimate / 0: suspicious / -1: phishing


def https_connect(murl):
    """Open a verified TLS connection to ``murl`` on port 443.

    Args:
        murl: Host name to connect to; also used as the TLS server name.

    Returns:
        The connected TLS socket. The caller is responsible for closing it.

    Raises:
        Any socket or SSL error raised while connecting or handshaking; the
        socket is closed before the error propagates.
    """
    ctx = ssl.create_default_context()
    s = ctx.wrap_socket(socket.socket(), server_hostname=murl)
    try:
        s.settimeout(30.0)  # TODO: decide on an appropriate connect timeout
        s.connect((murl, 443))
    except BaseException:
        # Do not leak the file descriptor when the connection fails.
        s.close()
        raise

    return s

def Check_SSLnOrg(url):
    """Classify a URL by its TLS certificate issuer and validity period.

    Args:
        url: URL whose host is contacted on port 443.

    Returns:
        -1 if no TLS connection could be established;
        0 if the issuer's organization is missing or is not listed in column
        '0' of ``./dataset/trust_organization.csv``, or if the certificate is
        valid for fewer than 365 days;
        1 if the issuer is listed and the validity period is at least 365 days.
    """
    try:
        # ``hostname`` excludes any port/userinfo that ``netloc`` would carry
        # into the DNS lookup and the TLS server name.
        murl = urlparse(url).hostname
        if not murl:
            return -1
        s = https_connect(murl)
    except Exception:
        return -1

    try:
        cert = s.getpeercert()
    finally:
        # The connection is only needed for the certificate; always release it.
        s.close()

    issuer = dict(x[0] for x in cert['issuer'])
    # Some certificates carry no organizationName; treat that as untrusted
    # instead of raising KeyError.
    issued_by = issuer.get('organizationName')
    if issued_by is None:
        return 0

    data = pd.read_csv('./dataset/trust_organization.csv')
    trust_orglist = data['0']
    if issued_by not in set(trust_orglist):
        return 0

    init_date = parse(cert['notBefore'])
    expiration_date = parse(cert['notAfter'])
    total_days = (expiration_date.date() - init_date.date()).days
    return 1 if total_days >= 365 else 0

In [None]:
# Feature 15: Check_IsHttps           # 1: legitimate / 0: suspicious / -1: phishing

def Check_IsHttps(url):
    """Classify a URL by its scheme.

    Args:
        url: URL string to inspect.

    Returns:
        1 for an 'https' scheme, -1 for 'http', and 0 for anything else
        (including a URL with no scheme).
    """
    verdict_by_scheme = {'https': 1, 'http': -1}
    return verdict_by_scheme.get(urlparse(url).scheme, 0)

In [None]:
# Feature 3: Check_Shortening       # 1: legitimate / 0: suspicious / -1: phishing

def Check_Shortening(url):
    """Classify a URL by whether fetching it goes through an HTTP redirect.

    Args:
        url: URL to request with ``requests.get``.

    Returns:
        -1 if any response in the redirect history has a 3xx status,
        1 if the request completed without such a redirect,
        0 if the request failed (connection error, timeout, invalid URL, ...).
    """
    try:
        http_status = requests.get(url, timeout=30).history  # 30 s timeout
    except Exception:
        return 0

    # Previously the loop returned on its first item and fell through with an
    # implicit ``None`` when the history was empty (i.e. no redirect at all).
    if any(response.status_code // 100 == 3 for response in http_status):
        return -1
    return 1

---

## *한번에 csv로 추출하기* ##

In [None]:
# Feature extractors, in the order their columns are written to the CSV.
func = [
    Check_IPDomain,
    Check_URLLength,
    Check_Symbol,
    Check_SubDomain,
    Check_SpecialCharacters,
    Check_HostLength,
    Check_SuspiciousWord,
    Check_URLEntropy,
    Check_NumRatio,
    Check_RegiLength,
    Check_SSLnOrg,
    Check_IsHttps,
    Check_Shortening,
]

# Column names, index-aligned with ``func``.
func_name = [
    'Check_IPDomain',
    'Check_URLLength',
    'Check_Symbol',
    'Check_SubDomain',
    'Check_SpecialCharacters',
    'Check_HostLength',
    'Check_SuspiciousWord',
    'Check_URLEntropy',
    'Check_NumRatio',
    'Check_RegiLength',
    'Check_SSLnOrg',
    'Check_IsHttps',
    'Check_Shortening',
]

In [None]:
from tqdm import tqdm

# Load the labelled URL list and seed the output frame with it.
data = pd.read_csv('./dataset/dataset.csv')
url = data['url']
label = data['label']

parsed = pd.DataFrame()
parsed['url'] = url
parsed['label'] = label

# Run one extractor at a time over every URL; the CSV is rewritten after each
# feature column is added, so completed columns are already on disk.
for extractor, column in zip(func, func_name):
    extracted = [extractor(url[row]) for row in tqdm(range(len(url)))]
    parsed[column] = extracted
    parsed.to_csv('parsed_dataset.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████| 21430/21430 [00:00<00:00, 93603.94it/s]
100%|████████████████████████████████████████████████████████████████████████| 21430/21430 [00:00<00:00, 397919.01it/s]
  2%|█▎                                                                          | 359/21430 [19:24<4:11:44,  1.39it/s]

---

## *하나씩 csv로 추출하기* ##

In [None]:
from tqdm import tqdm

# Reload the source URLs and the previously extracted feature table.
data = pd.read_csv('./dataset/dataset.csv')
url, label = data['url'], data['label']

parsed_data = pd.read_csv('parsed_dataset.csv')
parsed_data['url'] = url
parsed_data['label'] = label

# The single feature to (re)compute and the column it is stored under.
func = Check_Shortening
func_name = 'Check_Shortening'

extracted = [func(url[i]) for i in tqdm(range(len(url)))]

parsed_data[func_name] = extracted
parsed_data.to_csv('parsed_dataset.csv', index=False)

100%|█████████████████████████████████████████████████████████████████████████| 21430/21430 [00:00<00:00, 47907.28it/s]


---

# 머신러닝 #

---

# Google Extension #

### 1. 받은 URL을 특징별로 분류하기 ###

In [None]:
def parse_url(url):
    """Run every feature extractor on one URL and collect the results in order.

    Args:
        url: URL string to extract features from.

    Returns:
        A list with one entry per extractor, in the order listed below. An
        extractor that raises contributes 0 instead of aborting the run.
    """
    # NOTE(review): the batch-extraction list in this file also contains
    # Check_IsHttps (between Check_SSLnOrg and Check_Shortening); confirm
    # that this list matches the features the model was trained on.
    func = [
        Check_IPDomain,
        Check_URLLength,
        Check_Symbol,
        Check_SubDomain,
        Check_SpecialCharacters,
        Check_HostLength,
        Check_SuspiciousWord,
        Check_URLEntropy,
        Check_NumRatio,
        Check_RegiLength,
        Check_SSLnOrg,
        Check_Shortening,
        # Add new feature-extraction functions here.
    ]

    # Named ``features`` rather than ``parse`` so the module-level
    # ``dateutil.parser.parse`` import is not shadowed inside this function.
    features = []
    for extractor in func:
        try:
            features.append(extractor(url))
        except Exception:
            # Best effort: a failing extractor yields the neutral value 0.
            features.append(0)

    return features

### 2. 학습한 머신러닝 알고리즘으로 결과 추측하기 ###