In [24]:
import requests
import tldextract
import json
from urlparse import urlparse
import pandas as pd 
import validators

def get_domain_age_in_days(domain):
    """Query the PayAPI domain-age endpoint for *domain*.

    Args:
        domain: Domain name; appended verbatim to the endpoint URL.

    Returns:
        The value stored under ``'result'`` in the JSON response, or
        ``None`` when the response carries no such key.

    Raises:
        requests.exceptions.RequestException: On connection failure or
            when the server does not answer within the timeout.
        ValueError: If the response body is not valid JSON.
    """
    endpoint = "https://input.payapi.io/v1/api/fraud/domain/age/" + domain
    # requests waits forever by default; bound the wait so that a single
    # unresponsive lookup cannot hang the whole feature-extraction run.
    payload = requests.get(endpoint, timeout=10).json()
    return payload['result'] if 'result' in payload else None

def parse_domain_from_url(url):
    """Return the last two dot-separated labels of the URL's host name.

    The host is taken from ``urlparse(url).hostname`` so that a port
    (``example.com:8080``) or credentials (``user:pw@example.com``) never
    leak into the result. The host name is lower-cased by ``urlparse``.

    NOTE(review): keeping only the last two labels is wrong for multi-part
    public suffixes (``www.bbc.co.uk`` -> ``co.uk``); a public-suffix-aware
    lookup would be needed to handle those.

    Args:
        url: Absolute URL string.

    Returns:
        A string such as ``'example.com'``, or ``''`` when the URL has no
        network location (e.g. it lacks a ``scheme://`` prefix).
    """
    host = urlparse(url).hostname or ''
    return '.'.join(host.split('.')[-2:])
    
def parse_subdomains_from_url(url):
    """Return the subdomain part of *url*, ignoring a leading ``www``.

    The subdomain is whatever ``tldextract.extract(url).subdomain``
    reports. For URLs that begin with ``http://www.`` or ``https://www.``
    the leading ``www`` label is dropped, so ``https://www.a.example.com``
    yields ``'a'`` and ``https://www.example.com`` yields ``''``.

    Only that first label is removed. Blindly deleting every ``"www."``
    substring from the URL would also corrupt other labels (``awww.``)
    and the path.

    Args:
        url: URL string to inspect.

    Returns:
        The dot-joined subdomain labels, or ``''`` when there are none.
    """
    subdomain = tldextract.extract(url).subdomain

    if url.startswith(("http://www.", "https://www.")):
        labels = subdomain.split('.')
        if labels[0] == 'www':
            labels = labels[1:]
        return '.'.join(labels)

    return subdomain
            
def count_subdomains(url):
    """Count the dot-separated labels in the subdomain part of *url*.

    The subdomain string comes from ``parse_subdomains_from_url``; an
    empty string counts as zero subdomains.
    """
    sub = parse_subdomains_from_url(url)
    return len(sub.split('.')) if len(sub) > 0 else 0
        
def popular_brand_in_url(url):
    """Tell whether *url* mentions a brand commonly abused in phishing.

    The match is a plain, case-insensitive substring test against the
    whole URL (host, path and query alike).

    Args:
        url: URL string to inspect.

    Returns:
        ``True`` if any listed brand name occurs in the URL, else ``False``.
    """
    # Brands most frequently impersonated in phishing (24 entries), taken from
    # https://www.helpnetsecurity.com/2020/02/11/brands-phishing-attacks/
    popular_phishing_brands = ("paypal", "facebook", "microsoft", "netflix", "whatsapp",
                               "bankofamerica", "cibc", "desjardins", "apple", "amazon",
                               "chase", "bnp-paribas", "instagram", "square", "dropbox",
                               "atb", "dhl", "comcast", "orange", "adobe", "mtb", "docusign",
                               "google", "credit-agricole")

    # Every brand must be checked: returning False as soon as the first
    # one misses would mean only "paypal" is ever tested.
    lowered = url.lower()
    return any(brand in lowered for brand in popular_phishing_brands)

def url_length(url):
    """Return the number of characters in *url*."""
    length = len(url)
    return length

def validate_url(url):
    """Check *url* with ``validators.url`` and report failures on stdout.

    Args:
        url: URL string to validate.

    Returns:
        ``True`` when ``validators.url`` accepts the URL; otherwise a
        notice is printed and ``False`` is returned.
    """
    if validators.url(url) is True:
        return True

    # Format the message before printing it: applying ``%`` to the result
    # of ``print(...)`` raises TypeError on Python 3, where print returns None.
    print("The URL %s doesn't seem to be valid. No feature extraction was done." % url)
    return False

def analyze_url(url):
    """Collect the feature row for a single URL.

    Returns:
        A list ``[url, domain age, subdomain count, URL length,
        brand flag]``, with each feature computed by the matching helper
        in this file.
    """
    registered_domain = parse_domain_from_url(url)

    # List elements are evaluated left to right, so the domain-age lookup
    # still runs before the other features are computed.
    return [
        url,
        get_domain_age_in_days(registered_domain),
        count_subdomains(url),
        url_length(url),
        popular_brand_in_url(url),
    ]

# WARNING: some of these URLs were live phishing sites (as of 2019-03-21);
# use with caution! More examples can be found at https://www.phishtank.com/
example_urls = [
    "https://www.slideshare.net/weaveworks/client-side-monitoring-with-prometheus",
    "http://cartaobndes.gov.br.cv31792.tmweb.ru/",
    "https://paypal.co.uk.yatn.eu/m/",
    "http://college-eisk.ru/cli/",
    "https://dotpay-platnosc3.eu/dotpay/",
]

# One feature row per URL that passes validation.
data = []

for url in example_urls:
    if validate_url(url) == True:
        data.append(analyze_url(url))

df = pd.DataFrame(
    data,
    columns=[
        "URL",
        "Domain age in days",
        "Number of subdomains",
        "URL length",
        "Popular brand in the URL",
    ],
)

# Bare expression so the notebook renders the table.
df
    

Unnamed: 0,URL,Domain age in days,Number of subdomains,URL length,Popular brand in the URL
0,https://www.slideshare.net/weaveworks/client-s...,5103.0,0,76,False
1,http://cartaobndes.gov.br.cv31792.tmweb.ru/,5017.0,4,43,False
2,https://paypal.co.uk.yatn.eu/m/,,3,31,True
3,http://college-eisk.ru/cli/,3086.0,0,27,False
4,https://dotpay-platnosc3.eu/dotpay/,,0,35,False
