In [None]:
from netaddr import valid_ipv4
import re
import requests
from datetime import date
from dateutil.parser import parse as date_parse
from urllib.parse import urlparse
import pythonwhois
import pandas as pd
import tldextract
from googlesearch import search

%run ssl_check.ipynb
%run pyphishtank.ipynb

# Calendar-month distance between two dates
def diff_month(d1, d2):
    """Return how many calendar months ``d1`` lies after ``d2``.

    Only the ``year`` and ``month`` attributes are used; the day of the
    month is ignored. The result is negative when ``d1`` falls in an
    earlier month than ``d2``.
    """
    years_apart = d1.year - d2.year
    months_apart = d1.month - d2.month
    return years_apart * 12 + months_apart

# Generate data set by extracting the features from the URL
def generate_data_set(url, *, timeout=10):
    """Extract the phishing-detection feature vector for ``url``.

    Thirty features are computed in the order listed in ``ds_columns``
    (see the end of the function). Each implemented feature is encoded as
    -1 (legitimate), 0 (suspicious) or 1 (phishing). Features that are not
    implemented are recorded as the placeholder 2 and stripped from the
    returned list.

    Every feature contributes exactly one value, even when the page, the
    WHOIS record or an external service cannot be reached, so the result
    always lines up with ``ds_columns``.

    Args:
        url: The URL to analyse.
        timeout: Seconds passed as ``timeout`` to every ``requests`` call
            made by this function, so an unresponsive host cannot block
            the extraction forever.

    Returns:
        list[int]: The feature values in column order with the placeholder
        2 entries removed (27 values).
    """
    data_set = []

    o = urlparse(url)
    print(o)

    # ``hostname`` drops any "user@" prefix and ":port" suffix, so a URL such
    # as http://paypal.com@evil.example:8080/ is analysed as "evil.example".
    host = o.hostname or ""

    # Fetch the page once; every content-based feature below reuses it.
    # Certificate verification is switched off here -- presumably because
    # phishing hosts frequently present invalid certificates.
    try:
        response = requests.get(url, verify=False, timeout=timeout)
        page_text = response.text
        redirect_count = len(response.history)
    except Exception:
        page_text = None
        redirect_count = None

    def page_feature(pattern):
        """Return 1 if ``pattern`` occurs in the fetched page, else -1."""
        if page_text is not None and re.search(pattern, page_text):
            return 1
        return -1

    # having_IP_Address: 1 if the host is a literal IPv4 address
    data_set.append(1 if host and valid_ipv4(host) else -1)

    # URL_Length
    url_length = len(url)
    if url_length < 54:
        data_set.append(-1)
    elif url_length <= 75:
        data_set.append(0)
    else:
        data_set.append(1)

    # Shortining_Service (dots escaped so they only match a literal ".")
    data_set.append(1 if re.search(r"goo\.gl|bit\.ly", url) else -1)

    # having_At_Symbol
    data_set.append(1 if "@" in url else -1)

    # double_slash_redirecting: any "//" after the leading "scheme://"
    after_scheme = re.sub(r"^(?:[a-zA-Z][a-zA-Z0-9+.-]*:)?//", "", url)
    data_set.append(1 if "//" in after_scheme else -1)

    # Prefix_Suffix
    data_set.append(1 if "-" in url else -1)

    # having_Sub_Domain - Need to work on this (counts dots in the whole URL)
    dot_count = url.count(".")
    if dot_count == 1:
        data_set.append(-1)
    elif dot_count == 2:
        data_set.append(0)
    else:
        data_set.append(1)

    # SSLfinal_State
    valid_ca = ["GeoTrust", "GoDaddy", "Network Solutions", "Thawte", "Comodo", "Doster", "VeriSign"]
    try:
        if o.scheme == "https":
            # ``o.port`` is an int (or None) and raises ValueError on a
            # malformed port, which falls through to the "suspicious" default.
            port = o.port or 443
            get_cert_data = print_basic_info(get_certificate(host, port))

            # Check whether CA is whitelisted one
            issuer = get_cert_data["issuer"].lower()
            ca_trusted = any(ca.lower() in issuer for ca in valid_ca)
            ca_long_lived = diff_month(get_cert_data["notafter"], date.today()) >= 12

            # https + trusted issuer + certificate valid >= 1 year -> legitimate,
            # otherwise -> phishing
            data_set.append(-1 if ca_trusted and ca_long_lived else 1)
        else:
            # Plain http, or any other scheme, is treated as phishing.
            data_set.append(1)
    except Exception:
        data_set.append(0)

    # WHOIS lookup shared by the registration-length, age and DNS features.
    # A failed lookup behaves like an empty record.
    try:
        details = pythonwhois.get_whois(host)
    except Exception:
        details = {}

    try:
        dns_valid = details["status"][0] != "invalid"
    except Exception:
        dns_valid = True

    # Domain_registeration_length
    if not dns_valid:
        data_set.append(1)
    else:
        try:
            # Domain expires in <= 1 year -> phishing, otherwise legitimate
            expires_soon = diff_month(details["expiration_date"][0], date.today()) <= 12
            data_set.append(1 if expires_soon else -1)
        except Exception:
            data_set.append(1)

    # Favicon (not implemented)
    data_set.append(2)

    # port
    data_set.append(-1)

    # Existence of "HTTPS" token in the domain part of the URL
    data_set.append(1 if "https" in host else -1)

    # Request_URL
    data_set.append(-1)

    # URL_of_Anchor
    data_set.append(-1)

    # Links_in_tags
    data_set.append(-1)

    # SFH
    data_set.append(-1)

    # Submitting_to_email: page calls mail() or contains a mailto: link
    data_set.append(page_feature(r"mail\(\)|mailto:"))

    # Abnormal_URL
    ext = tldextract.extract(url)
    try:
        # Requests all the information about the domain; if the domain does
        # not exist the lookup below raises and the feature becomes 1.
        whois_response = requests.get("https://www.whois.com/whois/" + ext.domain, timeout=timeout)
        reg_data = re.findall(r'Registered On:</div><div class="df-value">([^<]+)</div>', whois_response.text)[0]
        data_set.append(-1 if reg_data != "" else 1)
    except Exception:
        data_set.append(1)

    # Website Forwarding (Redirect)
    if redirect_count is None or redirect_count <= 1:
        data_set.append(-1)
    elif redirect_count < 4:
        data_set.append(0)
    else:
        data_set.append(1)

    # on_mouseover
    data_set.append(page_feature("<script>.+onmouseover.+</script>"))

    # RightClick
    data_set.append(page_feature(r"event.button ?== ?2"))

    # popUpWidnow (not implemented)
    data_set.append(2)

    # Iframe (not implemented)
    data_set.append(2)

    # age_of_domain
    try:
        if not dns_valid:
            data_set.append(1)
        # Age of domain < 6 months -> phishing
        elif diff_month(date.today(), details["creation_date"][0]) < 6:
            data_set.append(1)
        # Otherwise -> legitimate
        else:
            data_set.append(-1)
    except Exception:
        data_set.append(-1)

    # DNSRecord
    try:
        if not dns_valid:
            data_set.append(1)
        elif len(details):
            data_set.append(-1)
        else:
            data_set.append(1)
    except Exception:
        data_set.append(-1)

    # Website Traffic
    # (rank < 100,000 -> legitimate; rank >= 100,000 -> suspicious; no rank -> phishing)
    print(host)
    try:
        xml = requests.get('http://data.alexa.com/data?cli=10&dat=s&url=%s' % url, timeout=timeout)
        rank = int(re.search(r'<POPULARITY[^>]*TEXT="(\d+)"', xml.text).groups()[0])
        data_set.append(-1 if rank < 100000 else 0)
    except Exception:
        data_set.append(1)

    # Page_Rank
    try:
        rank_checker_response = requests.post(
            "https://www.checkpagerank.net/index.php", {"name": url}, timeout=timeout
        )
        # Extracts page rank
        ranks = re.findall(r"Global Rank: ([0-9]+)", rank_checker_response.text)
        if not ranks:
            data_set.append(1)
        else:
            page_rank = int(ranks[0])
            print(page_rank)
            # PageRank < 0.2 -> phishing
            data_set.append(-1 if page_rank > 0.2 else 1)
    except Exception:
        data_set.append(-1)

    # Google_Index
    # https://www.geeksforgeeks.org/performing-google-search-using-python-code/
    try:
        # Use the subdomain for the search if present, else the domain;
        # double quotes make Google search for the exact string.
        search_str = "\"{}\"".format(ext.subdomain or ext.domain)
        tld_conf = ext.suffix
        count = sum(1 for _ in search(search_str, tld=tld_conf, num=2, stop=1, pause=1))
        data_set.append(-1 if count != 0 else 1)
    except Exception:
        data_set.append(-1)

    # Links_pointing_to_page
    if page_text is None:
        data_set.append(-1)
    else:
        number_of_links = len(re.findall(r"<a href=", page_text))
        if number_of_links == 0:
            data_set.append(1)
        elif number_of_links <= 2:
            data_set.append(0)
        else:
            data_set.append(-1)

    # Statistical_report: 1 only when PhishTank lists the URL as a valid phish.
    # A failed lookup is treated like "not in the database".
    try:
        result = PhishTank().check(url)
        print(url, result.valid)
        data_set.append(1 if result.in_database and result.valid else -1)
    except Exception:
        data_set.append(-1)

    print(data_set)

    ds_columns = ["having_IP_Address", "URL_Length", "Shortining_Service", "having_At_Symbol", "double_slash_redirecting", "Prefix_Suffix",
                  "having_Sub_Domain", "SSLfinal_State", "Domain_registeration_length", "Favico", "port", "HTTPS_token", "Request_URL",
                  "URL_of_Anchor", "Links_in_tags", "SFH", "Submitting_to_email", "Abnormal_URL", "Redirect", "on_mouseover", "RightClick",
                  "popUpWidnow", "Iframe", "age_of_domain", "DNSRecord", "web_traffic", "Page_Rank", "Google_Index",
                  "Links_pointing_to_page", "Statistical_report"]
    print(dict(zip(ds_columns, data_set)))

    # Remove "2"s as we are not using these features
    return [x for x in data_set if x != 2]
    
#print(generate_data_set("https://paypal-e.com/auth/main/content/"))