Class to accumulate whois records and risk factors.
They will memoized to reduce requested services for more data.

1. Use a database to store the results for use in various programs.
2. Use an ipnetwork as key
3. Store company name with key
4. When a class instance is created, it will load db into dictionary.
5. The instance is normally readonly, but can be writeable. That means new addresses
will be added if they are not in the database and become permanent parts.



In [12]:
import ipaddress
import dbm
import pickle
import sys
# from sortedcontainers import SortedDict
import pprint
pp = pprint.PrettyPrinter()


In [13]:
import requests
from bs4 import BeautifulSoup
import json
import os

def get_risk(ip_string):
    # Return risk factors from scamalytics into a dict
    #     {"ip": ? , 
    #      "score": ?, 
    #      "risk": ?, 
    #      "risk_comment: ?"}

    # Fetch the complete record from scamalytics restful api
    # ip_string ... make request by ip address as a string

    html_text = ""
    url = "https://scamalytics.com/ip/" + ip_string
    html_text = requests.get(url).text

    soup = BeautifulSoup(html_text, 'lxml')
    
    # Tag=pre
    result = json.loads(soup.pre.string)
    
    # The comment is in the body of an unlabelled div. Used the css class to find.
    # Remove special UTF-8 character \U200b, a zero width space.
    result["risk_comment"] = soup.find_all("div", class_="panel_body")[0].get_text().replace("\u200b","")
    
    return result

def get_arin(ip_string):
    '''Return dict for the net_address that contains this ip_string
        {"cidr": ?,
         {"organization": ? ,
          "handle": ? ,
          "asn": ?,
          "city": ? ,
          "address" : ? ,
          "postalcode": ? ,
          "countrycode": ? ,
          "state": ? ,
          "country": ? ,
         }
    '''
     
    # Fetch the complete record from arin restful api
    # Ref: https://www.arin.net/resources/registry/whois/rws/api/#networks-and-asns
    # ip ... make request by ip address as a string
    # pft .. get full record
    
    url       = "http://whois.arin.net/rest/ip/" + ip_string + "/pft"
    html_text = ""
    html_text = requests.get(url).text

    # Parse html into a hierarchy using BeautifulSoup 
    soup = BeautifulSoup(html_text, 'lxml')
    
    # Parse into dict to return results, item by item
    result = {}
    
    # ARIN reports a list of CIDR net_addresses. 
    # The database will be indexed by ipaddress.net_address.
    # A record will be written for each cidr and duplicate the ARIN info
    # Obtain the organization name from tag=net instead of the tag=org which
    # has more than one tag=name making it harder to isolate.
    info = {}
    info["organization"] = soup.net.orgref["name"]
    info["handle"]       = soup.net.orgref["handle"]
    info["asn"]          = soup.net.originas.string

    # Obtain rest of the info from tag=org
    info["city"]         = soup.org.city.string
    
    # More than one address line may be recorded
    address = []
    for line in soup.org.streetaddress:
        address.append(line.string)
    info["address"]      = address
    
    info["postalcode"]   = soup.org.postalcode.string
    
    # The iso3166 tags are the internation country codes
    # Ref: https://www.iso.org/glossary-for-iso-3166.html
    # The tags contain "-", illegal characters in a python var name. 
    # Use find_all to locates the tags with a string search.
    for t in soup.org.find_all("iso3166-2"):
        info["state"]    = t.string
    for t in soup.org.find_all("iso3166-1"):
        info["country"]  = t.find('name').string
        countrycode = t.code2.string
    
    # Add the risk obtained from scamalytics
    info.update(get_risk(ip_string))

    # The netblocks scope contains a list of netblock sections
    for netblock in soup.net.netblocks:
        cidr = netblock.startaddress.string + "/" + netblock.cidrlength.string      
        result[cidr] = info

    return result

pp.pprint(get_arin("100.12.31.4"))

{'100.0.0.0/12': {'address': ['22001 Loudoun County Pkwy'],
                  'asn': 'AS19262',
                  'city': 'Ashburn',
                  'country': 'United States',
                  'handle': 'MCICS',
                  'ip': '100.12.31.4',
                  'organization': 'MCI Communications Services, Inc. d/b/a '
                                  'Verizon Business',
                  'postalcode': '20147',
                  'risk': 'medium',
                  'risk_comment': 'IP address 100.12.31.4 is operated by '
                                  'Verizon Online LLC whose web traffic we '
                                  'consider to present a potentially medium '
                                  'fraud risk. This IP address is owned by MCI '
                                  'Communications Services, Inc. d/b/a Verizon '
                                  'Business whose web traffic we also consider '
                                  'to present a potentially medi

In [None]:
class Risk():
    
    def __init__(self, filename, readonly=True):

        # Open database. Create as needed.
        
        self.readonly = readonly
        self.open_option = f'{"r" if self.readonly else "w"}'
        self.db_filename = filename
        
        try:
            self.db = dbm.open(self.db_filename, self.open_option)
        except:
            if self.readonly:
                print(f"{self.db_filename} does not exist but will not be created when class is {readonly=}")
                return None
            else:
                self.db = dbm.open(self.db_filename, "c")
                
        # Read the data into dictionary:
        #   risk[ipaddress.ipv4network] = [organization, country, risk]
        #   
        
        self.risk = SortedDict()
        self.risk_count = 0

        for key in self.db.keys():
            self.risk[pickle.loads(key)] = pickle.loads(self.db[key])

        self.risk_count = len(self.risk)
        self.db.close()

        
    def find(self, ip_string):
        """ 
        risk[cidr] = [company, location, risk_score]
        creating one if needed and addingit to the database.
        None if could not be found.
        """
        ip = ipaddress.ip_address(ip_string)
        
        # Find the address to insert
        result = self.isin(ip)
        if result:
            return result
        else:
            result = self.risk(ip)
            self.add(result)
            return self.risk
    

    def add(self, v):
        self.risk[v[0]] = [ v[1], v[2], v[3] ]
        self.count += 1

        
    def len(self):
        return self.count


    def isin(self, ip):
        # risk.in(ip) is True when ip's network is stored
        # type(ip) is ipaddress.IPv4Address
        sz = len(self.risk)
        if sz == 0:
            return False
        s = 0
        e = sz
        while True:
            m = (s + e)//2
            cidr = self.risk.peekitem(m)[0]
            if target in cidr:
                return True
            if target > cidr[-1]:
                s = m + 1
            else:
                e = m - 1  
        return False

In [15]:
url       = "http://whois.arin.net/rest/ip/" + '100.12.31.4' + "/pft"
html_text = ""
html_text = requests.get(url).text

# Parse html into a hierarchy using BeautifulSoup 
soup = BeautifulSoup(html_text, 'lxml')