Class to accumulate whois records and risk factors.
They will memoized to reduce requested services for more data.

1. Use a database to store the results for use in various programs.
2. Use an ipnetwork as key
3. Store company name with key
4. When a class instance is created, it will load db into dictionary.
5. The instance is normally readonly, but can be writeable. That means new addresses
will be added if they are not in the database and become permanent parts.



In [2]:
import ipaddress
import dbm
import pickle
import sys
import pandas as pd
import numpy as np

import pprint
pp = pprint.PrettyPrinter()


In [24]:
class Debug():
    """ Print debug messages if active """
    def __init__(self, set=1):
        self._set = set
    def prt(self, str):
        if self._set:
            sys.stderr.write(str)
    def set(self):
        self._set = 1
    def unset(self):
        self._set = 0
debug = Debug()

In [25]:
debug.prt("Hello")

Hello

In [4]:
import requests
from bs4 import BeautifulSoup
import json
import os

def get_risk(ip_string):
    # Return risk factors from scamalytics into a dict
    #     {"ip": ? , 
    #      "score": ?, 
    #      "risk": ?, 
    #      "risk_comment: ?"}

    # Fetch the complete record from scamalytics restful api
    # ip_string ... make request by ip address as a string

    html_text = ""
    url = "https://scamalytics.com/ip/" + ip_string
    html_text = requests.get(url).text

    soup = BeautifulSoup(html_text, 'lxml')
    
    # Tag=pre
    result = json.loads(soup.pre.string)
    
    # The comment is in the body of an unlabelled div. Used the css class to find.
    # Remove special UTF-8 character \U200b, a zero width space.
    result["risk_comment"] = soup.find_all("div", class_="panel_body")[0].get_text().replace("\u200b","")
    
    return result

def get_arin(ip_string):
    '''Return dict for the net_address that contains this ip_string
        {"cidr": ?,
         {"organization": ? ,
          "handle": ? ,
          "asn": ?,
          "city": ? ,
          "address" : ? ,
          "postalcode": ? ,
          "countrycode": ? ,
          "state": ? ,
          "country": ? ,
         }
    '''
    
    fillna = lambda x: "" if not x else x.string if not isinstance(x, str) else x
    
    # Fetch the complete record from arin restful api
    # Ref: https://www.arin.net/resources/registry/whois/rws/api/#networks-and-asns
    # ip ... make request by ip address as a string
    # pft .. get full record
    
    url       = "http://whois.arin.net/rest/ip/" + ip_string + "/pft"
    html_text = ""
    try:
        html_text = requests.get(url).text
    except:
        return None

    # Parse html into a hierarchy using BeautifulSoup 
    soup = BeautifulSoup(html_text, 'lxml')
    
    # Parse into dict to return results, item by item
    result = {}
    
    # ARIN reports a list of CIDR net_addresses. 
    # The database will be indexed by ipaddress.net_address.
    # A record will be written for each cidr and duplicate the ARIN info
    # Obtain the organization name from tag=net instead of the tag=org which
    # has more than one tag=name making it harder to isolate.
    try:
        info = {}
        info["organization"] = fillna(soup.net.orgref["name"])
        info["handle"]       = fillna(soup.net.orgref["handle"])
        info["asn"]          = fillna(soup.net.originas)

        # Obtain rest of the info from tag=org
        info["city"]         = fillna(soup.org.city)

        # More than one address line may be recorded
        address = []
        for line in soup.org.streetaddress:
            address.append(line.string)
        info["address"]      = address

        info["postalcode"]   = fillna(soup.org.postalcode)

        # The iso3166 tags are the internation country codes
        # Ref: https://www.iso.org/glossary-for-iso-3166.html
        # The tags contain "-", illegal characters in a python var name. 
        # Use find_all to locates the tags with a string search.
        for t in soup.org.find_all("iso3166-2"):
            info["state"]    = fillna(t)
        for t in soup.org.find_all("iso3166-1"):
            info["country"]  = fillna(t.find('name'))
            countrycode = fillna(t.code2)

        # Add the risk obtained from scamalytics
        info.update(get_risk(ip_string))

        # The netblocks scope contains a list of netblock sections
        for netblock in soup.net.netblocks:
            cidr = netblock.startaddress.string + "/" + netblock.cidrlength.string      
            result[cidr] = info
    except:
        return None

    return result

# pp.pprint(get_arin("100.12.31.4"))

In [5]:
import ipaddress
import dbm
import pickle
from sortedcontainers import SortedDict

class Risk():
    
    def __init__(self, filename, readonly=True):

        # Open database. Create as needed.
        
        self.readonly = readonly
        self.open_option = f'{"r" if self.readonly else "w"}'
        self.db_filename = filename
         
        try:
            self.db = dbm.open(self.db_filename, self.open_option)
        except:
            if self.readonly:
                print(f"{self.db_filename} does not exist but will not be created when class is {readonly=}")
                return None
            else:
                self.db = dbm.open(self.db_filename, "c")
                
        # Read the data into dictionary:
        #   risk[ipaddress.ipv4network] = [organization, country, risk]
        #   
        
        self.risk = SortedDict()
        self.risk_count = 0

        for key in self.db.keys():
            self.risk[pickle.loads(key)] = pickle.loads(self.db[key])

        self.risk_count = len(self.risk)
        self.db.close()

        
    def find(self, ip_string):
        """ 
        risk[cidr] = [company, location, risk_score]
        creating one if needed and adding it to the database.
        """
        try:
            self.ip = ipaddress.ip_address(ip_string)
        except:
            print(f"Could not find IPv4Address for {ip_string}")
            return None
        
        # Find the address to insert
        if self.cidr_search(self.ip):
            return self.cidr_search_result
        else:
            self.result = get_arin(ip_string)
            self.add(self.result)
            return self.result
    

    def add(self, new_risks):
        # Add the result of get_arin, a dict with cidr as key 
        # to both the Risk.risk dict and the database
        
        # Store in dictionary first.
        # There may be more than one cidr retrieved by get_arin
        # Each CIDR has to be type ip_network

        for new_cidr, new_risk in new_risks.items():
            netblock = ipaddress.ip_network(new_cidr)
            self.risk[netblock] = new_risk
        
        # Store in database next
        if not self.readonly:
            
            with dbm.open(self.db_filename, self.open_option) as self.db:
                for new_cidr, new_risk in new_risks.items():
                    netblock = ipaddress.ip_network(new_cidr)
                    # key and value have to be pickle'd before storing
                    self.db[pickle.dumps(netblock)] = pickle.dumps(new_risk)

        return
    

    def len(self):
        return self.risk_count


    def cidr_search(self, target_ip):
        # risk.cidr_search(target_ip) is True when ip's network is in db
        # type(target_ip) is ipaddress.IPv4Address
        # Updates cidr_search_result property with risk[cidr of target_ip] else None
        sz = len(self.risk)
        if sz == 0:
            self.bisearch_result = None
            return False
        s = 0
        e = sz
        while s > e:
            m = (s + e)//2
            cidr = self.risk.peekitem(m)[0]
            if target_ip in cidr:
                self.cidr_search_result = self.risk[cidr]
                return True
            if target_ip > cidr:
                s = m + 1
            else:
                e = m - 1
        self.cidr_search_result = None
        return False

In [6]:
myrisk = Risk("mywhois", readonly=False)

In [28]:
q = myrisk.find("142.255.122.114")

In [7]:
def api_arin(ip_string):
    url       = "http://whois.arin.net/rest/ip/" + ip_string + "/pft"
    html_text = ""
    try:
        html_text = requests.get(url).text
    except:
        return None
    # Parse html into a hierarchy using BeautifulSoup 
    soup = BeautifulSoup(html_text, 'lxml')
    return soup

In [8]:
soup = api_arin("100.16.0.1")

In [25]:
# Routine to read a clean set ip addresses from the sample data
# and icorporate them into the resk database

db_filename = "mywhois"
sample_filename = "clean_test_data.csv"

# Open the database and load the current data
risk = Risk(db_filename, readonly=False)

# Read the clean set of sample data set
clean_ip = pd.read_csv(sample_filename)
                       
# range over the unique ip addresses
new = 0
old = 0
for n, ip in enumerate(clean_ip.ip.drop_duplicates()[:10]):
    before = risk.len()
    risk.find(ip)
    after  = risk.len()
    if before == after:
        old += 1
    else:
        new += 1
print(f"{n=} {new=} {old=} {risk.len()=}")

    



RecursionError: maximum recursion depth exceeded while calling a Python object

In [9]:
risk=Risk("mywhois")

In [23]:
for ip in clean_ip.ip.drop_duplicates()[:10]:
    print(ip)

142.255.122.114
172.58.230.193
72.68.212.63
172.100.125.174
24.228.215.103
68.199.195.147
108.29.95.66
32.208.115.88
24.189.68.18
172.58.235.43


In [30]:
risk.len()


5

In [31]:
pp.pprint(risk.risk)

{IPv4Network('52.224.0.0/11'): {'address': ['One Microsoft Way'],
                                'asn': '',
                                'city': 'Redmond',
                                'country': 'United States',
                                'handle': 'MSFT',
                                'ip': '52.224.1.1',
                                'organization': 'Microsoft Corporation',
                                'postalcode': '98052',
                                'risk': 'low',
                                'risk_comment': 'IP address 52.224.1.1 is '
                                                'operated by Microsoft '
                                                'Corporation whose web traffic '
                                                'we consider to present a '
                                                'potentially low fraud risk. '
                                                'Non-web traffic may present a '
                                     

In [37]:
for k in risk.risk:
    print(k)

52.224.0.0/11
72.65.128.0/17
72.66.0.0/15
72.68.0.0/15
72.70.0.0/16
72.71.0.0/17
72.71.128.0/18
100.0.0.0/12
100.16.0.0/14
142.255.0.0/17
172.32.0.0/11
