In [3]:
import dhelp

test_url = 'http://www.aschart.kcl.ac.uk/charters/s0001.html'


class BaseASCScraper:
    """Parent class for all ASC database scraper classes"""
    data = None

    def __init__(self, url):
        self._url = url
        with dhelp.WebPage(self._url) as page_soup:
            # get only portion of page with charter-specific content
            self.data = page_soup.body.div.table.find('tr', class_='r02').find('td', id='content').div
            
    def __str__(self):
        return self.data
    
    def __repr(self):
        return self.__str__()

In [10]:
import dhelp


class ASCCharter(BaseASCScraper):
    """Extracts data from a single charter from the ASC database"""
    
    def __init__(self, url):
        super().__init__(url=url)
        # eager load navbar to speed up link retreival
        self._navbar = self.data.find('ul', class_='charter-nav')
            
    @property
    def id(self):
        return self.data.find('td', id='content').div.div.div.h1.get_text()
    
    @property
    def link_previous(self):
        """url to previous charter, if extant, otherwise None"""
        return self._get_navbar_link('charter-prev')
        
    @property
    def link_next(self):
        """url to next charter, if extant, otherwise None"""
        return self._get_navbar_link('charter-next')
        
    @property
    def link_source(self):
        """url to source in PASE database, if extant, otherwise None"""
        return self._get_navbar_link('charter-pase-source')
        
    @property
    def link_witnesses(self):
        """url to list of witnesses in PASE database, if extant, otherwise None"""
        return self._get_navbar_link('charter-witnesses')
    
    @property
    def description(self):
        """text description of charter, contains idiosyncratic dating and location information"""
        return self.data.p.get_text()
    
    @property
    def text(self):
        # grab text and convert from latin-1 to utf-8 encoding
        raw_text = self.data.find_all('div')[3].get_text()
        clean_text = bytearray(raw_text, 'latin-1').decode('utf-8')
        # remove text of embedded editorial marks
        remove_phrases = [
            'DATING CLAUSE', 'INVOCATION', 'PROMULGATION PLACE', 'CURSE',
            'DISPOSITIVE WORD', 'BOUNDS', 'PROEM',
        ]
        for remove_phrase in remove_phrases:
            clean_text = clean_text.replace(remove_phrase, '')
        # removes extra whitespace by spliting into list of words and rejoining
        return dhelp.LatinText(clean_text).rm_spaces().stringify()
        

charter_page = ASCCharter(test_url)
print(charter_page.description)
print(charter_page.text)

Fetching http://www.aschart.kcl.ac.uk/charters/s0001.html
Successfully scraped http://www.aschart.kcl.ac.uk/charters/s0001.html

               Archive: Rochester
            A.D. 604 (28 April). Ãthelberht, king, to St Andrew and his church at Rochester; grant of land at Rochester. Latin with English bounds. 
 
Archive: Rochester
A.D. 604 (28 April). Ãthelberht, king, to St Andrew and his church at Rochester; grant of land at Rochester. Latin with English bounds. 
 
Archive: Rochester

Regnante in perpetuum domino nostro Iesu Christo saluatore . mense Aprilio . sub die iiii . kalendas Maias . indictione vii . ego Æthelberhtus rex filio meo Eadbaldo admonitionem catholice fidei optabilem . Nobis est aptum semper inquirere . qualiter per loca sanctorum pro anime remedio uel stabilitate salutis nostre aliquid de portione terre nostre in subsidiis seruorum dei deuotissimam uoluntatem debeamus offerre . Ideoque tibi Sancte Andrea tueque ecclesiae que est constituta in ciuitate Hrofibreui

In [9]:
class ASCDatabase(BaseASCScraper):
    """This is the object which should be instantiated to begin mining the Anglo-Saxon Charters database."""
    root_url = 'http://www.aschart.kcl.ac.uk/'
    
    def __init__(self, url):
        # if absolute link sent, use that, otherwise add relative link to root
        if self.root_url in url:
            super().__init__(url=url)
        else:
            super().__init__(url=self.root_url + url)
    
    @property
    def charter_links(self):
        links = []
        # looping through each section and group of rulers
        for ruler_section in self.data.find_all('ul', class_='asc-expand'):
            for ruler_group in ruler_section.find_all('li'):
                # get relative links from <a> tags append full link to self.data by adding root_url
                for charter_link_wrapper in ruler_group.find_all('li'):
                    links.append(self.root_url + charter_link_wrapper.a['href'])
        return links


asc_database = ASCDatabase('idc/idx_sawyerNo.html')

for charter_link in asc_database.charter_links:
    print(charter_link)

Fetching http://www.aschart.kcl.ac.uk/idc/idx_sawyerNo.html
Successfully scraped http://www.aschart.kcl.ac.uk/idc/idx_sawyerNo.html
http://www.aschart.kcl.ac.uk//charters/s0001.html
http://www.aschart.kcl.ac.uk//charters/s0002.html
http://www.aschart.kcl.ac.uk//charters/s0003.html
http://www.aschart.kcl.ac.uk//charters/s0004.html
http://www.aschart.kcl.ac.uk//charters/s0005.html
http://www.aschart.kcl.ac.uk//charters/s0006.html
http://www.aschart.kcl.ac.uk//charters/s0007.html
http://www.aschart.kcl.ac.uk//charters/s0008.html
http://www.aschart.kcl.ac.uk//charters/s0009.html
http://www.aschart.kcl.ac.uk//charters/s0010.html
http://www.aschart.kcl.ac.uk//charters/s0011.html
http://www.aschart.kcl.ac.uk//charters/s0012.html
http://www.aschart.kcl.ac.uk//charters/s0013.html
http://www.aschart.kcl.ac.uk//charters/s0014.html
http://www.aschart.kcl.ac.uk//charters/s0015.html
http://www.aschart.kcl.ac.uk//charters/s0016.html
http://www.aschart.kcl.ac.uk//charters/s0017.html
http://www.aschart

In [12]:
class PASEWitnesses:
    
    def __init__(self, url):
        self._url = url
        with dhelp.WebPage(self._url) as page_soup:
            # get only portion of page with relevant data
            self.data = page_soup.find('div', class_='rec').find('ul').find_all('li')
            
    def __str__(self):
        return self.data
    
    def __repr__(self):
        return self.__str__()
    
    @property
    def witnesses(self):
        for each witness_entry in self.data:
            agent = witness_entry.find('strong').get_text()
    
pase_witnesses = PASEWitnesses('http://www.pase.ac.uk/jsp/ASC/factoid.jsp?factoidKey=24083')
print(pase_witnesses.data)

Fetching http://www.pase.ac.uk/jsp/ASC/factoid.jsp?factoidKey=24083
Successfully scraped http://www.pase.ac.uk/jsp/ASC/factoid.jsp?factoidKey=24083
[<li><strong>Agent</strong>: 
   <a href="../DisplayPerson.jsp?personKey=1109"> Æthelberht 3</a>
(<em>First Christian king of Kent, 560 or c.585-616</em>) </li>, <li><strong>Recipient</strong>: 
   <a href="../DisplayPerson.jsp?personKey=5338"> Canterbury, St Peter's 1</a>
(<em></em>) </li>, <li><strong>Participant</strong>: 
   <a href="../DisplayPerson.jsp?personKey=13620"> Augustine 1</a>
(<em>Archbishop of Canterbury, 597-604x609; apostle of the English</em>) : he gave his consent</li>, <li><strong>Participant</strong>: 
   <a href="../DisplayPerson.jsp?personKey=2802"> Anonymi 1054</a>
(<em>Principes of <cpers <="" em="" id="891">) : <a href="../DisplayPerson.jsp?personKey=1109">Æthelberht 3</a>'s leading men gave their consent</cpers></em></li>, <li><strong>Charter Witness</strong>: 
 Royalty (1) <a href="../DisplayPerson.jsp?personKe