# In [None]:
"""
Searches for inscriptions containing 'καστρων' and outputs them as a .csv file. Intended for
research on Julia Domna's title of 'μήτηρ κάστρων' (Mater Castrorum).

David J. Thomas, Instructor of History, University of South Florida
For Julie Langford, Professor of History, University of South Florida

NOT INTENDED FOR PUBLIC DISTRIBUTION
"""

"""===Imports==="""
import csv
import time
from collections import UserString
import requests
from bs4 import BeautifulSoup

"""===Constants==="""

# PHI (epigraphy.packhum.org) url containing the search results. The `patt` query parameter is
# the percent-encoded UTF-8 form of the Greek search pattern 'καστρων'.
SEARCH_URL = 'https://epigraphy.packhum.org/search?patt=%CE%BA%CE%B1%CF%83%CF%84%CF%81%CF%89%CE%BD'

"""===Object Definitions==="""


class BaseWebPage(UserString):
    """Parent class for all webpage scraping objects.

    The instance itself behaves like the url string (the url is stored in ``.data``) and
    provides methods for requesting raw HTML from the page and converting it to a
    BeautifulSoup object.

    Options (class-level defaults, overridable per instance via the ``options`` argument):
        retries: retry limit used by get_html(). -1 means retry forever, 0 means never retry,
            a positive integer allows that many retries.
        silent: if True, the progress message printed each time a request is launched is
            suppressed.
        delay: number of seconds to pause before every web request (falsy disables the pause).
        timeout: number of seconds to wait for the server before the request counts as failed
            (None disables the timeout).
    """
    html = None
    soup = None
    # class-level defaults only; every instance gets its own merged copy in __init__
    options = {'retries': -1, 'silent': False, 'delay': 2, 'timeout': 30}

    def __init__(self, url, options=None):
        """Stores the url and merges any per-instance options over the class defaults.

        Args:
            url: address of the page to scrape.
            options: optional dict overriding any of the default options.
        """
        # hand the url itself (not the str type) to UserString, which stores it in self.data
        super().__init__(url)
        # copy the defaults so overrides on this instance never leak into other pages
        merged_options = dict(type(self).options)
        if options:
            merged_options.update(options)
        self.options = merged_options

    def get_response(self, retries=-1):
        """Fires request and returns raw response object, retrying on connection problems.

        Args:
            retries: -1 retries forever, 0 never retries, a positive integer is the number
                of retries allowed after the first failed attempt.

        Returns:
            The requests response object, or None if the retry limit was reached.
        """
        attempts_left = retries
        # loop instead of recursing so that unlimited retries cannot exhaust the call stack
        while True:
            if not self.options['silent']:
                print('Getting data from {}'.format(self.data))
            # if delay is specified, pause (default is 2 seconds)
            if self.options['delay']:
                time.sleep(self.options['delay'])
            try:
                return requests.get(self.data, timeout=self.options.get('timeout'))
            # only network/HTTP-level failures are retried; KeyboardInterrupt and programming
            # errors propagate so that the script can still be stopped
            except requests.exceptions.RequestException:
                print('Problem connecting to {}'.format(self.data))
            if attempts_left == -1:
                # no retry limit specified, keep trying
                print('Retrying...')
            elif attempts_left > 0:
                # retry limit specified but not reached yet, use up one attempt
                print('Retrying... {} attempts left'.format(attempts_left))
                attempts_left -= 1
            else:
                # retry limit is reached, abort
                print('Retry limit has been reached, aborting')
                return None

    def get_html(self):
        """Invokes web request, extracts html string, both stores in .html and returns it.

        Uses the 'retries' option as retry limit. Returns None if the request was aborted.
        """
        response = self.get_response(retries=self.options['retries'])
        # compare against None explicitly: a response object is falsy for HTTP error statuses
        self.html = response.text if response is not None else None
        return self.html

    def get_soup(self):
        """Parses html from url into BeautifulSoup object. Stores in .soup and returns it.

        Returns None if no html could be retrieved.
        """
        # if web request has not already been done, do it now
        if not self.html:
            self.get_html()
        # if there is still not html (because of request error), abort
        if not self.html:
            print('Problem souping page, aborting...')
            return None
        # parse into BeautifulSoup object, store in self.soup and return
        self.soup = BeautifulSoup(self.html, 'html.parser')
        return self.soup

    
class InscriptionPage(BaseWebPage):
    """Extracts information from a page for a specific inscription.

    The get_* methods read from ``self.soup``, so the page has to be souped first (extract()
    and record() take care of that). Results of extract() are stored in the reference,
    region, info, crossrefs and text properties.
    """
    reference = None
    region = None
    info = None
    crossrefs = None
    text = None

    def get_reference(self):
        """Gets reference number of inscription"""
        return self.soup.find('span', class_='fullref').get_text()

    def get_region(self):
        """Gets general region of inscription"""
        region_links = self.soup.find('div', class_='hdr1').find_all('a', class_='link reglink')
        # return last item in list, which has specific region
        return region_links[-1].span.get_text()

    def get_info(self):
        """Gets inscription information, usually contains dating info or location information.

        Every piece of information is followed by a period. Returns a blank string if the page
        has no info section.
        """
        try:
            info_lines = self.soup.find('div', class_='tildeinfo light').find_all('span', class_='ti')
        # find() gives None when the section is missing, so the chained call raises
        except AttributeError:
            return ''
        # follow every item with a period and whitespace, then drop the trailing whitespace
        return ''.join(info_line.get_text() + '. ' for info_line in info_lines).rstrip()

    def get_crossrefs(self, as_list=False):
        """Gets crossreferences, defaults to string result unless as_list specified.

        Args:
            as_list: if True, the matching elements are returned as a list without any
                further processing.

        Returns:
            A string in which every cross reference is followed by a period, or a list if
            as_list is specified. Blank string / empty list if the page has no info section.
        """
        try:
            crossrefs = self.soup.find('div', class_='tildeinfo light').find_all('div', class_='xrefs')
        # find() gives None when the section is missing, so the chained call raises
        except AttributeError:
            return [] if as_list else ''
        # if as_list, just return crossrefs, no further work needed
        if as_list:
            return crossrefs
        # follow every item with a period and whitespace, then drop the trailing whitespace
        return ''.join(cross_ref.get_text() + '. ' for cross_ref in crossrefs).rstrip()

    def get_text(self, as_list=False, simplify_text=True):
        """Gets inscription text, if as_list specified, returns as list of lines.

        Args:
            as_list: if True, returns the raw text of every table row as a list.
            simplify_text: if True, endlines, digits (line numbers) and surrounding whitespace
                are removed from every line, and lines left empty by that are skipped.

        Returns:
            The lines joined by single spaces into one string. A line ending in a hyphen is
            joined directly to the next line, with the hyphen removed.
        """
        table_rows = self.soup.find('table', class_='grk').find_all('tr')
        text_lines = [table_row.get_text() for table_row in table_rows]
        # stop here if as_list specified, no further work needed
        if as_list:
            return text_lines
        pieces = []
        for text_line in text_lines:
            # clean up text if simplify_text is flagged
            if simplify_text:
                # remove endlines
                text_line = text_line.replace('\n', '')
                # remove numbers
                text_line = ''.join(letter for letter in text_line if not letter.isdigit())
                # remove leading & trailing whitespaces
                text_line = text_line.strip()
                # a row holding nothing but a line number would only add a stray space
                if not text_line:
                    continue
            if text_line.endswith('-'):
                # hyphenated line end: trim the hyphen and add no whitespace, so that the
                # word continues with the next line
                pieces.append(text_line[:-1])
            else:
                pieces.append(text_line + ' ')
        # remove any trailing whitespace and return
        return ''.join(pieces).rstrip()

    def extract(self):
        """Runs all getter methods and stores results in respective properties"""
        # soup page, if not done already
        if not self.soup:
            self.get_soup()
        # if there is still no soup (because of request error), abort
        if not self.soup:
            print('Problem getting results, aborting...')
            return None
        self.reference = self.get_reference()
        self.region = self.get_region()
        self.info = self.get_info()
        self.crossrefs = self.get_crossrefs()
        self.text = self.get_text()

    def record(self):
        """Runs extract and returns data as a dictionary.

        The keys are 'Reference', 'Region', 'Info', 'Cross References' and 'Text'. All values
        are None if the page could not be retrieved.
        """
        self.extract()
        if not self.options['silent']:
            print('Extracting data from {}'.format(self.data))
        return {
            'Reference': self.reference,
            'Region': self.region,
            'Info': self.info,
            'Cross References': self.crossrefs,
            'Text': self.text
        }

            
class SearchResultsPage(BaseWebPage):
    """Extracts information from a search results page."""

    def results(self):
        """Returns InscriptionPage objects for each matching item.

        Returns None if the search results page could not be retrieved.
        """
        # if web page not already souped, do so now
        if not self.soup:
            self.get_soup()
        # if there is still no soup (because of request error), abort
        if not self.soup:
            print('Problem getting results, aborting...')
            return None
        matching_items = []
        # loop through each result extract link, append InscriptionPage instance to list
        for match in self.soup.find_all('div', class_='matches'):
            href = str(match.find('div', class_='sentr').ul.li.a['href'])
            # strip leading slashes so a root-relative link does not produce '//' in the url
            match_url = 'https://epigraphy.packhum.org/' + href.lstrip('/')
            matching_items.append(InscriptionPage(match_url, options=self.options))
        return matching_items

    def records(self):
        """Runs .results() and produces list of dictionaries with extracted data for each matching record.

        Returns an empty list if the search results page could not be retrieved.
        """
        results = self.results()
        # results() gives None on request failure; there is nothing to iterate then
        if results is None:
            return []
        return [result.record() for result in results]

    def extract_to_csv(self, filename='results.csv'):
        """Extracts all info from results and writes to a csv. Only method needed to do everything.

        Args:
            filename: path of the csv file to write; an existing file is overwritten.

        Returns:
            True once the file has been written.
        """
        print('Starting search result extraction process...')
        data_records = self.records()
        print('Writing to {}'.format(filename))
        # define column header names
        col_names = ['Reference', 'Region', 'Info', 'Cross References', 'Text']
        # newline='' lets the csv module control line endings (avoids blank rows on Windows);
        # explicit utf-8 so Greek text is written regardless of the platform default encoding
        with open(filename, mode='w', newline='', encoding='utf-8') as csv_file:
            csv_writer = csv.DictWriter(csv_file, fieldnames=col_names)
            csv_writer.writeheader()
            # write one row per result record
            csv_writer.writerows(data_records)
        print('Finished writing!')
        return True


"""===Main Script==="""
# Runs at module level. Creating the object only stores the url; no web request is made until
# the results are first needed.
search_results = SearchResultsPage(SEARCH_URL)
# Fetch the search results page, visit the page of every matching inscription (pausing before
# each request according to the 'delay' option) and write one row per inscription to
# 'Mater Castrorum.csv', a path relative to the current working directory.
search_results.extract_to_csv(filename='Mater Castrorum.csv')