In [57]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Oct 15 19:31:48 2017
@author: sjhuskey
"""

''' This script:
        * reads in the data at http://latin.packhum.org/canon (the PHI canon).
        * scrapes data for each work.
        * writes results to a CSV file
    See inline comments for information about how it works.
    
    Parts of this program owe much to Lisa Tagliaferri's tutorial at https://www.digitalocean.com/community/tutorials/how-to-scrape-web-pages-with-beautiful-soup-and-python-3.
    Tyler Pearson, Director of Informatics for OU Libraries, provided advice and support.
'''
    
import codecs
import csv
import warnings

import pandas as pd  # used by the analysis cells further down
import requests
from bs4 import BeautifulSoup

# Scrape the PHI canon page and write one CSV row per work to data/phi.csv.
#
# The page is fetched and parsed BEFORE the output file is opened: mode 'w'
# truncates the file, so a failed request must not wipe out the results of a
# previous run.
page = requests.get('http://latin.packhum.org/canon', timeout=30)
# Stop on HTTP errors (4xx/5xx) instead of parsing an error page.
page.raise_for_status()
soup = BeautifulSoup(page.text, 'lxml')

# Limit the html to just the main content of the page.
item_list = soup.find('div', class_="canon")
if item_list is None:
    raise ValueError('No <div class="canon"> found; the page layout may have changed.')

# Since author name data is in a different level of the tree from the works,
# create a dictionary with author number as key and author name as value.
# Since the works also include the author number, the dictionary will be
# a lookup table for when the data is assembled into rows for the CSV below.

# Select the author numbers, being sure to select from the author level, not the work level.
anums1 = item_list.select('a[class="alnk"] > span[class="anum"]')
# Use .text to keep just the number from each span.
number_list = [anum.text for anum in anums1]

# Select the spans with class "anam"; the name is the text of the span nested inside.
anams = item_list.find_all('span', {'class': 'anam'})
author_list = [anam.span.text for anam in anams]

# Make the dictionary with number as key and name as value for each author.
author_dict = dict(zip(number_list, author_list))

# Now, select the data for the works.

# Author numbers, which will be used for the lookup below.
anums2 = item_list.select('a[class="wlnk"] > span[class="anum"]')
# Work names.
wnams = item_list.select('span[class="wnam"] > span')
# Work numbers.
wnums = item_list.select('span[class="wnum"]')
# Bibliography for the reference edition.
bib_list = item_list.find_all('span', {'class': 'bib'})
# Relative link to the work.
link_list = item_list.select('li[class="work"] > a')

# Author number of every work, in page order.
lookup_keys = [anumber.text for anumber in anums2]

# Author name of every work, in the same order as lookup_keys. A number that is
# missing from author_dict gets an empty-string placeholder so this list keeps
# the same length as the other columns; dropping the entry instead would shift
# every later author name onto the wrong work when the columns are zipped.
lookup_values = [author_dict.get(key, '') for key in lookup_keys]

# zip() stops at the shortest input, so columns of different lengths would
# silently lose rows and may pair values from different works. Make that visible.
column_sizes = {
    'authors': len(lookup_values),
    'author numbers': len(anums2),
    'titles': len(wnams),
    'work numbers': len(wnums),
    'reference editions': len(bib_list),
    'links': len(link_list),
}
if len(set(column_sizes.values())) > 1:
    warnings.warn(
        'Scraped columns differ in length; rows beyond the shortest column are '
        'dropped and remaining rows may be misaligned: {}'.format(column_sizes)
    )

# These two values are the same for every row.
publisher = "PHI"
rights = "Fair Use"

# Create the CSV file. Mode 'w' overwrites any existing file, so every run
# rewrites the data from scratch. newline='' leaves the csv module in charge
# of line endings.
with open('data/phi.csv', 'w', encoding='utf8', newline='') as f:
    w = csv.writer(f)

    # Write the header for the CSV.
    w.writerow(['Author','PHI Author Number','Title','PHI Work Number','Combined PHI Number','Reference Edition','URL','Publisher','Rights'])

    # Put all of the data together, one row per work.
    for author, anum, wnam, wnum, bib, link in zip(lookup_values, anums2, wnams, wnums, bib_list, link_list):
        anums = anum.contents[0]
        works = wnam.text
        work_numbers = wnum.contents[0]
        # Combined identifier: author number, a dot, then work number.
        phi_number = anums + '.' + work_numbers
        bibs = bib.text
        # The page gives relative links; prefix the site root.
        links = 'http://latin.packhum.org' + link['href']

        # Write results to the CSV file.
        w.writerow([author,anums,works,work_numbers,phi_number,bibs,links,publisher,rights])

In [60]:
# Load the scraped catalogue into a DataFrame (comma is read_csv's default separator).
# NOTE(review): the recorded output has an "Unnamed: 0" column and shows the PHI
# number columns as numbers (e.g. 2000.001), so the file read here may not be
# exactly what the scraper cell writes — confirm.
author_df = pd.read_csv('data/phi.csv')

In [61]:
# Preview the first rows only; a bounded slice keeps the notebook output small.
author_df.head(10)

Unnamed: 0,Author,PHI Author Number,Title,PHI Work Number,Combined PHI Number,Reference Edition,URL,Publisher,Rights
0,Ablabius,2000,epigramma,1,2000.001,Fragmenta Poetarum Latinorum Epicorum et Lyric...,http://latin.packhum.org/loc/2000/1/0,PHI,Fair Use
1,Lucius Accius,400,carmina,1,400.001,Fragmenta Poetarum Latinorum Epicorum et Lyric...,http://latin.packhum.org/loc/400/1/0,PHI,Fair Use
2,Lucius Accius,400,praetextae,2,400.002,"Scaenicae Romanorum Poesis Fragmenta. Vol. 1, ...",http://latin.packhum.org/loc/400/2/0,PHI,Fair Use
3,Lucius Accius,400,tragoediae,3,400.003,"Scaenicae Romanorum Poesis Fragmenta. Vol. 1, ...",http://latin.packhum.org/loc/400/3/0,PHI,Fair Use
4,Valerius Aedituus,402,epigrammata,1,402.001,Fragmenta Poetarum Latinorum Epicorum et Lyric...,http://latin.packhum.org/loc/402/1/0,PHI,Fair Use
5,Aemilius Sura,2300,De Annis Populi Romani,1,2300.001,"Historicorum Romanorum Reliquiae, Vol. 2, ed. ...",http://latin.packhum.org/loc/2300/1/0,PHI,Fair Use
6,Lucius Afranius,404,togatae,1,404.001,"Scaenicae Romanorum Poesis Fragmenta. Vol. 2, ...",http://latin.packhum.org/loc/404/1/0,PHI,Fair Use
7,Iulius Africanus,902,oratio,1,902.001,M. Fabi Quintiliani Institutionis Oratoriae Li...,http://latin.packhum.org/loc/902/1/0,PHI,Fair Use
8,Gnaeus Domitius Ahenobarbus,301,oratio,1,301.001,Oratorum Romanorum Fragmenta Liberae Rei Publi...,http://latin.packhum.org/loc/301/1/0,PHI,Fair Use
9,"Albinus, poet.",2002,Rerum Romanarum Liber I,1,2002.001,Fragmenta Poetarum Latinorum Epicorum et Lyric...,http://latin.packhum.org/loc/2002/1/0,PHI,Fair Use


In [75]:
# Number of rows (works) per author; value_counts sorts the most frequent first.
author_df.loc[:, 'Author'].value_counts()

Marcus Tullius Cicero                75
Anonymi Epici et Lyrici              41
Scriptores Historiae Augustae        30
Titus Maccius Plautus                22
Didascaliae et Argum. in Plautum     19
Lucius Annaeus Seneca iunior         18
Marcus Terentius Varro               17
Marcus Cornelius Fronto              17
Publius Ovidius Naso                 15
Appendix Vergiliana                  14
Marcus Porcius Cato                  12
Apuleius Madaurensis                 10
Gaius Iulius Caesar                   8
Fragmenta Bobiensia                   8
Gaius Sallustius Crispus              8
C. Iul. Caes. Augustus Octavianus     8
Gaius Caesius Bassus                  7
Maurus Servius Honoratus              7
Gnaeus Naevius                        7
Quintus Ennius                        7
Pomponius Porphyrio                   7
Quintus Horatius Flaccus              6
Publius Terentius Afer                6
Gaius Suetonius Tranquillus           6
Sextus Iulius Frontinus               6


In [77]:
# Rows whose Author value is identical to the Title value
# (in the recorded output these are entries such as "Carmen Arvale").
author_df.loc[author_df['Author'].eq(author_df['Title'])]

Unnamed: 0,Author,PHI Author Number,Title,PHI Work Number,Combined PHI Number,Reference Edition,URL,Publisher,Rights
12,Altercatio Hadr. et Epicteti,1500,Altercatio Hadr. et Epicteti,1,1500.001,Altercatio Hadriani Augusti et Epicteti Philos...,http://latin.packhum.org/loc/1500/1/0,PHI,Fair Use
137,Bucolica Einsidlensia,821,Bucolica Einsidlensia,1,821.001,"Calpurnii et Nemesiani Bucolica, Accedunt Eins...",http://latin.packhum.org/loc/821/1/0,PHI,Fair Use
167,Carmen Arvale,149,Carmen Arvale,1,149.001,Anthologia Latina sive Poesis Latinae Suppleme...,http://latin.packhum.org/loc/149/1/0,PHI,Fair Use
168,Carmen Devotionis,306,Carmen Devotionis,1,306.001,"Ambrosii Theodosii Macrobii Saturnalia, ed. J....",http://latin.packhum.org/loc/306/1/0,PHI,Fair Use
169,Carmen Evocationis,309,Carmen Evocationis,1,309.001,"Ambrosii Theodosii Macrobii Saturnalia, ed. J....",http://latin.packhum.org/loc/309/1/0,PHI,Fair Use
170,Carmen de Bello Aegyptiaco,706,Carmen de Bello Aegyptiaco,1,706.001,"C. Rabirius, Bellum Actiacum e Papiro Herculan...",http://latin.packhum.org/loc/706/1/0,PHI,Fair Use
284,Commentarii Augurum,492,Commentarii Augurum,1,492.001,M. Terenti Varronis De Linguae Latinae Quae Su...,http://latin.packhum.org/loc/492/1/0,PHI,Fair Use
285,Commentarii Consulares,494,Commentarii Consulares,1,494.001,M. Terenti Varronis De Linguae Latinae Quae Su...,http://latin.packhum.org/loc/494/1/0,PHI,Fair Use
455,Laus Pisonis,911,Laus Pisonis,1,911.001,"Laus Pisonis: Text, Übersetzung, Kommentar: In...",http://latin.packhum.org/loc/911/1/0,PHI,Fair Use
501,Mimi Poetarum Incertorum,584,Mimi Poetarum Incertorum,1,584.001,"I Mimi Romani, ed. M. Bonaria, 1965",http://latin.packhum.org/loc/584/1/0,PHI,Fair Use


In [78]:
# Take the URL of the first row listed under Vergil and load it in the browser driver.
# NOTE(review): `driver` is not defined in the visible cells; presumably a
# Selenium WebDriver created elsewhere — confirm.
url = author_df.loc[author_df['Author'].eq('Publius Vergilius Maro'), 'URL'].values[0]
driver.get(url)