# Building a dataset of Ancient Roman coins by ruler, using webscraping
wildwinds.com contains an html-based list of Ancient Roman coins from the era of the Roman Empire, contained on separate pages for each emperor/honoree, with inconsistently formatted descriptions and data

### Import webscraping tools: requests, BeautifulSoup, and pandas

In [105]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

### Grab list of links for the landing pages of the various emperors/rulers/honorees from https://www.wildwinds.com/coins/ric/i.html

In [106]:
with requests.get('https://www.wildwinds.com/coins/ric/i.html') as raw:
    soup = BeautifulSoup(raw.content, 'lxml')

### Parse html data for a clean list of ruler names

In [107]:
options = soup.find_all('option')
emperors_raw = [i.contents for i in options if i.attrs['value'] != ''][:-6]
emperors = []
for line in emperors_raw:
    for text in line:
        emperors.append(text.strip())
print(f'First five: {emperors[:5]} \nLast five: {emperors[-5:]} \n{len(emperors)} emperors total')

First five: ['Aelia Ariadne', 'Aelia Flacilla', 'Aelia Verina', 'Aelius', 'Aemilian'] 
Last five: ['Vitellius', 'Volusian', 'Zeno', 'Zenobia', 'Zenonis'] 
231 emperors total


### Generate list of usable link roots for each Emperor's coin page

In [108]:
linkroots = ['https://www.wildwinds.com/coins/ric/' + i.attrs['value'][:-6] for i in options if i.attrs['value'] != ''][:-6]
print(f'First five: {linkroots[:5]} \nLast five: {linkroots[-5:]} \n{len(linkroots)} linkroots total')

First five: ['https://www.wildwinds.com/coins/ric/aelia_ariadne/', 'https://www.wildwinds.com/coins/ric/aelia_flaccilla/', 'https://www.wildwinds.com/coins/ric/aelia_verina/', 'https://www.wildwinds.com/coins/ric/aelius/', 'https://www.wildwinds.com/coins/ric/aemilian/'] 
Last five: ['https://www.wildwinds.com/coins/ric/vitellius/', 'https://www.wildwinds.com/coins/ric/volusian/', 'https://www.wildwinds.com/coins/ric/zeno/', 'https://www.wildwinds.com/coins/ric/zenobia/', 'https://www.wildwinds.com/coins/ric/zenonis/'] 
231 linkroots total


### Create a semi-random list of test pages for building an adaptable parser

In [109]:
testpages = [2, 4, 8, 12, 21, 42, 81, 118, 142, 196]
test_roots = [linkroots[i] for i in testpages]
for i in test_roots:
    print(i)

https://www.wildwinds.com/coins/ric/aelia_verina/
https://www.wildwinds.com/coins/ric/aemilian/
https://www.wildwinds.com/coins/ric/agrippina_II/
https://www.wildwinds.com/coins/ric/annius_verus/
https://www.wildwinds.com/coins/ric/augustus/
https://www.wildwinds.com/coins/ric/commodus/
https://www.wildwinds.com/coins/ric/galba/
https://www.wildwinds.com/coins/ric/laelianus/
https://www.wildwinds.com/coins/ric/martinian/
https://www.wildwinds.com/coins/ric/tacitus/


## pull html from test pages

In [110]:
test_pages_raw = [requests.get(linkroots[i] + 'i.html') for i in testpages]

## explore test page html

In [111]:
test_soups = [BeautifulSoup(test_pages_raw[i].content, 'lxml') for i in range(len(test_pages_raw))]

In [112]:
print(f'Test soup preview: \n\n{str(test_soups[4])[:1500]}')

Test soup preview: 

<html>
<head>
<title>Augustus, Roman Imperial Coins of, at WildWinds.com
</title>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<base target="_parent"/>
</head>
<body bgcolor="#D7D5D5">
<object data="https://www.wildwinds.com/coins/topscript.html" height="160" width="100%"></object><br/>
<center>
<p></p><h2>Browsing Roman Imperial Coins of Augustus</h2>
<p><img src="toppic.jpg"/>
<br/><font size="2">Octavian: Senator, Consul and Triumvir from 43 BC; de-facto sole ruler from 27 BC; proclaimed Augustus, emperor 12 BC - 14 AD.</font>
</p><h3><a href="t.html">Browse the Augustus page with thumbnail images.</a></h3>
<table border="1" cellpadding="0" cellspacing="1">
<tr><td>Note </td><td colspan="3">This page has later Roman Imperial issues from after Octavian was proclaimed Augustus in 27 BC. Earlier Imperatorial issues are listed on the <a href="../../imp/octavian/i.html">Octavian page</a>, though most Provincial issues are listed here regardle

### Create function to pull soup title

In [113]:
def pull_title(soup):
    raw_title = soup.find('title').contents[0]
    sep = raw_title.find(',')
    return str(raw_title[:sep])

In [132]:
def pull_title_test(soups=test_soups): 
    titles = pd.Series([pull_title(i) for i in soups])
    print(f'Out of {len(soups)} soups, {titles.isna().sum()} have missing titles.')

pull_title_test()
print('')
for soup in test_soups:
    print(pull_title(soup), sep=' ')

Out of 10 soups, 0 have missing titles.

Aelia Verina
Aemilian
Agrippina II
Annius Verus
Augustus
Commodus
Galba
Laelianus
Martinian
Tacitus


### Create a similar function to pull subtitles

In [115]:
def pull_subtitle(soup):
    try:
        subtitle = soup.find_all('h3')[0].contents[-1]
        if 'Click' in str(subtitle) or 'Browse' in str(subtitle):
            try:
                subtitle = soup.find('font').contents[0]
            except:
                return None
    except IndexError:
        try:
            subtitle = soup.find_all('p')[1].contents[-1]
            if 'Click' in str(subtitle) or 'Browse' in str(subtitle):
                raise IndexError
            elif len(subtitle) < 4:
                try:
                    subtitle = soup.find_all('br')[0].contents[0]
                except:
                    subtitle = None
        except IndexError:
            subtitle = None
    if '(' in str(subtitle) or '<' in str(subtitle):
        return None
    return str(subtitle).strip()

In [116]:
def pull_subtitle_test(soups=test_soups):
    subtitles = pd.Series([pull_subtitle(i) for i in soups])
    print(f'Out of {len(soups)} soups, {subtitles.isna().sum()} have missing subtitles.')

pull_subtitle_test()
print('')
for soup in test_soups:
    print(pull_subtitle(soup))
    

Out of 10 soups, 2 have missing subtitles.

Wife of Leo I. Died 484 AD.
Aemilian, 253 AD.
None
Son of Marcus Aurelius. Died 161 AD.
Octavian: Senator, Consul and Triumvir from 43 BC; de-facto sole ruler from 27 BC; proclaimed Augustus, emperor 12 BC - 14 AD.
Commodus, Caesar 172-179, Augustus 179-192 AD.
Galba, AD 68-69.
Ulpius Cornelius Laelianus, usurper against Postumus, 269 AD.
None
Tacitus, 275-276 AD.


### Create pull_coins to create a list of html chunks for each coin

In [117]:
def pull_coins(soup):
    coins = [coin.contents for coin in soup.find_all('tr') if len(coin) >2 and 'bgcolor' in str(coin)]
    return coins

def pull_coins_test(soup_no=7):
    test_coins = pull_coins(test_soups[soup_no])
    print(f'{len(test_coins)} coins in soup \nFirst three: \n')
    for i in test_coins[:3]:
        print(i, '\n')
    print(f'Last three: \n')
    for i in test_coins[-3:]:
        print(i, '\n')
    
pull_coins_test(4)
# enter function argument 1-10 to test against corresponding test soup

808 coins in soup 
First three: 

[<td bgcolor="#FFD700">BMCRE 317</td>, <td>Augustus AR aureus, Emerita mint. 19-18 BC. 7.86 g. CAESAR AVGVSTVS between two laurel trees / OB CIVIS SERVATOS in three lines within an oak wreath. BMCRE 317; Cohen 206; Calico 249; RIC -; Sear -.</td>, <td><a href="BMCRE_317.txt">Text</a></td>, <td><a href="BMCRE_317.jpg">Image</a></td>] 

[<td bgcolor="#C0C0C0">BMCRE 317<br/>denarius</td>, <td>Augustus AR denarius from aureus dies, Emerita mint. 19-18 BC. CAESAR AVGVSTVS between two laurel trees / OB CIVIS SERVATOS in three lines within an oak wreath. BMCRE 317 var (aureus only); RIC - (ditto); Sear - (ditto).</td>, <td><a href="BMCRE_317_denarius.txt">Text</a></td>, <td><a href="BMCRE_317_denarius.jpg">Image</a></td>] 

[<td bgcolor="#B87333">Cohen 92</td>, <td>Augustus, AE contortiate. Struck AD 356-394. 38.7 mm, 26.81 gr. DIVVS AVGVSTVS PATER, laureate head right, countermark: palm branch in right field / Hunting scene, stag running right, chased by hou

### Create coin_metal() to identify the coinage metal

In [150]:
def coin_metal(coin):
    metals = {'#B8':'Copper','#b8':'Copper', '#FF':'Gold', '#C0':'Silver', '#B7':'Brass', '#b7':'Brass', 'red':'FAKE'}
    try:
        bg_color_index = int(str(coin).find('bgcolor=')) + 9
        bg_color = str(coin)[bg_color_index:bg_color_index + 3]
        metal = metals[bg_color]
    except:
        return None
    return metal

def coin_metal_test(soup_no=7):
    test_coins = pull_coins(test_soups[soup_no])
    metals = [coin_metal(i) for i in test_coins]
    print(f'First five: {metals[:5]}')
    print(f'Last five: {metals[-5:]}')
    df = pd.Series(metals)
    print(f'Out of {len(test_coins)} coins, {df.isna().sum()} have missing metal values.')

coin_metal_test(4)
# enter function argument 1-10 to test against corresponding test soup

First five: ['Gold', 'Silver', 'Copper', 'Silver', 'Silver']
Last five: ['Copper', 'Copper', 'Copper', 'Copper', 'Copper']
Out of 808 coins, 1 have missing metal values.


### Create functions to pull coin era (i.e. 'AD' or 'BC') and <i>a</i> year (not <i>every</i> year) in the coin description 
(if there is a range of years i.e. 117-124 AD, function pulls the year closest to era i.e. '117-124 AD' returns '124', while 'AD 117-124' returns '117')

In [146]:
def coin_era(coin):
    coin = str(coin)
    if 'AD' in coin:
        return 'AD'
    elif 'BC' in coin:
        return 'BC'
    else:
        return None

def coin_year(coin):
    era = coin_era(coin)
    coin = str(coin)
    year = 0
    try:
        index = coin.find(coin_era(coin))
    except TypeError:
        return None
    if coin[index - 2].isnumeric():
        if coin[index - 3].isnumeric():
            if coin[index - 4].isnumeric():
                year += int(coin[index-4:index-1])
            else:
                year += int(coin[index-3:index-1])
        else:
            year += int(coin[index-2])
    elif coin[index + 3].isnumeric():
        if coin[index + 4].isnumeric():
            if coin[index + 5].isnumeric():
                year += int(coin[index+3:index+6])
            else:
                year += int(coin[index+3:index+4])
        else:
            year += int(coin[index+3])
    if era == 'AD':
        return year
    elif era == 'BC':
        return -year
    elif era == None:
        return None

In [159]:
def coin_year_test(soup_no=7):
    test_coins = pull_coins(test_soups[soup_no])
    years = [coin_year(i) for i in test_coins]
    print(f'First five: {years[:5]}')
    print(f'Last five: {years[-5:]}')
    df = pd.Series(years)
    print(f'Out of {len(test_coins)} coins, {df.isna().sum()} have missing year values.')
    
coin_year_test(4)
# enter function argument 1-10 to test against corresponding test soup
    

First five: [-18, -18, 356, -23, -23]
Last five: [-19, -19, -5, 1, 1]
Out of 808 coins, 284 have missing year values.


### Create functions to pull .jpg and .txt links from coins

In [120]:
def coin_jpg(coin):
    for item in coin:
        item = str(item)
        if '.jpg' in item:
            start, end = item.find('href=')+6, item.find('.jpg')+4
            return item[start:end]

In [121]:
def coin_txt(coin):
    for item in coin:
        item = str(item)
        if '.txt' in item:
            start, end = item.find('href=')+6, item.find('.txt')+4
            return item[start:end]

### use code from above functions to create a crude function to pull coin_id from links

In [122]:
def coin_id(coin):
    coin = str(coin)
    if '.jpg' in coin:
        start, end = coin.rfind('href=')+6, coin.rfind('.jpg')
        return coin[start:end]
    elif '.txt' in coin:
        start, end = coin.rfind('href=')+6, coin.rfind('.txt')
        return coin[start:end]
    else:
        return None

### Create function for coin mass in grams

In [123]:
def coin_mass(coin):
    coin = str(coin)
    # might be able to clean this up by creating a list of possible gram abbreviations and looping through it
    if 'g.' in coin:
        start = coin.find('g.')
        if coin[start-1].isnumeric() or coin[start-2].isnumeric():
            grams = 'g.'
    elif 'g,' in coin:
        start = coin.find('g,')
        if coin[start-1].isnumeric() or coin[start-2].isnumeric():
            grams = 'g,'
    elif 'gm.' in coin:
        grams = 'gm.'
    elif 'gr.' in coin:
        grams = 'gr.'
    elif 'grs,' in coin:
        grams = 'grs,'
    elif 'grs.' in coin:
        grams = 'grs.'
    else:
        return None
    try:
        start = coin.find(grams)
        if coin[start-1].isnumeric():
            if coin[start-2] == '.':
                if coin[start-4].isnumeric():
                    return float(coin[start-4:start])
                else:
                    return float(coin[start-3:start])
        elif coin[start-3] == '.':
            if coin[start-5].isnumeric():
                return float(coin[start-5:start-1])
            else:
                return float(coin[start-4:start-1])
        else:
            if coin[start-6].isnumeric():
                return float(coin[start-6:start-1])
            else:
                return float(coin[start-5:start-1])
    except:
        return None

### Create function for coin size in mm

In [124]:
def coin_diameter(coin):
    coin = str(coin)
    if 'mm' in coin:
        start = coin.find('mm')
        # if there's no space between value and 'mm'
        if coin[start-1].isnumeric():
            if coin[start-2].isnumeric():
                # if value contains a decimal
                if coin[start-3] == '.': 
                    if coin[start-5:start-3].isnumeric():
                        return float(coin[start-5:start])
                    else:
                        return None
                else:
                    return float(coin[start-2:start])
            # if value contains a decimal
            elif coin[start-2] == '.':
                if coin[start-4:start-2].isnumeric():
                    return float(coin[start-4:start])
                else:
                    return None
        # if there is a space between value and 'mm'
        elif coin[start-1] == ' ' and coin[start-2].isnumeric():
            if coin[start-3].isnumeric():
                # if value contains a decimal
                if coin[start-4] == '.':
                    if coin[start-6:start-4].isnumeric():
                        return float(coin[start-6:start-1])
                    else:
                        return None
                else:
                    return float(coin[start-3:start-1])
            elif coin[start-3] == '.':
                if coin[start-5:start-3].isnumeric():
                    return float(coin[start-5:start-1])
                else:
                    return None


### Check for common inscriptions
...such as "AVG" (Augustus, title of the emperor), "IMP" (Imperator (victorious general), received upon accession), "CAES" (Caesar, inherited name of the Julian family (Julius Caesar), used by later emperors to designate heir), "GERM" (Germanicus, a title honoring military victories in Germany), "COS" (Consul, a title linked to highest office in Senate, usually held by emperor), "PO" (Pontifex Maximus, highest priest, the head of state religion), "TPP" (Tribunica Potestate, tribune of the people, each renewal indicated by numerals), "CENS" (Censor, a public office overseeing taxes, morality, the census and membership in various orders).

In [125]:
def coin_inscriptions(coin):
    coin = str(coin)
    inscriptions_list = ['AVG', 'IMP', 'CAES', 'GERM', 'COS', 'PO', 'CENS', 'TPP', 'TR', 'RESTITVT']
    coin_inscriptions = []
    for i in inscriptions_list:
        if i in coin:
            coin_inscriptions.append(i)
    if len(coin_inscriptions) > 0:
        return coin_inscriptions
    else:
        return None

## Combine individual coin parsing functions into one that returns a Dataframe
i.e. coin_df(soup)

In [126]:
def coin_df(soup):
    title = pull_title(soup)
    id, metal, mass, diameter, era, year, inscriptions, jpg, txt = [], [], [], [], [], [], [], [], []
    for coin in pull_coins(soup):
        id.append(coin_id(coin))
        metal.append(coin_metal(coin))
        mass.append(coin_mass(coin))
        diameter.append(coin_diameter(coin))
        era.append(coin_era(coin))
        year.append(coin_year(coin))
        inscriptions.append(coin_inscriptions(coin))
        jpg.append(coin_jpg(coin))
        txt.append(coin_txt(coin))
    return pd.DataFrame({'title':title, 'id':id, 'metal':metal, 'mass':mass, 'diameter':diameter, \
                         'era':era, 'year':year, 'inscriptions':inscriptions, 'jpg':jpg, 'txt':txt})

In [127]:
coin_df(test_soups[2]).head()

Unnamed: 0,title,id,metal,mass,diameter,era,year,inscriptions,jpg,txt
0,Agrippina II,Cohen_01,Brass,,,AD,5.0,"[AVG, CAES, GERM]",Cohen_01.jpg,Cohen_01.txt
1,Agrippina II,Cohen_01_countermarked,Brass,14.78,28.0,,,"[AVG, CAES, GERM]",Cohen_01_countermarked.jpg,Cohen_01_countermarked.txt
2,Agrippina II,RIC_0075,Silver,,,AD,54.0,"[AVG, CAES, GERM]",RIC_0075.jpg,RIC_0075.txt
3,Agrippina II,,Brass,,,,,"[AVG, CAES, GERM]",,
4,Agrippina II,RIC_0103note,Brass,,,AD,54.0,"[AVG, CAES, GERM]",RIC_0103note.jpg,RIC_0103note.txt


## Create a function to combine multiple Dataframes

In [128]:
def combine_coin_dfs(soups):
    dfs = [coin_df(soup) for soup in soups]
    return pd.concat(dfs) 

In [129]:
len(combine_coin_dfs(test_soups))

2577

# Switch to working with full data

## Pull html from all source pages
(pulling from over 200 pages, which can take a few minutes)

In [130]:
pages_raw = [requests.get(linkroots[i] + 'i.html') for i in range(len(linkroots))]

In [131]:
all_soups = [BeautifulSoup(pages_raw[i].content, 'lxml') for i in range(len(pages_raw))]

#### Test titles and subtitles

In [133]:
pull_title_test(all_soups)
pull_subtitle_test(all_soups)

Out of 231 soups, 0 have missing titles.
Out of 231 soups, 79 have missing subtitles.


### Test individual soups

In [None]:
def soup_test(soup_no=24):
    soup = all_soups[soup_no]
    
    

## Combine it all into a single Dataframe

In [135]:
roman_coins = combine_coin_dfs(all_soups)

In [136]:
roman_coins.reset_index()
roman_coins.index += 1

In [137]:
roman_coins.head(10)

Unnamed: 0,title,id,metal,mass,diameter,era,year,inscriptions,jpg,txt
1,Aelia Ariadne,RIC_0933a,Gold,1.46,14.0,AD,491.0,[AVG],RIC_0933a.jpg,RIC_0933a.txt
2,Aelia Ariadne,RIC_933v,Gold,1.47,15.0,AD,491.0,[AVG],RIC_933v.jpg,RIC_933v.txt
3,Aelia Ariadne,RIC_0933var2,Gold,4.47,,AD,474.0,[AVG],RIC_0933var2.jpg,RIC_0933var2.txt
4,Aelia Ariadne,RIC_0938,Gold,1.47,,AD,474.0,[AVG],RIC_0938.jpg,RIC_0938.txt
1,Aelia Flaccilla,_alexandria_RIC_017,Copper,,,,,[AVG],_alexandria_RIC_017.jpg,_alexandria_RIC_017.txt
2,Aelia Flaccilla,_antioch_RIC_054,Copper,,,,,,_antioch_RIC_054.jpg,_antioch_RIC_054.txt
3,Aelia Flaccilla,_antioch_RIC_061,Copper,,,,,[AVG],_antioch_RIC_061.jpg,_antioch_RIC_061.txt
4,Aelia Flaccilla,_antioch_RIC_061_G,Copper,,,,,[AVG],_antioch_RIC_061_G.jpg,_antioch_RIC_061_G.txt
5,Aelia Flaccilla,"_antioch_RIC_062,D",Copper,,,,,[AVG],"_antioch_RIC_062,D.jpg","_antioch_RIC_062,D.txt"
6,Aelia Flaccilla,_antioch_RIC_062,Copper,,22.0,,,[AVG],_antioch_RIC_062.jpg,_antioch_RIC_062.txt


### Check data quality

In [138]:
roman_coins.describe()

Unnamed: 0,title,id,metal,mass,diameter,era,year,inscriptions,jpg,txt
count,40728,38343,39908,9039.0,5358.0,24311,23374,23080,38341,38342
unique,225,33274,5,2471.0,479.0,2,468,133,33174,33170
top,Caracalla,RIC_0013,Copper,2.9,18.0,AD,193,[AVG],RIC_0013.jpg,RIC_0013.txt
freq,2411,23,28786,39.0,357.0,23774,994,6015,24,24


### run code chunk below to save new Dataframe as a csv file

In [30]:
roman_coins.to_csv('roman_coins_raw.csv')

## Create Dataframe for just titles and subtitles 
(since subtitles are often too long to fit comfortably in a Dataframe - useful for later joins if subtitles are desired)

In [13]:
titles_and_subtitles = 
    

### create a new dataframe which includes linkroots before jpg, using join