In [2]:
import pandas as pd
import numpy as np
import doctest
import re

import sys
# Python 2-only workaround (reload() is a builtin only there): reloading sys
# is presumably needed to make sys.setdefaultencoding available again so the
# default string encoding can be switched to UTF-8 -- TODO confirm.
# The current sys.stdout is saved before the reload and put back afterwards,
# so whatever stream was in use before this cell stays in use after it.
stdout = sys.stdout
reload(sys)
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

## TODO:
* ~~Fix clean data~~
* ~~Remove duplicates~~
* Keep only denarius and aureus

# Read in Data

In [3]:
def stringToList(x):
    '''
    Parameters
    ----------
    x: str
        Comma-separated items, optionally wrapped in '[', ']', '{' or '}'

    Returns
    -------
    List of the items: the wrapping characters are stripped from both ends
    and the remainder is split on ', ' (a comma followed by one space)
    '''
    unwrapped = x.strip('[]{}')
    return unwrapped.split(", ")

def stringToListofDicts(string): 
    '''
    Parameters
    ----------
    string: str
        String representation of a list of dictionaries, with the
        dictionaries separated by '|', each key-value pair separated by ';',
        and each key separated from its value by the first ':' in the pair
        (so a value may itself contain ':')
    
    Returns
    -------
    Returns a list of dictionaries, one per '|'-separated part. Pairs that
    contain no ':' (for example the empty piece left by a trailing ';') are
    skipped, so an empty input string yields [{}]
    
    Example
    -------
    stringToListofDicts('Inscription Type:inscription;Inscription Position:reverse;Inscription Language:Latin;Inscription Transliteration:ODE;')
    [{'Inscription Language': 'Latin',
      'Inscription Position': 'reverse',
      'Inscription Transliteration': 'ODE',
      'Inscription Type': 'inscription'}]
    '''
    result = []
    
    for part in string.split('|'):
        dic = {}
        for subpart in part.split(';'):
            # Split on the first ':' only; later colons belong to the value.
            key, sep, value = subpart.partition(':')
            if not sep:
                # No key/value separator: nothing to record for this piece.
                continue
            dic[key] = value
        result.append(dic)
    
    return result

#stringToListofDicts('Inscription Type:inscription;Inscription Position:reverse;Inscription Language:Latin;Inscription Transliteration:ODE;')

In [4]:
# Load the coin records. The list-like columns are stored as plain strings in
# the CSV, so they are converted to Python lists / lists of dicts on read.
df = pd.read_csv('AugustusCoins_44BC-14AD.csv',
                 converters={"Authority": stringToList, 'Associated names': stringToList, 
                             'Subjects': stringToList, 'Inscriptions': stringToListofDicts})
# Drop the 'Unnamed: 0' column (presumably an index saved by an earlier
# export). The axis is passed by keyword because newer pandas releases no
# longer accept it positionally.
df = df.drop('Unnamed: 0', axis=1)
# Replace missing values with empty strings so the string cleaners below can
# be applied to every row.
df = df.replace(np.nan, '', regex=True)
df.Denomination.unique()

array(['', 'denarius ', 'as ', 'dupondius ', 'semis ', 'quadrans ',
       'unit (?) ', 'drachm ', 'unit ', 'sestertius ', 'dupondius (?) ',
       'aureus ', 'tetradrachm ', 'cistophorus ', 'as (cut half) ',
       'quinarius ', 'tridrachm (cistophorus) ', 'denarius serratus ',
       'dupondius or as ', 'quarter stater ', 'sestertius (probably) ',
       'prutah ', 'denarius (plated) ', 'unit,1/4 ', 'stater ',
       'tetradrachm (Attic Weight Standard) ', 'tressis '], dtype=object)

# Clean Data

In [5]:
def cleanString(string):
    '''
    Parameters
    ----------
    string: str
        String that needs to be cleaned
    
    Returns
    -------
    Removes the uncertainty markers '(?)' and '(probably)', then removes the
    descriptor (everything up to and including the first ':') and returns the
    (assumed) description with surrounding whitespace stripped. Only the
    first ':' is treated as the descriptor separator, so a description that
    itself contains ':' is kept whole. Strings containing 'http' are never
    split, so URLs survive intact.
    
    Doctests
    --------
    >>> cleanString('Named in inscription & portrayed: Julius Caesar (probably)')
    'Julius Caesar'
    >>> cleanString('Ruler: Augustus (Octavian) (?)')
    'Augustus (Octavian)'
    >>> cleanString('dupondius    (?)          ')
    'dupondius'
    >>> cleanString('        http://www.google.com')
    'http://www.google.com'
    '''
    remove_items = ['(?)', '(probably)']
    result = string
    
    for substr in remove_items:
        result = result.replace(substr, '')
        
    if ':' in result and 'http' not in result:
        # maxsplit=1 keeps any later colons as part of the description
        # instead of silently discarding the text that follows them.
        result = result.split(':', 1)[1]
    return result.strip()

#doctest.testmod()
#cleanString('Ruler: Augustus (Octavian) (?)')
#cleanString('dupondius (?)')

In [6]:
def cleanList(lst):
    '''
    Parameters
    ----------
    lst: Python list
        Strings that each need to be passed through cleanString
    
    Returns
    -------
    Tuple holding the cleaned version of every string, in the original order
    
    Doctests
    --------
    >>> cleanList(['Ruler: Augustus (Octavian) (?)'])
    ('Augustus (Octavian)',)
    >>> cleanList(['Ruler: Augustus (Octavian) (?)', 'Moneyer: P Lurius Aggrippa'])
    ('Augustus (Octavian)', 'P Lurius Aggrippa')
    >>> cleanList(['symbol', 'emperor/empress'])
    ('symbol', 'emperor/empress')
    '''
    cleaned_entries = (cleanString(entry) for entry in lst)
    return tuple(cleaned_entries)

#cleanList(['Ruler: Augustus (Octavian) (?)'])

In [7]:
def dateRange(date):
    '''
    Parameter
    ---------
    date: str
        Date range given as a string, with the years separated by '-'
    
    Returns
    -------
    Returns a tuple of the years in the range as ints; a year whose part
    contains 'BC' is negative. Everything except digits is ignored within a
    part. If a part contains no digits it is printed and parsing stops, so
    only the years parsed up to that point are returned.
    
    Doctests
    --------
    >>> dateRange('27BC-14 (?)')
    (-27, 14)
    >>> dateRange('44BC (cira) -40BC')
    (-44, -40)
    >>> dateRange('4-14')
    (4, 14)
    '''
    result = []
    
    for part in date.split('-'):
        # Strip notes such as 'BC' or '(?)' and keep only the digits.
        digits = re.sub(r'\D', '', part)
        try:
            year = int(digits)
        except ValueError:
            # No digits in this part: report it and keep what was parsed.
            print(part)
            break
        result.append(-year if 'BC' in part else year)
        
    return tuple(result)
        
#doctest.testmod()
#dateRange('27BC-14 (?)')

In [8]:
def float_conversion(x):
    '''
    Parameter
    ---------
    x: str
        Input value
    
    Return
    ------
    Returns float(x), or 0 when x cannot be converted (for example an empty
    string or None). Only conversion errors are absorbed; interrupts and
    other unrelated exceptions propagate to the caller.
    '''
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        return 0

In [9]:
def removeNotes(string):
    '''
    Parameter
    ---------
    string: str
        Input string with notes in parenthesis
        
    Return
    ------
    String with data without notes: the text before the first '(' with
    surrounding whitespace stripped. A string without any '(' is returned
    stripped; an empty string, or one that starts with '(', gives ''.
    
    Doctests
    --------
    >>> removeNotes('Calagurris (Europe,Spain,Rioja, La (La Rioja),Calahorra,Calagurris (city - archaic))')
    'Calagurris'
    '''
    # Splitting once on '(' always yields a first element, so inputs with no
    # leading data return '' instead of raising IndexError.
    data = string.split('(', 1)[0].strip()
    return data

# Run the `>>>` examples embedded in the docstrings of the cleaning helpers
# and report how many were attempted and how many failed.
doctest.testmod()
#removeNotes('Lugdunum (Europe,France,Rh\xc3\xb4ne-Alpes,Rh\xc3\xb4ne,Lyon,Lugdunum)')

TestResults(failed=0, attempted=11)

In [10]:
# Keep only rows that are coins, whose date does not contain 'stC'
# (presumably century-style dates such as '1stC' -- confirm), and that have a
# denomination, a production place and a bibliography.
# NOTE(review): missing values were replaced by '' when the CSV was loaded,
# so the `!= 0` weight test may not exclude rows without a weight -- confirm.
mask = ((df['Object type'] == 'coin ') & (df['Date'].str.find('stC') == -1) &
        (df['Denomination'] != '') & (df['Production place'] != '') &
        (df['Bibliography'] != '') & (df['Weight (g)'] != 0))
filtered = df[mask]
cleaned = pd.DataFrame()

# Columns grouped by the cleaner that applies to them.
lists = ['Authority', 'Subjects', 'Associated names']
strings = ['Museum number', 'Denomination', 'Description', 'State', 'Culture/period', 'Materials', 
            'Production place', 'Curator\'s comments', 'Bibliography', 'Object type', 'url']
floats = ['Weight (g)']
dates = ['Date']
redundant_notes = ['Production place', 'Denomination']

for lst in lists:
    cleaned[lst] = filtered[lst].apply(cleanList)
for string in strings:
    cleaned[string] = filtered[string].apply(cleanString)
for flot in floats:
    cleaned[flot] = filtered[flot].apply(float_conversion).replace(np.nan, -1)
for date in dates:
    cleaned[date] = filtered[date].apply(dateRange)
# Second pass over already-cleaned columns: drop the parenthesised notes.
for col in redundant_notes:
    cleaned[col] = cleaned[col].apply(removeNotes)

# Order the columns alphabetically. reindex(columns=...) is the supported
# equivalent of the deprecated reindex_axis(..., axis=1).
cleaned = cleaned.reindex(columns=sorted(cleaned.columns))

# Rows that agree on all of these columns are treated as the same coin type;
# only the first occurrence is kept.
duplicate_cols = ['Authority', 'Date', 'Production place', 'Description', 'Subjects', "Curator's comments"]
removed_dup = (cleaned.drop_duplicates(subset=duplicate_cols)
                        .reset_index(drop=True))
removed_dup.tail()

Unnamed: 0,Associated names,Authority,Bibliography,Culture/period,Curator's comments,Date,Denomination,Description,Materials,Museum number,Object type,Production place,State,Subjects,Weight (g),url
693,"(Augustus (Octavian), Nike/Victoria/Victory)","(Augustus (Octavian),)","RIC1 263, p.60 RR2 4342, p.12 RE1 616, p.101",Roman Imperial,Octavian's IMP CAESAR coinage was a celebratio...,"(-32, -29)",denarius,"Silver coin.(obverse) Victory, draped, standin...",silver,R.6163,coin,Italy,Roman Empire,"(charioteer/chariot, allegory/personification,...",3.78,http://www.britishmuseum.org/research/collecti...
694,"(Mark Antony, Octavia, Augustus (Octavian))","(M Oppius Capito, Mark Antony)","RPC1 1463 (type) RR2 154, p.518",Roman Republican,,"(-38, -37)",tressis,Copper alloy coin.(obverse) Busts of M. Antoni...,copper alloy,18600328.250,coin,Achaea,Roman Republic,"(politician/statesman, emperor/empress, boat/s...",21.51,http://www.britishmuseum.org/research/collecti...
695,"(Marcus Aemilius Lepidus, Augustus (Octavian))","(Marcus Aemilius Lepidus,)","Ghey, Leins & Crawford 2010 495.2.6 RRC 495/2a...",Roman Republican,Die appears to read IMA (ligatured) instead of...,"(-42,)",denarius,Silver coin.(obverse) Head of M. Lepidus right...,silver,20114027.3,coin,Italy,Roman Republic,"(politician/statesman, emperor/empress)",3.56,http://www.britishmuseum.org/research/collecti...
696,"(Augustus (Octavian), Eros/Cupid, Aphrodite/Ve...","(P Clodius,)","RR1 4277, p.583 Ghey, Leins & Crawford 2010 49...",Roman Republican,,"(-42,)",aureus,Gold coin; pierced for suspension.(obverse) He...,gold,18520903.11,coin,Rome,Roman Republic,"(cherub/cupid, politician/statesman, classical...",8.09,http://www.britishmuseum.org/research/collecti...
697,"(Augustus (Octavian), Tyche/Fortuna)","(Ti Sempronius Gracchus,)","RR1 4313, p.593 Ghey, Leins & Crawford 2010 52...",Roman Republican,,"(-40,)",aureus,"Gold coin.(obverse) Head of Octavian right, be...",gold,18440425.473,coin,Rome,Roman Republic,"(politician/statesman, allegory/personificatio...",7.96,http://www.britishmuseum.org/research/collecti...


In [11]:
# Distinct production places left after cleaning and duplicate removal.
removed_dup['Production place'].unique()

array(['Lugdunum', 'Calagurris', 'Turiaso', 'Osca', 'Tarraco', 'Bilbilis',
       'Colonia Patricia', 'Caesaraugusta', 'Ilerda', 'Masicytes',
       'Cragus', 'Ercavica', 'Segobriga', 'Osset', 'Carthago Nova', 'Acci',
       'Iulia Traducta', 'Emerita', 'Pax Iulia', 'Ebora', 'Spain', 'Rome',
       'Nemausus', 'Celsa', 'Stratoniceia', 'Ilici', 'Italica', 'Segovia',
       'Laodicea ad Mare', 'Ilercavonia', 'Chios', 'Gades', 'Hierapytna',
       'Crete', 'Eleutherna', 'Romula', 'Pergamon', 'Asia', 'Peloponnese',
       'Samos', 'Arausio', 'Pergamum', 'Ephesus', 'Syria', 'Italy',
       'Antiochia ad Orontem', 'Lyon', 'Narbo', 'Seleucia in Pieria',
       'Gaul', 'Cyrenaica', 'Artaxata', 'Vienna', 'Jerusalem', 'Carteia',
       'Side', 'Achaea', 'England'], dtype=object)

In [27]:
from bkcharts import Bar, show, defaults, cat
from bokeh.io import output_notebook, save
from bokeh.models import Range1d


In [28]:
# Configure Bokeh so that show() renders plots inline in the notebook.
output_notebook()

In [126]:
# Number of coins for every (production place, denomination) pair, with the
# largest groups first.
pair_counts = (removed_dup
               .groupby(['Production place', 'Denomination'])
               .size()
               .sort_values(ascending=False))
location = pair_counts.reset_index()
location.columns = ['Production place', 'Denomination', 'Count']

# Total per production place, repeated on each of its rows, then used to put
# the places with the most coins first.
location['Sum'] = location.groupby('Production place')['Count'].transform('sum')
location = location.sort_values('Sum', ascending=False)
print(location.head())

# Stacked bar chart: one bar per production place, one segment per denomination.
hover_fields = [('Denomination', '@Denomination'),
                ('Denomination Count', '@height'),
                ('Location Count', '@y')]
location_bar = Bar(
    location,
    label=cat(columns='Production place', sort=False),
    values='Count',
    stack='Denomination',
    responsive=True,
    legend='top_right',
    tooltips=hover_fields,
)
# Fix the y axis to 0-200 and stop panning/zooming outside that range.
location_bar.y_range = Range1d(0, 200, bounds=(0, 200))
save(location_bar, filename='location_bar.html')
show(location_bar)

   Production place     Denomination  Count  Sum
0              Rome         denarius     74  175
15             Rome         quadrans     12  175
28             Rome  dupondius or as      5  175
5              Rome               as     24  175
8              Rome           aureus     23  175
