In [1]:
import pandas as pd
import numpy as np
import doctest
import re

import sys
# Python 2-only workaround that makes UTF-8 the interpreter's default string encoding.
# sys.stdout is saved before reload(sys) and put back afterwards
# (presumably because reloading sys would otherwise replace the notebook's output stream -- confirm).
stdout = sys.stdout
reload(sys)  # NOTE(review): `reload` is a builtin only on Python 2; this cell will not run on Python 3
sys.setdefaultencoding('utf-8')
sys.stdout = stdout

## TODO:
* Manually select colors for stacked bar graph to avoid repeated colors next to each other
* Display number of coins produced on map
    * ~~Overall produced from 44-14 with Augustus~~
    * Timelapse with interactive controls for passage of time
* Predictive value of subjects, material, inscription

# Read in Data

In [2]:
def stringToList(x):
    '''
    Parameters
    ----------
    x: str
        String holding items separated by ', ', optionally wrapped in
        square brackets or curly braces

    Returns
    -------
    Returns a list of the items once any leading/trailing '[', ']', '{'
    and '}' characters are stripped. An empty string gives ['']
    '''
    unwrapped = x.strip('[]{}')
    return unwrapped.split(", ")

def stringToListofDicts(string): 
    '''
    Parameters
    ----------
    string: str
        Takes in a python string representation of a list of dictionaries with the 
        dictionaries separated by '|', each key-value pair separated by ';', and
        the keys and values separated by ':'
    
    Returns
    -------
    Returns a list of dictionaries, one per '|'-separated part. Pieces that
    contain no ':' (such as the empty piece left by a trailing ';') are
    skipped, so an empty input string gives [{}]. Only the first ':' of a
    pair is treated as the separator; any further ':' stays in the value.
    
    Example
    -------
    stringToListofDicts('Inscription Type:inscription;Inscription Position:reverse;Inscription Language:Latin;Inscription Transliteration:ODE;')
    [{'Inscription Language': 'Latin',
      'Inscription Position': 'reverse',
      'Inscription Transliteration': 'ODE',
      'Inscription Type': 'inscription'}]
    '''
    result = []
    
    for part in string.split('|'):
        dic = {}
        for subpart in part.split(';'):
            # Split on the first ':' only, so a value that itself contains
            # a colon is kept whole instead of being cut at the second one
            key, separator, value = subpart.partition(':')
            if not separator:
                # Not a key:value pair (e.g. the empty piece after a trailing ';')
                continue
            dic[key] = value
        result.append(dic)
    
    return result

#stringToListofDicts('Inscription Type:inscription;Inscription Position:reverse;Inscription Language:Latin;Inscription Transliteration:ODE;')

In [3]:
# Load the raw export; the converters parse the list-like and dict-like text columns while reading
df = pd.read_csv('AugustusCoins_44BC-14AD.csv',
                 converters={"Authority": stringToList, 'Associated names': stringToList, 
                             'Subjects': stringToList, 'Inscriptions': stringToListofDicts})
# Drop the unnamed first column (presumably an index written out by an earlier to_csv -- confirm).
# `axis` is passed by keyword because recent pandas no longer accepts it positionally.
df = df.drop('Unnamed: 0', axis=1)
# Missing values become empty strings so the string-based cleaning below can run on every cell
df = df.replace(np.nan, '', regex=True)
df.Denomination.unique()

array(['', 'denarius ', 'as ', 'dupondius ', 'semis ', 'quadrans ',
       'unit (?) ', 'drachm ', 'unit ', 'sestertius ', 'dupondius (?) ',
       'aureus ', 'tetradrachm ', 'cistophorus ', 'as (cut half) ',
       'quinarius ', 'tridrachm (cistophorus) ', 'denarius serratus ',
       'dupondius or as ', 'quarter stater ', 'sestertius (probably) ',
       'prutah ', 'denarius (plated) ', 'unit,1/4 ', 'stater ',
       'tetradrachm (Attic Weight Standard) ', 'tressis '], dtype=object)

# Clean Data

In [4]:
def cleanString(string):
    '''
    Parameters
    ----------
    string: str
        String that needs to be cleaned
    
    Returns
    -------
    Removes the uncertainty markers '(?)' and '(probably)', then removes the
    descriptor (everything up to and including the first ':') and returns the
    (assumed) description with surrounding whitespace stripped. Text after a
    second ':' is kept as part of the description.
    
    Doctests
    --------
    >>> cleanString('Named in inscription & portrayed: Julius Caesar (probably)')
    'Julius Caesar'
    >>> cleanString('Ruler: Augustus (Octavian) (?)')
    'Augustus (Octavian)'
    >>> cleanString('dupondius    (?)          ')
    'dupondius'
    '''
    remove_items = ['(?)', '(probably)']
    result = string
    
    for substr in remove_items:
        result = result.replace(substr, '')
    
    if ':' in result:
        # maxsplit=1: only the first colon separates descriptor from description
        result = result.split(':', 1)[1]
    return result.strip()

#doctest.testmod()
#cleanString('Ruler: Augustus (Octavian) (?)')
#cleanString('dupondius (?)')

In [5]:
def cleanList(lst):
    '''
    Parameters
    ----------
    lst: Python list
        Strings that each need to go through cleanString
    
    Returns
    -------
    Tuple holding the cleaned version of every string, in the original order
    
    Doctests
    --------
    >>> cleanList(['Ruler: Augustus (Octavian) (?)'])
    ('Augustus (Octavian)',)
    >>> cleanList(['Ruler: Augustus (Octavian) (?)', 'Moneyer: P Lurius Aggrippa'])
    ('Augustus (Octavian)', 'P Lurius Aggrippa')
    >>> cleanList(['symbol', 'emperor/empress'])
    ('symbol', 'emperor/empress')
    '''
    return tuple(map(cleanString, lst))

#cleanList(['Ruler: Augustus (Octavian) (?)'])

In [6]:
def dateRange(date):
    '''
    Parameter
    ---------
    date: str
        Date range given as a string, with the years separated by '-'
    
    Returns
    -------
    Returns a tuple of the years in the range as integers; years marked
    'BC' are negative. Every non-digit character in a year is ignored.
    If a year contains no digits it is printed and parsing stops, so the
    tuple then holds only the years parsed before it.
    
    Doctests
    --------
    >>> dateRange('27BC-14 (?)')
    (-27, 14)
    >>> dateRange('44BC (cira) -40BC')
    (-44, -40)
    >>> dateRange('4-14')
    (4, 14)
    '''
    result = []
    
    for year in date.split('-'):
        bc = 'BC' in year
        # Raw string: '\D' is a regex class, not a Python string escape
        digits = re.sub(r'\D', '', year)
        try:
            value = int(digits)
        except ValueError:
            # No digits left in this piece: report it and give up on the rest
            print(year)
            break
        result.append(-value if bc else value)
        
    return tuple(result)
        
#doctest.testmod()
#dateRange('27BC-14 (?)')

In [7]:
def float_conversion(x):
    '''
    Parameter
    ---------
    x: str
        Input value
    
    Return
    ------
    Returns the value converted to a float, or 0 when it cannot be
    converted (for example an empty string, other non-numeric text, or None)
    '''
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to 0; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate
        return 0

In [8]:
def cleanProductionPlace(string):
    '''
    Parameter
    ---------
    string: str
        Production place text in one of these layouts:
            * Minted in: (place here)
            * Minted in: (place here) Minted in: (place here)
    
    Return
    ------
    Returns the text that follows the final 'Minted in: ' marker with the
    surrounding whitespace stripped (the whole string when the marker is
    absent). A result of exactly 'Lyon' is returned as 'Lugdunum'.
    
    Doctests
    --------    
    >>> cleanProductionPlace('Minted in: Gaul (Cisalpine) (Europe,Gaul) Minted in: Italy (Europe,Italy) ')
    'Italy (Europe,Italy)'
    >>> cleanProductionPlace('Minted in: Gaul (Cisalpine) ')
    'Gaul (Cisalpine)'
    '''
    # rpartition leaves the text after the last marker in the third slot
    _, _, last_place = string.rpartition('Minted in: ')
    last_place = last_place.strip()
    return 'Lugdunum' if last_place == 'Lyon' else last_place

#cleanProductionPlace('Minted in: Gaul (Cisalpine) (Europe,Gaul) Minted in: Italy (Europe,Italy) ')

In [9]:
def removeNotes(string):
    '''
    Parameter
    ---------
    string: str
        Input string with notes in parenthesis
        
    Return
    ------
    String with the data that comes before the first '(' and with the
    surrounding whitespace stripped. Returns '' when the string is empty
    or starts with '('.
    
    Doctests
    --------
    >>> removeNotes('Calagurris (Europe,Spain,Rioja, La (La Rioja),Calahorra,Calagurris (city - archaic))')
    'Calagurris'
    >>> removeNotes('aureus (cut half)')
    'aureus'
    '''
    # partition never fails: with no '(' the whole string is the first slot
    data = string.partition('(')[0].strip()
    return data

#removeNotes('aureus (cut half)')
# Run the doctests found in this module's docstrings; the returned summary is shown as the cell output
doctest.testmod()

TestResults(failed=0, attempted=13)

In [10]:
# Keep only rows that are coins, whose Date text does not contain 'stC'
# (presumably century-style dates such as '1stC' -- confirm), and that have a
# denomination, production place, bibliography and a weight other than 0
mask = ((df['Object type'] == 'coin ') & (df['Date'].str.find('stC') == -1) &
        (df['Denomination'] != '') & (df['Production place'] != '') &
        (df['Bibliography'] != '') & (df['Weight (g)'] != 0))
filtered = df[mask]
cleaning = pd.DataFrame()

# Columns grouped by the cleaning routine that is applied to them
lists = ['Authority', 'Subjects', 'Associated names']
strings = ['Museum number', 'Denomination', 'Description', 'State', 'Culture/period', 'Materials', 
            'Curator\'s comments', 'Bibliography', 'Object type']
floats = ['Weight (g)']
dates = ['Date']
redundant_notes = ['Production place', 'Denomination']
do_nothing = ['url']

cleaning['Production place'] = filtered['Production place'].apply(cleanProductionPlace)
for lst in lists:
    cleaning[lst] = filtered[lst].apply(cleanList)
for string in strings:
    cleaning[string] = filtered[string].apply(cleanString)
for flot in floats:
    cleaning[flot] = filtered[flot].apply(float_conversion).replace(np.nan, -1)
for date in dates:
    cleaning[date] = filtered[date].apply(dateRange)
# Runs on the already-cleaned columns, so it must come after the loops above
for col in redundant_notes:
    cleaning[col] = cleaning[col].apply(removeNotes)
for col in do_nothing:
    cleaning[col] = filtered[col]

# Order the columns alphabetically; reindex(columns=...) replaces the
# removed DataFrame.reindex_axis(..., axis=1)
cleaning = cleaning.reindex(columns=sorted(cleaning.columns))

duplicate_cols = ['Authority', 'Date', 'Production place', 'Description', 'Subjects', "Curator's comments"]
removed_dup = (cleaning.drop_duplicates(subset=duplicate_cols)
                        .reset_index(drop=True))
cleaned = removed_dup[(removed_dup['Production place'] != 'Gaul')] #too vague
# .copy() so the single-cell correction below writes to an independent frame
# rather than to a filtered slice of removed_dup
cleaned = cleaned[(cleaned['Denomination'] != 'unit')].copy()
# NOTE(review): row label 387 is hard-coded and depends on the row order of the
# CSV after de-duplication -- confirm it still points at the intended coin.
# .at replaces the removed DataFrame.set_value; the guard avoids appending a
# new, mostly empty row if that label is no longer present.
if 387 in cleaned.index:
    cleaned.at[387, 'Production place'] = 'Lugdunum'
# set_value used to return the frame, which the notebook displayed; show a preview explicitly
cleaned.head(10)
#cleaned.tail()

Unnamed: 0,Associated names,Authority,Bibliography,Culture/period,Curator's comments,Date,Denomination,Description,Materials,Museum number,Object type,Production place,State,Subjects,Weight (g),url
0,"(Tiberius, Augustus (Octavian))","(Augustus (Octavian),)","RIC1 226, p.56 RE1 507, p.87",Roman Imperial,This denarius marks the definitive adoption an...,"(13, 14)",denarius,"Silver coin.(obverse) Head of Augustus, laurea...",silver,18600330.25,coin,Lugdunum,Roman Empire,"(emperor/empress,)",3.87,http://www.britishmuseum.org/research/collecti...
1,"(Augustus (Octavian),)","(Augustus (Octavian), L Baebius Priscus, C Gra...",RPC1 441/42,Roman Provincial,,"(-27, 14)",as,"Copper alloy coin.(obverse) Head of Augustus, ...",copper alloy,G.3056,coin,Calagurris,Roman Empire,"(mammal, emperor/empress)",12.97,http://www.britishmuseum.org/research/collecti...
2,"(Augustus (Octavian),)","(Augustus (Octavian), C Valerius, L Granius)",RPC1 433/36,Roman Provincial,,"(-27, 14)",as,Copper alloy coin.(obverse) Bare head of Augus...,copper alloy,G.3055,coin,Calagurris,Roman Empire,"(mammal, emperor/empress)",9.42,http://www.britishmuseum.org/research/collecti...
3,"(Augustus (Octavian),)","(Augustus (Octavian),)",RPC1 405/13,Roman Provincial,,"(-2, 14)",as,"Copper alloy coin.(obverse) Head of Augustus, ...",copper alloy,G.3052,coin,Turiaso,Roman Empire,"(emperor/empress,)",14.51,http://www.britishmuseum.org/research/collecti...
4,"(Augustus (Octavian),)","(Augustus (Octavian),)",RPC1 403/15,Roman Provincial,,"(-2, 14)",as,"Copper alloy coin.(obverse) Head of Augustus, ...",copper alloy,G.3051,coin,Turiaso,Roman Empire,"(emperor/empress,)",13.31,http://www.britishmuseum.org/research/collecti...
5,"(Augustus (Octavian),)","(Augustus (Octavian), Marullus, Compostus)",RPC1 285/11,Roman Provincial,,"(-2, 14)",as,"Copper alloy coin.(obverse) Head of Augustus, ...",copper alloy,G.3036,coin,Osca,Roman Empire,"(equestrian, emperor/empress)",12.40,http://www.britishmuseum.org/research/collecti...
6,"(Augustus (Octavian),)","(Augustus (Octavian),)",RPC1 289/8,Roman Provincial,,"(-2, 14)",as,"Copper alloy coin.(obverse) Head of Augustus, ...",copper alloy,G.3035,coin,Osca,Roman Empire,"(equestrian, emperor/empress)",13.15,http://www.britishmuseum.org/research/collecti...
7,"(Tiberius, Augustus (Octavian))","(Augustus (Octavian),)",RPC1 215/16,Roman Provincial,,"(4, 14)",as,"Copper alloy coin.(obverse) Head of Augustus, ...",copper alloy,G.3016,coin,Tarraco,Roman Empire,"(emperor/empress,)",8.96,http://www.britishmuseum.org/research/collecti...
8,"(Augustus (Octavian),)","(Augustus (Octavian),)",RPC1 391/17,Roman Provincial,,"(-27, 14)",as,"Copper alloy coin.(obverse) Head of Augustus, ...",copper alloy,19090504.97,coin,Bilbilis,Roman Empire,"(equestrian, emperor/empress)",10.92,http://www.britishmuseum.org/research/collecti...
9,"(Augustus (Octavian),)","(Augustus (Octavian),)",RPC1 390/15,Roman Provincial,,"(-27, 14)",as,Copper alloy coin.(obverse) Bare head of Augus...,copper alloy,19511006.19,coin,Bilbilis,Roman Empire,"(equestrian, emperor/empress)",11.67,http://www.britishmuseum.org/research/collecti...


# Plot Data

In [11]:
from bkcharts import Bar, show, defaults, cat
from bokeh.io import output_notebook, save
from bokeh.models import Range1d, ColumnDataSource, HoverTool
from collections import OrderedDict

In [12]:
# Direct Bokeh output to the notebook so show() renders plots inline
output_notebook()

In [18]:
# Number of coins for every (production place, denomination) pair
location_bar = (cleaned.groupby(['Production place', 'Denomination'])
                       .size()
                       .reset_index(name='Count'))
# Total per production place, repeated on each of that place's rows
location_bar['Sum'] = location_bar.groupby('Production place')['Count'].transform('sum')
# Largest places first, then largest denominations within a place; row labels are kept
location_bar = location_bar.sort_values(['Sum', 'Count', 'Denomination'], ascending=False)
#print(location_bar.head())

# sort=False keeps the bar order produced by the sort above
place_labels = cat(columns='Production place', sort=False)
bar_tooltips = [('Denomination', '@Denomination'), 
                ('Denomination Count', '@height'),
                ('Location Count', '@Sum')]
location_bar_plot = Bar(location_bar, label=place_labels, 
                   values='Count', stack='Denomination', responsive=True, legend='top_right',
                   title='Number of coins produced from each location', #active_scroll='wheel_zoom',
                   tooltips=bar_tooltips)
# Fixed y axis from 0 to 200, with panning/zooming bounded to the same range
location_bar_plot.y_range=Range1d(0, 200, bounds=(0, 200))

#save(location_bar_plot, filename='location_bar.html')
show(location_bar_plot)

In [14]:
from bokeh.io import save, show
from bokeh.models import GeoJSONDataSource, Circle, Legend, HoverTool
from bokeh.plotting import figure
from bokeh.tile_providers import STAMEN_TERRAIN
from bokeh.models.glyphs import Patches
import json
import pygeoj
from pyproj import Proj, transform
from io import StringIO

In [15]:
# Direct Bokeh output to the notebook so show() renders plots inline
# NOTE(review): output_notebook() was already called in an earlier cell; this repeat looks redundant -- confirm
output_notebook()

In [16]:
locations = cleaned['Production place'].unique()
counts = location_bar['Sum'].unique()
txt_locations = ['Asia', 'Spain', 'England', 'Crete', 'Jerusalem', 'Masicytes']
json_locations = ['Italy', 'Syria']
colors = ["#b88fff","#4f8c00","#80198e","#65ed9b","#ea418f","#01842b","#ff6f6d","#019b63",
            "#79002e","#bde16d","#004389","#c1a911","#019bef","#dea822","#134d00","#ff847f",
            "#672e00","#ffb35a","#9a5c00","#ff9c7c"]

# Longitude/latitude (EPSG:4326) -> Web Mercator (EPSG:3857)
# NOTE(review): Proj(init=...) and transform() are the older pyproj API and are
# deprecated in newer pyproj releases -- confirm against the installed version
from_proj = Proj(init="epsg:4326")
to_proj = Proj(init="epsg:3857")

map_columns = ['Production_place', 'Count', 'Color', 'is_point', 
               'point_xs', 'point_ys', 'patch_xs', 'patch_ys', 'Size']
# One dict per location; the DataFrame is built once after the loop instead of
# being filled cell by cell with chained indexing (iloc[row][col] = value),
# which assigns into a temporary and is not guaranteed to reach the frame
records = []

for loc in locations:  
    point_xs = 0
    point_ys = 0
    patch_xs = []
    patch_ys = []
    is_point = False
    
    # Make file name of location GeoJSON file
    if loc in txt_locations:
        fname = str(loc) + '.txt'
    elif loc in json_locations:
        fname = str(loc) + '.json'
    else:
        fname = str(loc) + '.html'
    
    # Get coordinates of the location
    data = pygeoj.load(filepath='GeoJSON/'+fname)
    coors = data[0].geometry.coordinates
    # NOTE(review): the geometry kind is inferred from len(coors) only
    # (2 -> single point, 1 -> one outline ring, anything else -> several
    # shapes whose first ring is used) -- confirm this holds for every file
    if len(coors) == 2:
        is_point = True
        point_xs, point_ys = transform(from_proj, to_proj, coors[0], coors[1])
    elif len(coors) == 1:
        for lst in coors[0]:
            x, y = transform(from_proj, to_proj, lst[0], lst[1])
            patch_xs.append(x)
            patch_ys.append(y)
    else:
        for lst in coors:
            for sublst in lst[0]:
                x, y = transform(from_proj, to_proj, sublst[0], sublst[1])
                patch_xs.append(x)
                patch_ys.append(y)
    
    # Get count of coins produced from location ('Sum' is the same on every row of a location)
    count = location_bar[location_bar['Production place'] == loc]['Sum'].mean()
    
    # Size of point
    size = 10 * np.log(2 * count)
    
    records.append({
        'Production_place': loc,
        'Count': count,
        # Colour is chosen by the position of this count among the distinct
        # counts, so locations with equal counts share a colour
        'Color': colors[np.where(counts == count)[0][0]],
        'is_point': is_point,
        'point_xs': point_xs,
        'point_ys': point_ys,
        'patch_xs': patch_xs,
        'patch_ys': patch_ys,
        'Size': size,
    })

# Passing columns= keeps the original column order (and the columns themselves
# when there are no locations); the default integer index matches the old
# range(len(locations))
location_map = pd.DataFrame(records, columns=map_columns)

location_map.head()

Unnamed: 0,Production_place,Count,Color,is_point,point_xs,point_ys,patch_xs,patch_ys,Size
0,Lugdunum,95,#4f8c00,True,536502.0,5741790.0,[],[],52.4702
1,Calagurris,15,#019b63,True,-218312.0,5206250.0,[],[],34.012
2,Turiaso,7,#dea822,True,-192063.0,5146430.0,[],[],26.3906
3,Osca,9,#c1a911,True,-45162.8,5181810.0,[],[],28.9037
4,Tarraco,6,#134d00,True,140046.0,5029940.0,[],[],24.8491


In [17]:
# Split the locations into area outlines (patches) and single markers (points)
df_patches = location_map[location_map['is_point'] == False]
df_points = location_map[(location_map['is_point'] == True)]

# Wrap each subset in a Bokeh data source
source_patches = ColumnDataSource(data=df_patches)
source_points = ColumnDataSource(df_points)

# Glyph definitions: filled outlines for areas, circles sized by the 'Size' column for points
patches = Patches(xs="patch_xs", ys="patch_ys",
                  fill_color="Color", fill_alpha=0.8,
                  line_color="Color", line_width=0.5)
points = Circle(x="point_xs", y="point_ys", size='Size',
                fill_color="Color", fill_alpha=.9)

# Figure with the terrain tile layer as background
location_map_plot = figure(
    plot_width=1000,
    plot_height=480,
    #active_scroll='wheel_zoom',
    x_range=(-1.0e6, 4.5e6),
    y_range=(5e6, 6e6),
)
location_map_plot.add_tile(STAMEN_TERRAIN)

# Patches are added before points, the same order as before
location_map_plot.add_glyph(source_patches, patches)
location_map_plot.add_glyph(source_points, points)

# Hover tool showing the place name and its coin count
location_map_plot.add_tools(HoverTool())
hover = location_map_plot.select({'type': HoverTool})
hover.tooltips = OrderedDict([
    ('Production place', '@Production_place'),
    ("Count", "@Count"),
])

#save(location_map_plot, filename='location_map.html')
show(location_map_plot)