In [1]:
import pandas as pd
import numpy as np
import doctest
import CleanBMData as cleanBM
import BokehMaker as magicPlots

# Force the interpreter-wide default string encoding to UTF-8.
# NOTE(review): reload() as a builtin and sys.setdefaultencoding() look like
# Python 2-only idioms; this cell presumably cannot run on Python 3 - confirm.
import sys
# Remember the current stdout object so it can be put back after the reload.
stdout = sys.stdout
# NOTE(review): presumably the reload is what makes setdefaultencoding
# reachable again on the sys module - confirm.
reload(sys)
sys.setdefaultencoding('utf-8')
# Restore the stream saved above.
# NOTE(review): presumably reload(sys) replaces the notebook's stdout, which
# would otherwise silence later cell output - confirm.
sys.stdout = stdout

## TODO:
* Make function to create:
    * ~~Stacked bar graph~~
    * ~~Map~~
* Take data from [American Numismatic Society](http://numismatics.org/search/apis)
* Look at following types of coins for location, denomination, material, and subject:
    * separate 44-31, 30-27, 27-19, 18-16, 15-11 (ALL BCE) and 10BC-13AD
        * Star of Julius (Sidus Iulium) 
        * Capricorn and/or globe and/or rudder
        * Secular games
        * Julius Caesar
        * Apollo and/or lyre
        * Statue of Augustus
* Manually select colors for stacked bar graph to avoid repeated colors next to each other
* Predictive value of subjects, material, inscription
    * Predict what will have laurels

# Read in Data
Read in the data from the British Museum data scraper script with proper column names

In [2]:
# Per-column parsers applied by read_csv while loading. Each list-like column
# gets its own parser object, exactly as if it had been written out by hand.
column_converters = {col: cleanBM.stringToList()
                     for col in ('Authority', 'Associated names', 'Subjects')}
column_converters['Inscriptions'] = cleanBM.stringToListofDicts('|', ';', ':')

# Load the scraped CSV, discard the 'Unnamed: 0' column, and turn every NaN
# into an empty string.
bm_df = (pd.read_csv('AugustusCoins_44BC-14AD.csv', converters=column_converters)
           .drop('Unnamed: 0', axis=1)
           .replace(np.nan, '', regex=True))
bm_df.head()

Unnamed: 0,Associated names,Authority,Bibliography,Culture/period,Curator's comments,Date,Denomination,Description,Inscriptions,Materials,Museum number,Object type,Production place,State,Subjects,Weight (g),url
0,[Portrait of: Augustus (Octavian)],[Ruler: Augustus (Octavian)],RPC1 1801,,,27BC-14,,Bronze coin.(obverse) Head of Augustus r. (rev...,"[{u'Inscription Position': u'reverse', u'Inscr...",bronze,19971206.1,coin,"Minted in: Odessus (Europe,Balkans,Bulgaria,Va...",Roman Empire,"[symbol, emperor/empress]",7.45,http://www.britishmuseum.org/research/collecti...
1,[Portrait of: Augustus (Octavian)],[Ruler: Augustus (Octavian)],RPC1 5476,Roman Provincial,,27BC-14,,Bronze coin.(obverse) Head of Augustus r. (rev...,"[{u'Inscription Script': u'Latin', u'Inscripti...",bronze,20010335.3,coin,,Roman Empire,"[symbol, emperor/empress]",2.27,http://www.britishmuseum.org/research/collecti...
2,[Named in inscription & portrayed: Julius Caes...,[Ruler: Augustus (Octavian)],RPC1 2007,Roman Provincial,,31BC-14,,"Alloy coin.(obverse) Diademed head of Caesar, ...","[{u'Inscription Position': u'reverse', u'Inscr...",alloy,G.1200,coin,"Minted in: Apamea (Asia,Turkey,Marmara Region,...",Roman Empire,[emperor/empress],8.36,http://www.britishmuseum.org/research/collecti...
3,[],[Ruler: Augustus (Octavian) (?)],,Greek,,27BC-14,,Alloy coin.,[{}],alloy,19051111.1,coin,"Minted in: Cremna (?) (Asia,Turkey,Mediterrane...",,[],1.49,http://www.britishmuseum.org/research/collecti...
4,[Representation of: Augustus (Octavian)],[],Walker & Higgs 2001 308 Gem 3396,Roman Republican,The portrait probably dates from the later 30s...,44BC-40BC,,Seal of glass paste imitating sard: with a bus...,[{}],glass,19230401.928,seal,,,[],,http://www.britishmuseum.org/research/collecti...


# Clean Data
* Convert columns to proper data types
* Remove coins that do not have enough data
* Remove duplicates

In [3]:
# Keep only rows that are coins, whose date string does not contain 'stC',
# and that have a denomination, a production place, a bibliography entry and
# a weight other than 0.
mask = ((bm_df['Object type'] == 'coin ') & (bm_df['Date'].str.find('stC') == -1) &
        (bm_df['Denomination'] != '') & (bm_df['Production place'] != '') &
        (bm_df['Bibliography'] != '') & (bm_df['Weight (g)'] != 0))
filtered = bm_df[mask]
cleaning = pd.DataFrame()

# Columns grouped by the cleaning routine each group is passed through.
lists = ['Authority', 'Subjects', 'Associated names']
strings = ['Museum number', 'Denomination', 'Description', 'State', 'Culture/period', 'Materials',
           'Curator\'s comments', 'Bibliography', 'Object type']
floats = ['Weight (g)']
dates = ['Date']
redundant_notes = ['Production place', 'Denomination']
do_nothing = ['url', 'Inscriptions']

cleaning['Production place'] = filtered['Production place'].apply(cleanBM.cleanProductionPlace)
for lst in lists:
    cleaning[lst] = filtered[lst].apply(cleanBM.cleanList)
for string in strings:
    cleaning[string] = filtered[string].apply(cleanBM.cleanString)
for flot in floats:
    # Weights that come back as NaN from the conversion are stored as -1.
    cleaning[flot] = filtered[flot].apply(cleanBM.float_conversion).replace(np.nan, -1)
for date in dates:
    cleaning[date] = filtered[date].apply(cleanBM.dateRange)
# Second pass over columns that were already cleaned above, so this loop has
# to stay after the 'Production place' and string passes.
for col in redundant_notes:
    cleaning[col] = cleaning[col].apply(cleanBM.removeNotes)
for col in do_nothing:
    cleaning[col] = filtered[col]

# Alphabetical column order. reindex(columns=...) replaces reindex_axis,
# which is deprecated and absent from current pandas releases.
cleaning = cleaning.reindex(columns=sorted(cleaning.columns))

duplicate_cols = ['Authority', 'Date', 'Production place', 'Description', 'Subjects', "Curator's comments"]
removed_dup = (cleaning.drop_duplicates(subset=duplicate_cols)
                       .reset_index(drop=True))

# Drop rows located only as 'Gaul' (too vague) and rows whose denomination is
# 'unit'. The explicit copy makes the manual correction below write to an
# independent frame rather than to a slice of removed_dup.
keep_rows = (removed_dup['Production place'] != 'Gaul') & (removed_dup['Denomination'] != 'unit')
cleaned = removed_dup[keep_rows].copy()

# Manual correction of a single row; .at replaces the removed set_value.
# NOTE(review): label 387 is hard-coded and depends on the exact CSV contents
# and on the de-duplication above - confirm it still points at the intended coin.
cleaned.at[387, 'Production place'] = 'Lugdunum'
cleaned.tail()

Unnamed: 0,Associated names,Authority,Bibliography,Culture/period,Curator's comments,Date,Denomination,Description,Inscriptions,Materials,Museum number,Object type,Production place,State,Subjects,Weight (g),url
693,"(Augustus (Octavian), Nike/Victoria/Victory)","(Augustus (Octavian),)","RIC1 263, p.60 RR2 4342, p.12 RE1 616, p.101",Roman Imperial,Octavian's IMP CAESAR coinage was a celebratio...,"(-32, -29)",denarius,"Silver coin.(obverse) Victory, draped, standin...","[{u'Inscription Content': u'CAESAR DIVI F', u'...",silver,R.6163,coin,Italy,Roman Empire,"(charioteer/chariot, allegory/personification,...",3.78,http://www.britishmuseum.org/research/collecti...
694,"(Mark Antony, Octavia, Augustus (Octavian))","(M Oppius Capito, Mark Antony)","RPC1 1463 (type) RR2 154, p.518",Roman Republican,,"(-38, -37)",tressis,Copper alloy coin.(obverse) Busts of M. Antoni...,[{u'Inscription Content': u'[M·ANT·IMP·TERT·CO...,copper alloy,18600328.250,coin,Achaea,Roman Republic,"(politician/statesman, emperor/empress, boat/s...",21.51,http://www.britishmuseum.org/research/collecti...
695,"(Marcus Aemilius Lepidus, Augustus (Octavian))","(Marcus Aemilius Lepidus,)","Ghey, Leins & Crawford 2010 495.2.6 RRC 495/2a...",Roman Republican,Die appears to read IMA (ligatured) instead of...,"(-42,)",denarius,Silver coin.(obverse) Head of M. Lepidus right...,[{u'Inscription Content': u'LEPIDVS·PONT·MAX·I...,silver,20114027.3,coin,Italy,Roman Republic,"(politician/statesman, emperor/empress)",3.56,http://www.britishmuseum.org/research/collecti...
696,"(Augustus (Octavian), Eros/Cupid, Aphrodite/Ve...","(P Clodius,)","RR1 4277, p.583 Ghey, Leins & Crawford 2010 49...",Roman Republican,,"(-42,)",aureus,Gold coin; pierced for suspension.(obverse) He...,[{u'Inscription Content': u'C·CAESAR·III·VIR·R...,gold,18520903.11,coin,Rome,Roman Republic,"(cherub/cupid, politician/statesman, classical...",8.09,http://www.britishmuseum.org/research/collecti...
697,"(Augustus (Octavian), Tyche/Fortuna)","(Ti Sempronius Gracchus,)","RR1 4313, p.593 Ghey, Leins & Crawford 2010 52...",Roman Republican,,"(-40,)",aureus,"Gold coin.(obverse) Head of Octavian right, be...","[{u'Inscription Content': u'IIII·VIR·Q·D', u'I...",gold,18440425.473,coin,Rome,Roman Republic,"(politician/statesman, allegory/personificatio...",7.96,http://www.britishmuseum.org/research/collecti...


# Plot Data
* Plot all data together in plots
* Split up data and plot separately

### All data
* Make stacked bar plot
* Make map plot

In [4]:
from bokeh.io import output_notebook, save
from bokeh.plotting import show
from bokeh.models import Range1d, HoverTool
from bokeh.palettes import linear_palette, viridis, grey

In [5]:
# Direct Bokeh output to the notebook so later show() calls render inline.
output_notebook()

In [6]:
# Stacked bar chart: one bar per production place, stacked by denomination.
location_bar_plot = magicPlots.makeStackedBar(
    cleaned, 'Production place', 'Denomination',
    sort_bars=True, bars_ascending=False,
    sort_stacks=True, stacks_agg='sum', stacks_ascending=False,
    colors=viridis, title='Number of coins produced from each location')

# Hover read-out for each stack segment.
# NOTE(review): '@Denomination', '@height' and '@Sum' are presumably fields of
# the data source that makeStackedBar builds - confirm against BokehMaker.
hover_fields = [
    ('Denomination', '@Denomination'),
    ('Denomination Count', '@height'),
    ('Location Count', '@Sum'),
]
location_bar_plot.add_tools(HoverTool(tooltips=hover_fields))

# Axis label, fixed 0-200 y range (also used as the pan/zoom bounds), legend corner.
location_bar_plot.legend.location = 'top_right'
location_bar_plot.y_range = Range1d(0, 200, bounds=(0, 200))
location_bar_plot.yaxis.axis_label = 'Location Counts'

save(location_bar_plot, filename='location_bar.html')
show(location_bar_plot)

In [7]:
from bokeh.io import save, show
import pygeoj
from pyproj import Proj, transform

In [8]:
# Direct Bokeh output to the notebook (same call as in the earlier plotting cell).
output_notebook()

In [13]:
# Number of coins per production place, largest count first.
location_counts = (cleaned.groupby(['Production place'])
                          .size()
                          .reset_index(name='Count')
                          .sort_values(['Count'], ascending=False))


def _marker_size(count):
    """Marker size for a location: grows with the log of its coin count."""
    return 5 * np.log(3 * count)


location_map_plot = magicPlots.makeMap(
    location_counts, 'Production place', 'Count',
    x_ranges=(-2.0e6, 5e6), y_ranges=(3.5e6, 7e6),
    path='GeoJSON/', ext='html', pt_size=_marker_size)

#save(location_map_plot, filename='location_map.html')
show(location_map_plot)

### 44-31 BCE

In [45]:
def coinsFromDates(df, date_range, col_name='Date'):
    '''
    Select the coins whose date lies entirely inside a date range.

    Parameters
    ----------
    df : Pandas dataframe
        Dataframe containing coins and dates
    date_range : tuple
        Tuple of length two containing date range as (begin, end); both
        ends are inclusive
    col_name : str
        Column name of dates. Each cell is a tuple holding either a single
        year or a (start, end) pair of years

    Return
    ------
    Returns a dataframe containing only the rows that have the correct dates.
    Rows whose date tuple has any other length are never selected.
    '''
    begin, end = date_range[0], date_range[1]

    def intWithinTupleRange(tup):
        if len(tup) == 1:
            return begin <= tup[0] <= end
        if len(tup) == 2:
            # A (start, end) span matches when it sits fully inside the range.
            # The earlier version assigned False here, so spans never matched.
            return tup[0] >= begin and tup[1] <= end
        return False

    # Build the mask from the date column alone; astype(bool) keeps it a
    # boolean mask even when df has no rows.
    in_range = df[col_name].apply(intWithinTupleRange).astype(bool)
    return df[in_range]

def containKeyword(df, key, col_name):
    '''
    Select the rows whose text in one column contains a keyword.

    Parameters
    ----------
    df : Pandas dataframe
        Dataframe containing coins and column to look for keyword in
    key : str
        String to look for in each row of given column (case-insensitive)
    col_name : str
        Column of where to search for keyword

    Return
    ------
    Returns a dataframe containing only rows that have the keyword in the given
    column. Cells that are not strings never match.
    '''
    try:
        text_types = basestring  # Python 2: covers both str and unicode
    except NameError:
        text_types = str  # Python 3
    needle = key.lower()

    def containIn(obj):
        # Always return a real bool; the earlier version returned None for
        # non-strings, which produced a non-boolean mask.
        return isinstance(obj, text_types) and needle in obj.lower()

    # astype(bool) keeps the result a boolean mask even when df has no rows.
    has_key = df[col_name].apply(containIn).astype(bool)
    return df[has_key]

In [61]:
# Date ranges as (begin, end) years; negative values are BCE.
date_ranges = [(-44, -31), (-30, -27), (-27, -19), (-18, -16), (-15, -11), (-10, 13)]
# (keyword, column to search) pairs.
subjects = [('star', 'Description'), ('statue of augustus', 'Description')]
# Non-empty results keyed as 'df_<abs(begin year)>_<keyword>'.
good_dfs = {}

# The loop variable is named date_range so it does not overwrite the `dates`
# column list defined in the cleaning cell.
for date_range in date_ranges:
    # The date filter does not depend on the subject, so run it once per range.
    in_dates = coinsFromDates(cleaned, date_range)
    for keyword, column in subjects:
        subject_dates = containKeyword(in_dates, keyword, column)
        if not subject_dates.empty:
            df_name = 'df_' + str(abs(date_range[0])) + '_' + keyword
            good_dfs[df_name] = subject_dates

# Parenthesised form prints the same text on Python 2 and also runs on Python 3.
print(len(good_dfs))

5
