In [1]:
import pandas as pd
import numpy as np
import doctest
import re
import sys
# Python 2 only: reload(sys) makes sys.setdefaultencoding available again so the
# default codec can be switched to UTF-8. Neither name exists on Python 3 (whose
# default is already UTF-8), so the workaround is skipped there instead of
# raising NameError on the first cell.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding("utf8")

## TODO:
* ~~Fix clean data~~
* Remove duplicates
* Keep only denarius and aureus coins, or only the coins that have a significant
  amount of data (inscription, denomination, date, authority, material,
  production place, state, subjects, and weight)

# Read in Data

In [15]:
def stringToList(x):
    '''
    Parameters
    ----------
    x: str
        String representation of a list: items separated by ", " and optionally
        wrapped in square or curly brackets, e.g. '[symbol, emperor/empress]'

    Returns
    -------
    Returns a list of the item strings. An empty list representation ('[]' or '')
    gives an empty list rather than a list holding one empty string.
    '''
    # strip() removes any run of bracket characters from both ends of the string
    inner = x.strip('[]{}')
    if not inner:
        # ''.split(", ") would return [''], i.e. a bogus single empty item
        return []
    return inner.split(", ")

def stringToListofDicts(string):
    '''
    Parameters
    ----------
    string: str
        Takes in a python string representation of a list of dictionaries with the
        dictionaries separated by '|', each key-value pair separated by ';', and
        the keys and values separated by ':'

    Returns
    -------
    Returns a list of dictionaries, one per '|'-separated part. Pieces that hold
    no ':' (for example the empty piece left by a trailing ';') are skipped, so
    an empty input string gives [{}].

    Example
    -------
    stringToListofDicts('Inscription Type:inscription;Inscription Position:reverse;Inscription Language:Latin;Inscription Transliteration:ODE;')
    [{'Inscription Language': 'Latin',
      'Inscription Position': 'reverse',
      'Inscription Transliteration': 'ODE',
      'Inscription Type': 'inscription'}]
    '''
    result = []

    for part in string.split('|'):
        dic = {}
        for subpart in part.split(';'):
            # Split on the first ':' only, so a value that itself contains a
            # colon is kept whole instead of being cut at the second colon.
            key, separator, value = subpart.partition(':')
            if not separator:
                # Not a key:value pair; skip it explicitly rather than relying
                # on a swallowed IndexError.
                continue
            dic[key] = value
        result.append(dic)

    return result

# Sanity check: parse one sample inscription string; the resulting list of dicts is the cell output.
stringToListofDicts('Inscription Type:inscription;Inscription Position:reverse;Inscription Language:Latin;Inscription Transliteration:ODE;')

[{'Inscription Language': 'Latin',
  'Inscription Position': 'reverse',
  'Inscription Transliteration': 'ODE',
  'Inscription Type': 'inscription'}]

In [16]:
# Load the raw export. The converters turn the stringified list columns (and the
# '|'/';'/':' encoded inscriptions) into Python lists while the file is parsed.
df = pd.read_csv('AugustusCoins_44BC-14AD.csv',
                 converters={"Authority": stringToList, 'Associated names': stringToList,
                             'Subjects': stringToList, 'Inscriptions': stringToListofDicts})
# 'Unnamed: 0' is presumably the index written by an earlier to_csv -- confirm.
# axis is passed by keyword: the positional form was removed in pandas 2.0.
df = df.drop('Unnamed: 0', axis=1)
# Missing values become empty strings so the string cleaners below can run on every cell.
df = df.fillna('')
df.head()

Unnamed: 0,Associated names,Authority,Bibliography,Culture/period,Curator's comments,Date,Denomination,Description,Inscriptions,Materials,Museum number,Object type,Production place,State,Subjects,Weight (g),url
0,[Portrait of: Augustus (Octavian)],[Ruler: Augustus (Octavian)],RPC1 1801,,,27BC-14,,Bronze coin.(obverse) Head of Augustus r. (rev...,"[{u'Inscription Position': u'reverse', u'Inscr...",bronze,19971206.1,coin,"Minted in: Odessus (Europe,Balkans,Bulgaria,Va...",Roman Empire,"[symbol, emperor/empress]",7.45,http://www.britishmuseum.org/research/collecti...
1,[Portrait of: Augustus (Octavian)],[Ruler: Augustus (Octavian)],RPC1 5476,Roman Provincial,,27BC-14,,Bronze coin.(obverse) Head of Augustus r. (rev...,"[{u'Inscription Script': u'Latin', u'Inscripti...",bronze,20010335.3,coin,,Roman Empire,"[symbol, emperor/empress]",2.27,http://www.britishmuseum.org/research/collecti...
2,[Named in inscription & portrayed: Julius Caes...,[Ruler: Augustus (Octavian)],RPC1 2007,Roman Provincial,,31BC-14,,"Alloy coin.(obverse) Diademed head of Caesar, ...","[{u'Inscription Position': u'reverse', u'Inscr...",alloy,G.1200,coin,"Minted in: Apamea (Asia,Turkey,Marmara Region,...",Roman Empire,[emperor/empress],8.36,http://www.britishmuseum.org/research/collecti...
3,[],[Ruler: Augustus (Octavian) (?)],,Greek,,27BC-14,,Alloy coin.,[{}],alloy,19051111.1,coin,"Minted in: Cremna (?) (Asia,Turkey,Mediterrane...",,[],1.49,http://www.britishmuseum.org/research/collecti...
4,[Representation of: Augustus (Octavian)],[],Walker & Higgs 2001 308 Gem 3396,Roman Republican,The portrait probably dates from the later 30s...,44BC-40BC,,Seal of glass paste imitating sard: with a bus...,[{}],glass,19230401.928,seal,,,[],,http://www.britishmuseum.org/research/collecti...


# Clean Data

In [17]:
def cleanString(string):
    '''
    Parameters
    ----------
    string: str
        String that needs to be cleaned

    Returns
    -------
    Removes the uncertainty markers '(?)' and '(probably)', drops the descriptor
    in front of the first ':' (unless the string contains 'http'), and returns
    the remaining (assumed) description without surrounding whitespace.
    Only the first ':' is treated as the descriptor separator, so any later
    colons stay in the returned text.

    Doctests
    --------
    >>> cleanString('Named in inscription & portrayed: Julius Caesar (probably)')
    'Julius Caesar'
    >>> cleanString('Ruler: Augustus (Octavian) (?)')
    'Augustus (Octavian)'
    >>> cleanString('dupondius    (?)          ')
    'dupondius'
    >>> cleanString('        http://www.google.com')
    'http://www.google.com'
    '''
    result = string.replace('(?)', "").replace('(probably)', "")
    # Strings containing 'http' are left whole: their ':' belongs to the URL scheme.
    if ':' in result and 'http' not in result:
        # maxsplit=1 keeps everything after the descriptor, even further colons
        result = result.split(':', 1)[1]
    return result.strip()

#doctest.testmod()
#cleanString('Ruler: Augustus (Octavian) (?)')
#cleanString('dupondius (?)')

In [18]:
def cleanList(lst):
    '''
    Parameters
    ----------
    lst: Python list
        Strings that each need cleaning

    Returns
    -------
    New list holding the result of cleanString for every item, in the same order

    Doctests
    --------
    >>> cleanList(['Ruler: Augustus (Octavian) (?)', 'Moneyer: P Lurius Aggrippa'])
    ['Augustus (Octavian)', 'P Lurius Aggrippa']
    >>> cleanList(['symbol', 'emperor/empress'])
    ['symbol', 'emperor/empress']
    '''
    tidy = []
    for entry in lst:
        tidy.append(cleanString(entry))
    return tidy

#cleanList(['Ruler: Augustus (Octavian) (?)', 'Moneyer: P Lurius Aggrippa'])

In [19]:
def dateRange(date):
    '''
    Parameter
    ---------
    date: str
        Date range given as a string, with the years separated by '-'.
        A year containing 'BC' is treated as a BC year.

    Returns
    -------
    Returns list of the years in the range as ints, BC years being negative.
    If a piece holds no digits it is printed and parsing stops, so the list
    then only holds the years parsed before that piece.

    Doctests
    --------
    >>> dateRange('27BC-14 (?)')
    [-27, 14]
    >>> dateRange('44BC (circa) -40BC')
    [-44, -40]
    >>> dateRange('4-14')
    [4, 14]
    '''
    result = []

    for year in date.split('-'):
        bc = 'BC' in year
        try:
            # Keep only the digits; everything else ('BC', '(?)', spaces) is noise
            year = int(re.sub(r'[^\d]', '', year))
        except ValueError:
            # No digits left: show the offending piece and stop parsing
            print(year)
            break
        if bc:
            year = -year
        result.append(year)

    return result
        
# Run the doctests found in the docstrings of the current module; the TestResults summary is the cell output.
doctest.testmod()
#dateRange('27BC-14 (?)')

TestResults(failed=0, attempted=9)

In [20]:
def float_conversion(x):
    '''
    Parameter
    ---------
    x: str
        Input value

    Return
    ------
    Returns the value converted to float, or 0 when it cannot be converted
    (for example an empty string or None)
    '''
    try:
        return float(x)
    except (TypeError, ValueError, OverflowError):
        # Only conversion failures fall back to 0; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return 0

In [34]:
# Keep only rows whose object type is 'coin' and whose date does not contain 'stC'
# (presumably century-style dates that dateRange cannot parse -- confirm).
# The object type is stripped first so the match does not depend on trailing
# whitespace in the raw data.
mask = (df['Object type'].str.strip() == 'coin') & (df['Date'].str.find('stC') == -1)
filtered = df[mask]
cleaned = pd.DataFrame()

# Columns grouped by the cleaner that applies to them
lists = ['Authority', 'Subjects']
strings = ['Museum number', 'Denomination', 'Description', 'State', 'Culture/period', 'Materials',
            'Production place', 'Curator\'s comments', 'Bibliography', 'Object type', 'url']
floats = ['Weight (g)']
dates = ['Date']

for col in lists:
    cleaned[col] = filtered[col].apply(cleanList)
for col in strings:
    cleaned[col] = filtered[col].apply(cleanString)
for col in floats:
    # Any NaN left after the conversion is flagged with -1
    cleaned[col] = filtered[col].apply(float_conversion).replace(np.nan, -1)
for col in dates:
    cleaned[col] = filtered[col].apply(dateRange)

# Order the columns alphabetically. reindex(columns=...) replaces reindex_axis,
# which was deprecated in pandas 0.21 and removed in 1.0.
cleaned = cleaned.reindex(columns=sorted(cleaned.columns))
cleaned.head()

Unnamed: 0,Authority,Bibliography,Culture/period,Curator's comments,Date,Denomination,Description,Materials,Museum number,Object type,Production place,State,Subjects,Weight (g),url
0,[Augustus (Octavian)],RPC1 1801,,,"[-27, 14]",,Bronze coin.(obverse) Head of Augustus r. (rev...,bronze,19971206.1,coin,"Odessus (Europe,Balkans,Bulgaria,Varna,Odessus)",Roman Empire,"[symbol, emperor/empress]",7.45,http://www.britishmuseum.org/research/collecti...
1,[Augustus (Octavian)],RPC1 5476,Roman Provincial,,"[-27, 14]",,Bronze coin.(obverse) Head of Augustus r. (rev...,bronze,20010335.3,coin,,Roman Empire,"[symbol, emperor/empress]",2.27,http://www.britishmuseum.org/research/collecti...
2,[Augustus (Octavian)],RPC1 2007,Roman Provincial,,"[-31, 14]",,"Alloy coin.(obverse) Diademed head of Caesar, ...",alloy,G.1200,coin,"Apamea (Asia,Turkey,Marmara Region,Bursa (prov...",Roman Empire,[emperor/empress],8.36,http://www.britishmuseum.org/research/collecti...
3,[Augustus (Octavian)],,Greek,,"[-27, 14]",,Alloy coin.,alloy,19051111.1,coin,"Cremna (Asia,Turkey,Mediterranean Region (Tur...",,[],1.49,http://www.britishmuseum.org/research/collecti...
6,[Augustus (Octavian)],,Greek,,"[-27, 14]",,Alloy coin.,alloy,19360220.13,coin,"Tyndaris (Europe,Italy,Sicily,Messina (provinc...",,[],7.41,http://www.britishmuseum.org/research/collecti...


In [35]:
# Write only the bibliography reference and the url of every cleaned row to test.csv
cleaned.loc[:, ['Bibliography', 'url']].to_csv('test.csv')

In [45]:
# Subjects of the row labelled 286 among the rows whose bibliography is 'RPC1 289/9'
cleaned.loc[cleaned.Bibliography == 'RPC1 289/9', 'Subjects'].loc[286]

['equestrian', 'emperor/empress']

In [48]:
# Select every row with exactly this description. The original cell had a dangling
# '&' with no left-hand condition, which is a SyntaxError; only the condition that
# was actually present is kept.
# NOTE(review): a first condition may have been deleted here -- confirm the intended filter.
cleaned[cleaned.Description == 'Copper alloy coin.(obverse) Head of Augustus, laureate, right. (reverse) Horseman, right, with spear.']#.to_csv('unique_desc.csv')

Unnamed: 0,Authority,Bibliography,Culture/period,Curator's comments,Date,Denomination,Description,Materials,Museum number,Object type,Production place,State,Subjects,Weight (g),url
