In [2]:
from sqlalchemy import types
from StringIO import StringIO
import sqlite3
import CleanData as cd
import numpy as np
import pandas as pd

# Clean Data

In [3]:
# Dataset name; the cells below use it to build the British Museum CSV path
# ('../Data/BM_<title>.csv') and the SQLite file name ('../Data/<title>.sqlite').
title = 'Roman_Imperial_Coinage'

### British Museum

In [15]:
# Load the British Museum export; the listed columns are parsed into Python
# structures by CleanData converters while the CSV is read.
bm_csv_path = '../Data/BM_' + title + '.csv'
bm_converters = {
    "Authority": cd.stringToList(),
    'Associated names': cd.stringToList(),
    'Subjects': cd.stringToList(),
    'Inscriptions': cd.stringToListofDicts('|', ';', ':'),
}
bm_raw = pd.read_csv(bm_csv_path, converters=bm_converters)

# Append whatever cd.cleanInscriptions derives from the frame (presumably the
# 'Obverse legend' / 'Reverse legend' columns used later -- TODO confirm), then:
#   * drop the 'Unnamed: 0' column (presumably a saved index),
#   * replace NaN with the empty string,
#   * shorten 'Culture/period' to 'Culture'.
bm_df = (
    pd.concat([bm_raw, cd.cleanInscriptions(bm_raw)], axis=1)
    .drop('Unnamed: 0', axis=1)
    .replace(np.nan, '', regex=True)
    .rename(columns={'Culture/period': 'Culture'})
)

In [16]:
# Keep only rows whose object type is exactly 'coin ' (the trailing space is
# part of the compared value) and whose date text does not contain 'stC'.
is_coin = bm_df['Object type'] == 'coin '
no_stc_in_date = bm_df['Date'].str.find('stC') == -1
mask = is_coin & no_stc_in_date
filtered = bm_df[mask]

# Column groups handed to cd.cleanDF, one group per kind of treatment.
lists = [
    'Authority',
    'Subjects',
    'Associated names',
    'Obverse legend',
    'Reverse legend',
]
strings = [
    'Museum number',
    'Denomination',
    'Description',
    'State',
    'Culture',
    'Materials',
    "Curator's comments",
    'Bibliography',
    'Object type',
    "Production place",
]
floats = ['Weight (g)']
dates = ['Date']
redundant_notes = ['Production place', 'Denomination']
do_nothing = ['url', 'Inscriptions']
duplicate_cols = [
    'Authority',
    'Date',
    'Production place',
    'Description',
    'Subjects',
    "Curator's comments",
    'Obverse legend',
    'Reverse legend',
]

# Clean the filtered frame, discard the raw 'Inscriptions' column and tag
# every row with its source collection.
cleaned_bm = cd.cleanDF(
    filtered, lists, strings, floats, dates,
    redundant_notes, do_nothing, duplicate_cols,
).drop('Inscriptions', axis=1)
cleaned_bm['Source'] = 'British Museum'

In [17]:
# Prepare the British Museum frame for SQLite: flatten lists to strings,
# decode byte strings, rename the columns to the table's camelCase names and
# derive startDate / endDate from the date text.

# List-valued columns are flattened by cd.listToString(); in the displayed
# output the result is a ', '-separated string such as "360, 363".
lists = ['Authority', 'Subjects', 'Associated names', 'Obverse legend', 'Reverse legend', 'Date']
for col in lists:
    cleaned_bm[col] = cleaned_bm[col].apply(cd.listToString())

columns = [u'Associated names', u'Authority', u'Bibliography', u'Culture',
           u'Curator\'s comments', u'Date', u'Denomination', u'Description',
           u'Materials', u'Museum number', u'Object type', u'Obverse legend',
           u'Production place', u'Reverse legend', u'State', u'Subjects',
           u'url', u'Source']

# Decode only genuine byte strings. Values that are already text (or are not
# strings at all, e.g. a NaN float) pass through untouched instead of raising
# UnicodeEncodeError / AttributeError as an unconditional .decode() would.
for col in columns:
    cleaned_bm[col] = cleaned_bm[col].apply(
        lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

# Positional rename: this assumes the frame's columns arrive in exactly this
# order (the names above, with the weight column between subjects and url).
# NOTE(review): the order comes from cd.cleanDF -- verify if that helper changes.
cleaned_bm.columns = ['associatedNames', 'authority', 'bibliography', 'culture',
                      'curatorComment', 'date', 'denomination', 'description',
                      'materials', 'museumNumber', 'objectType', 'obverseLegend',
                      'mint', 'reverseLegend', 'state', 'subjects',
                      'weight', 'url', 'source']

# Split the date text once and strip each piece: the pieces are separated by
# ", ", so a bare split(",") would leave the end date as " 363" (leading space).
# A date with a single piece is used as both start and end.
date_parts = cleaned_bm['date'].apply(
    lambda text: [part.strip() for part in text.split(",")])
cleaned_bm['startDate'] = date_parts.apply(lambda parts: parts[0])
cleaned_bm['endDate'] = date_parts.apply(
    lambda parts: parts[1] if len(parts) > 1 else parts[0])

In [18]:
# Preview the last rows to eyeball the renamed columns and the derived
# startDate / endDate values.
cleaned_bm.tail()

Unnamed: 0,associatedNames,authority,bibliography,culture,curatorComment,date,denomination,description,materials,museumNumber,...,obverseLegend,mint,reverseLegend,state,subjects,weight,url,source,startDate,endDate
863,Julian (the Apostate),Julian (the Apostate),HXN 1 161.2,Roman Imperial,,"360, 363",?,Silver coin.(obverse) Head of Julian r. (rever...,silver,19940401161.2,...,[...]C L IVLIA [...],Lyon,VOTIS V MVLTIS X,Roman Empire,"symbol, emperor/empress",1.07,http://www.britishmuseum.org/research/collecti...,British Museum,360,363
864,Julian (the Apostate),Julian (the Apostate),HXN 1 160.4,Roman Imperial,,"360, 363",?,Silver coin.(obverse) Head of Julian r. (rever...,silver,19940401160.4,...,[...]C L IVLIANVS P[...],Lyon,VOTIS V MVLTIS X,Roman Empire,"symbol, emperor/empress",1.2,http://www.britishmuseum.org/research/collecti...,British Museum,360,363
865,Julian (the Apostate),Julian (the Apostate),HXN 1 159.2,Roman Imperial,,"360, 363",?,Silver coin.(obverse) Head of Julian r. (rever...,silver,19940401159.2,...,D N C L IVL[...],Trier,VOTIS V MVLTIS X,Roman Empire,"symbol, emperor/empress",1.11,http://www.britishmuseum.org/research/collecti...,British Museum,360,363
866,Julian (the Apostate),Julian (the Apostate),HXN 1 158.4,Roman Imperial,,"360, 363",?,Silver coin.(obverse) Head of Julian r. (rever...,silver,19940401158.4,...,[...]C L IVLIANVS AVG,Trier,VOTIS V MVLTIS X,Roman Empire,"symbol, emperor/empress",1.32,http://www.britishmuseum.org/research/collecti...,British Museum,360,363
867,Julian (the Apostate),Julian (the Apostate),HXN 1 163.5,Roman Imperial,,"360, 363",?,Silver coin.(obverse) Head of Julian r. (rever...,silver,19940401163.5,...,ILLEGIBLE,Trier,VOTIS V MVLTIS X,Roman Empire,"symbol, emperor/empress",0.96,http://www.britishmuseum.org/research/collecti...,British Museum,360,363


### American Numismatic Society

In [11]:
# Load the OCRE export; each listed column gets its own '|'-based
# cd.stringToList converter while the CSV is read.
ans_list_columns = ["Authority", 'Year', 'Issuer', 'Portrait', 'Reference']
ans_converters = {name: cd.stringToList('|') for name in ans_list_columns}
ans_df = pd.read_csv('../Data/OCRE.csv', converters=ans_converters)
ans_df = ans_df.replace(np.nan, '', regex=True)


def _describe_row(row):
    """Return cd.makeDescription(...) for one row's material, manufacture and types."""
    return cd.makeDescription(
        row['Material'],
        row['Manufacture'],
        row['Obverse Type'],
        row['Reverse Type'],
    )


def _dup_check_row(row):
    """Return cd.makeDupCheckCol(...) for one row; later passed to cd.cleanDF as duplicate_cols."""
    return cd.makeDupCheckCol(
        row['Material'],
        row['Denomination'],
        row['Portrait'],
        row['Mint'],
        row['Year'],
    )


# Both derived columns are computed row by row.
ans_df['Description'] = ans_df.apply(_describe_row, axis=1)
ans_df['DupCheck'] = ans_df.apply(_dup_check_row, axis=1)
ans_df.tail()

Unnamed: 0.1,Unnamed: 0,URI,Title,RecordId,Authority,Degree,Deity,Denomination,Dynasty,Engraver,...,Object Type,Portrait,Reference,Region,Reverse Legend,Reverse Type,Year,Date Record Modified,Description,DupCheck
42747,42747,http://numismatics.org/ocre/id/ric.2.hdn.1015,RIC II Hadrian 1015,ric.2.hdn.1015,[Hadrian],,,Quadrans,,,...,Coin,[],[],Dalmatia,METAL DELM,Stag standing,"[134, 138]",2017-11-28T08:46:13Z,"Struck Bronze. (obverse) Head of youth, laurea...","Bronze Quadrans [''] Uncertain value ['134', '..."
42748,42748,http://numismatics.org/ocre/id/ric.2.hdn.1016,RIC II Hadrian 1016,ric.2.hdn.1016,[Hadrian],,Roma,Quadrans,,,...,Coin,[Hadrian],[],,DARDANICI,"Woman, draped, standing left, holding corn-ear...","[134, 138]",2017-11-28T08:46:13Z,"Struck Bronze. (obverse) Bust of Roma, helmete...",Bronze Quadrans ['Hadrian'] Uncertain value ['...
42749,42749,http://numismatics.org/ocre/id/ric.2.hdn.1013,RIC II Hadrian 1013,ric.2.hdn.1013,[Hadrian],,Diana,Quadrans,,,...,Coin,[Hadrian],[],Dalmatia,METAL DELM,Goat standing left,"[134, 138]",2017-11-28T08:46:13Z,"Struck Bronze. (obverse) Bust of Diana, draped...",Bronze Quadrans ['Hadrian'] Uncertain value ['...
42750,42750,http://numismatics.org/ocre/id/ric.2.hdn.1011B,RIC II Hadrian 1011B,ric.2.hdn.1011B,[Hadrian],,,Quadrans,,,...,Coin,[Hadrian],[],Noricum,MET NOR,Legend within laurel wreath,"[134, 138]",2017-11-28T08:46:13Z,"Struck Bronze. (obverse) Bust of Hadrian, laur...",Bronze Quadrans ['Hadrian'] Uncertain value ['...
42751,42751,http://numismatics.org/ocre/id/ric.2.hdn.1014,RIC II Hadrian 1014,ric.2.hdn.1014,[Hadrian],,Mars,Quadrans,,,...,Coin,[],[],Dalmatia,METAL DELM,Cuirass,"[134, 138]",2017-11-28T08:46:13Z,"Struck Bronze. (obverse) Bust of Mars, helmete...","Bronze Quadrans [''] Uncertain value ['134', '..."


In [12]:
# Column groups handed to cd.cleanDF for the OCRE data.
lists = [
    'Year',
    'Authority',
    'Issuer',
    'Portrait',
    'Reference',
]
strings = [
    'Deity',
    'Denomination',
    'Mint',
    'Description',
    'Obverse Legend',
    'Reverse Legend',
]
floats = []
dates = []
redundant_notes = []
do_nothing = ['URI', 'DupCheck']
duplicate_cols = 'DupCheck'

# Clean the frame (the production-place column here is 'Mint'), tag the
# source, then discard the helper 'DupCheck' column.
cleaned_ans = cd.cleanDF(
    ans_df, lists, strings, floats, dates,
    redundant_notes, do_nothing, duplicate_cols,
    production_place='Mint',
)
cleaned_ans['Source'] = 'Online Coins of the Roman Empire'
cleaned_ans.drop(['DupCheck'], axis=1, inplace=True)

# Only the final expression of a cell is displayed, so the tail() result is
# discarded and the column index is what the cell shows.
cleaned_ans.tail()
cleaned_ans.columns

Index([u'Authority', u'Deity', u'Denomination', u'Description', u'Issuer',
       u'Mint', u'Obverse Legend', u'Portrait', u'Reference',
       u'Reverse Legend', u'URI', u'Year', u'Source'],
      dtype='object')

In [13]:
# Prepare the OCRE frame for SQLite: flatten lists to strings, derive
# StartDate / EndDate from the year text and decode byte strings.

# List-valued columns are flattened by cd.listToString(); in the displayed
# output the result is a ', '-separated string such as "159, 160".
lists = ['Year', 'Authority', 'Issuer', 'Portrait', 'Reference']
for col in lists:
    cleaned_ans[col] = cleaned_ans[col].apply(cd.listToString())

# Split the year text once and strip each piece: the pieces are separated by
# ", ", so a bare split(",") would leave the end date as " 160" (leading space).
# A year with a single piece is used as both start and end.
year_parts = cleaned_ans['Year'].apply(
    lambda text: [part.strip() for part in text.split(",")])
cleaned_ans['StartDate'] = year_parts.apply(lambda parts: parts[0])
cleaned_ans['EndDate'] = year_parts.apply(
    lambda parts: parts[1] if len(parts) > 1 else parts[0])

columns = [u'Authority', u'Deity', u'Denomination', u'Description', u'Issuer',
           u'Mint', u'Obverse Legend', u'Portrait', u'Reference',
           u'Reverse Legend', u'URI', u'Year', u'Source', 'StartDate', 'EndDate']

# Positional (re)assignment of the column names; it raises if the frame does
# not have exactly these 15 columns.
cleaned_ans.columns = columns

# Decode only genuine byte strings. Floats (e.g. NaN) and values that are
# already text pass through untouched instead of raising on .decode().
for col in columns:
    cleaned_ans[col] = cleaned_ans[col].apply(
        lambda x: x.decode('utf-8') if isinstance(x, bytes) else x)

In [14]:
# Preview the last rows to eyeball the flattened columns and the derived
# StartDate / EndDate values.
cleaned_ans.tail()

Unnamed: 0,Authority,Deity,Denomination,Description,Issuer,Mint,Obverse Legend,Portrait,Reference,Reverse Legend,URI,Year,Source,StartDate,EndDate
9297,Antoninus Pius,Virtus,As,Struck Bronze. (obverse) Head of Marcus Aureli...,,Rome,AVRELIVS CAESAR ANTONINI AVG PII FIL,Marcus Aurelius,,TR POT XV COS II VIRTVS S C,http://numismatics.org/ocre/id/ric.3.ant.1357_as,"159, 160",Online Coins of the Roman Empire,159,160
9298,Antoninus Pius,Virtus,Dupondius,Struck Bronze. (obverse) Head of Marcus Aureli...,,Rome,AVRELIVS CAESAR ANTONINI AVG PII FIL,Marcus Aurelius,,TR POT XV COS II VIRTVS S C,http://numismatics.org/ocre/id/ric.3.ant.1357_...,"159, 160",Online Coins of the Roman Empire,159,160
9299,Antoninus Pius,Virtus,Sestertius,Struck Bronze. (obverse) Head of Marcus Aureli...,,Rome,AVRELIVS CAESAR ANTONINI AVG PII FIL,Marcus Aurelius,,TR P XV COS II VIRTVS S C,http://numismatics.org/ocre/id/ric.3.ant.1356,"159, 160",Online Coins of the Roman Empire,159,160
9300,Hadrian,,Quadrans,"Struck Bronze. (obverse) Bust of Hadrian, laur...",,Uncertainvalue,IMP CAESAR TRAIAN HADRIANVS AVG,Hadrian,,MET NOR,http://numismatics.org/ocre/id/ric.2.hdn.1011A,"134, 138",Online Coins of the Roman Empire,134,138
9301,Hadrian,,Quadrans,"Struck Bronze. (obverse) Head of youth, laurea...",,Uncertainvalue,,,,METAL DELM,http://numismatics.org/ocre/id/ric.2.hdn.1015,"134, 138",Online Coins of the Roman Empire,134,138


# Store Data

In [15]:
# Connect to the SQLite file named after the dataset and take a cursor that
# the CREATE TABLE / INSERT cells below share.
db_path = '../Data/' + title + '.sqlite'
cnx = sqlite3.connect(db_path)
cur = cnx.cursor()

In [20]:
# Create the table that receives cleaned_bm; the column order mirrors the
# frame's column order because the insert is positional.
# `weight` is declared REAL rather than INT: the weights are fractional grams
# (e.g. 1.07), and under INTEGER affinity SQLite would store whole-number
# weights as integers and fractional ones as reals, mixing storage classes.
# This statement raises if the table already exists in the database file.
cur.execute('''CREATE TABLE britishMuseum(associatedNames TEXT, authority TEXT, bibliography TEXT, culture TEXT,
                    curatorComment TEXT, dates TEXT, denomination TEXT, 
                    description TEXT, materials TEXT, museumNumber TEXT, objectType TEXT, obverseLegend TEXT,
                    mint TEXT, reverseLegend TEXT, state TEXT, subjects TEXT, weight REAL,
                    url TEXT, source TEXT, startDate INT, endDate INT)''')

<sqlite3.Cursor at 0x11bec0ce0>

In [21]:
# Bulk-insert every cleaned British Museum row. Values are bound by position,
# so the frame's column order must match the column list in the statement
# (the frame's 'date' column lands in the table's 'dates' column).
bm_insert_sql = """INSERT INTO britishMuseum (associatedNames, authority, bibliography, culture,
                        curatorComment, dates, denomination, description,
                        materials, museumNumber, objectType, obverseLegend,
                        mint, reverseLegend, state, subjects, weight,
                        url, source, startDate, endDate) VALUES(?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)"""
bm_rows = list(cleaned_bm.to_records(index=False))
cur.executemany(bm_insert_sql, bm_rows)

<sqlite3.Cursor at 0x11bec0ce0>

In [16]:
# Create the table that receives the cleaned OCRE rows. It declares a
# `weight` column alongside the columns present in cleaned_ans.
# This statement raises if the table already exists in the database file.
ans_create_sql = '''CREATE TABLE americanNumismaticSociety(authority TEXT, deity TEXT, denomination TEXT, description TEXT, 
                       issuer TEXT, mint TEXT, obverseLegend TEXT, portrait TEXT, reference TEXT, reverseLegend TEXT, 
                       url TEXT, weight INT, dates TEXT, source TEXT, startDate INT, endDate INT)'''
cur.execute(ans_create_sql)

<sqlite3.Cursor at 0x117645dc0>

In [53]:
# Bulk-insert every cleaned OCRE row.
#
# cleaned_ans has 15 columns and no weight data, so the table's `weight`
# column is left out of the statement (it stays NULL). Listing it, with a 16th
# placeholder, makes executemany fail because each row supplies only 15 values.
#
# Each pair is (DataFrame column, table column) in insert order. Selecting the
# frame columns by name keeps the binding correct even if the frame's column
# order changes, and raises KeyError if an expected column is missing.
ans_column_map = [
    ('Authority', 'authority'),
    ('Deity', 'deity'),
    ('Denomination', 'denomination'),
    ('Description', 'description'),
    ('Issuer', 'issuer'),
    ('Mint', 'mint'),
    ('Obverse Legend', 'obverseLegend'),
    ('Portrait', 'portrait'),
    ('Reference', 'reference'),
    ('Reverse Legend', 'reverseLegend'),
    ('URI', 'url'),
    ('Year', 'dates'),
    ('Source', 'source'),
    ('StartDate', 'startDate'),
    ('EndDate', 'endDate'),
]
ans_frame_columns, ans_table_columns = zip(*ans_column_map)

# The identifiers come from the fixed list above (not from user input); the
# row values themselves are still bound through '?' placeholders, one per column.
ans_insert_sql = 'INSERT INTO americanNumismaticSociety ({}) VALUES({})'.format(
    ', '.join(ans_table_columns),
    ','.join('?' * len(ans_table_columns)),
)
ans_rows = list(cleaned_ans[list(ans_frame_columns)].to_records(index=False))
cur.executemany(ans_insert_sql, ans_rows)

<sqlite3.Cursor at 0x117645dc0>

In [22]:
# Commit the pending transaction so the inserted rows are written to the
# SQLite file.
cnx.commit()