In [3]:
import json
from pprint import pprint
import tinydb as tdb

In [4]:
# Open the TinyDB store backed by the JSON file at this relative path.
db = tdb.TinyDB('data/tinyDB.json')

In [5]:
# Handles to three named tables of the database. Of these, only `osm` is
# queried in the visible cells of this notebook.
osm = db.table('osm')
merimee = db.table('merimee')
wikidata = db.table('wikidata')

In [6]:
# Fetch every record of the `osm` table; later cells index into this
# sequence by position.
dataOSM = osm.all()

In [7]:
# Peek at the first record to see which keys an OSM entry carries.
dataOSM[0]

{'heritage': '3',
 'heritage:operator': 'mhs',
 'historic': 'memorial',
 'idOSM': 21609292,
 'image': "http://phototheque.1000wallpapers.com/galleries/Nature/p/Pa+%E0+Pl/Parc+de+la+T%EAte+d'Or/Ile+du+souvenir/Monument+aux+morts/large/Monument+aux+morts+5.jpg",
 'mhs:inscription_date': '1982',
 'name': "Monument aux morts de l'île du Souvenir",
 'ref:mhs': 'PA00117982',
 'typeOSM': 'node',
 'wikidata': 'Q17341711',
 'wikipedia': "fr:Monument aux morts de l'île du Souvenir"}

In [8]:
# Group the positions of the OSM records (indices into dataOSM) by the
# Merimee reference found under their 'ref:mhs' key.
dicoMerimeeIndice = {}
for position, record in enumerate(dataOSM):
    dicoMerimeeIndice.setdefault(record['ref:mhs'], []).append(position)

# Keep only the references shared by two or more OSM records, as
# (reference, [positions]) pairs.
listOfDuplicates = [
    (ref, positions)
    for ref, positions in dicoMerimeeIndice.items()
    if len(positions) >= 2
]

listOfDuplicates[:2]

[('PA00103673', [24381, 24382]), ('PA00102859', [774, 1229])]

In [9]:
# Number of Merimee references that appear on more than one OSM record.
len(listOfDuplicates)

488

In [10]:
# Total number of OSM records loaded, for comparison with the count above.
len( dataOSM )

32246

In [144]:
def getWikiCode(itemUrl):
    """Return a Markdown link to ``itemUrl`` labelled with its last path segment.

    The label is whatever follows the final '/' in ``itemUrl`` (the whole
    string when it contains no '/', an empty string when it ends with '/').
    """
    label = itemUrl.rsplit('/', 1)[-1]
    return '[%s](%s)' % (label, itemUrl)

def formatOSMInfo( info ):
    """Format one OSM record as a single Markdown line.

    The line has the form ``<osm link> <wikidata link> - <name>`` followed
    by three trailing spaces.

    Parameters
    ----------
    info : mapping
        Must provide 'typeOSM' and 'idOSM'. 'name' is optional ('...' is
        used when absent) and so is 'wikidata' (the Wikidata link is left
        empty when absent).

    Returns
    -------
    str
        The formatted line, without a newline.
    """
    kind = info['typeOSM']
    ident = info['idOSM']
    label = info.get('name', '...')

    url = 'https://www.openstreetmap.org/%s/%s' % (kind, ident)
    osmPart = '[%s%s](%s)' % (kind, ident, url)

    # The Wikidata part stays empty unless the record carries an item id.
    wikidataPart = ''
    if 'wikidata' in info:
        itemUrl = 'https://www.wikidata.org/wiki/%s' % info['wikidata']
        wikidataPart = '- [.wd.](%s)' % itemUrl

    return '%s %s - %s   ' % ( osmPart, wikidataPart, label)

# Build the Markdown report: one section per duplicated Merimee reference,
# with a bold heading followed by one formatted line per OSM record.
#
# The loop variable is deliberately NOT named `merimee`: that name is bound
# to the `merimee` table handle earlier in the notebook and must not be
# overwritten. Sections are collected in a list and joined once, instead of
# growing a string with `+=` on every iteration.
pageSections = []
for merimeeRef, duplicateIndices in listOfDuplicates:
    heading = '**%s**:   \n ' % merimeeRef
    body = '\n'.join( formatOSMInfo(dataOSM[i]) for i in duplicateIndices )
    pageSections.append(heading + body + '\n\n')

page = ''.join(pageSections)

In [145]:
# Save the report. The encoding is set explicitly because the text contains
# non-ASCII characters (accented names); without it the result depends on
# the platform's default locale encoding.
with open('output.txt', 'w', encoding='utf-8') as outputFile:
    outputFile.write( page )

In [None]:
# Show the generated report text in the notebook output.
print(page)

## Export the duplicates to CSV

In [21]:
def formatOSMInfoCSV( merimee, info ):
    """Return one CSV row (without line terminator) describing an OSM record.

    Columns, in order: Merimee reference, OSM type, OSM id, OSM URL,
    Wikidata URL, name.

    Fields are serialised with the standard ``csv`` writer, so a value that
    contains a comma, a double quote or a newline is quoted/escaped instead
    of silently breaking the row. Values without such characters produce
    exactly the same text as a plain ``','.join`` would.

    Parameters
    ----------
    merimee : str
        Merimee reference shared by the duplicated records.
    info : mapping
        Must provide 'typeOSM' and 'idOSM'. 'name' is optional ('...' is
        used when absent) and so is 'wikidata' (empty column when absent).

    Returns
    -------
    str
        The CSV-encoded row, without a trailing newline.
    """
    # Imported locally and under an alias: the notebook rebinds the global
    # name `csv` to the output text in a later cell.
    import csv as csvModule
    import io

    osmtype = info['typeOSM']
    desc = info['name'] if 'name' in info else '...'
    idOSM = info['idOSM']

    osmUrl = 'https://www.openstreetmap.org/%s/%s' % (osmtype, idOSM)

    if 'wikidata' in info:
        wikidataUrl = 'https://www.wikidata.org/wiki/%s' % info['wikidata']
    else:
        wikidataUrl = ''

    # lineterminator='' keeps the historical contract: the caller adds '\n'.
    buffer = io.StringIO()
    writer = csvModule.writer(buffer, lineterminator='')
    writer.writerow( (merimee, osmtype, str(idOSM), osmUrl, wikidataUrl, desc) )
    return buffer.getvalue()

In [26]:
# Assemble the CSV text: a header line, then one line per OSM record of
# every duplicated Merimee reference.
#
# The loop variable is deliberately NOT named `merimee`, so the `merimee`
# table handle bound earlier in the notebook is not overwritten. Rows are
# joined in a single pass rather than appended with `+=`.
#
# NOTE(review): the header separates names with ", " while data rows use a
# bare ",", so a strict parser sees column names with a leading space. It is
# left untouched here in case a consumer relies on the exact header - confirm.
csv = 'merimee, osmtype, idOSM, osmUrl, wikidataUrl, desc\n'
csv += ''.join(
    formatOSMInfoCSV(merimeeRef, dataOSM[i]) + '\n'
    for merimeeRef, duplicateIndices in listOfDuplicates
    for i in duplicateIndices
)

# Explicit encoding: the names contain non-ASCII characters, and the default
# encoding of open() depends on the platform locale.
with open('duplicateOSM.csv', 'w', encoding='utf-8') as outputFile:
    outputFile.write( csv )