In [128]:
import json
from pprint import pprint
import tinydb as tdb
import requests

# Cherche les doublons dans OpenStreetMap

c.-à-d. les node/way référençant le même code Mérimée

In [129]:
# Open the TinyDB JSON file that holds the tables used in this notebook.
db = tdb.TinyDB('data/tinyDB.json')

In [130]:
# Handles on the three TinyDB tables, fetched in the order osm, merimee, wikidata.
osm, merimee, wikidata = (
    db.table(tableName) for tableName in ('osm', 'merimee', 'wikidata')
)

In [131]:
# Peek at one record of the 'wikidata' table to see which fields it carries.
wikidata.all()[4]

{'article': 'https://fr.wikipedia.org/wiki/Th%C3%A9%C3%A2tre_de_la_Madeleine',
 'image': 'http://commons.wikimedia.org/wiki/Special:FilePath/Th%C3%A9%C3%A2tre%20de%20la%20Madeleine.JPG',
 'item': 'http://www.wikidata.org/entity/Q386604',
 'itemDescription': 'théâtre à Paris',
 'merimee': 'PA00132981'}

In [132]:
# Look up the wikidata record(s) whose 'merimee' field equals a given code.
wikidata.search(tdb.where('merimee') == 'PA00106817')

[{'article': 'https://fr.wikipedia.org/wiki/Cath%C3%A9drale_Saint-%C3%89tienne_de_Metz',
  'image': 'http://commons.wikimedia.org/wiki/Special:FilePath/Cathedrale-saint-etienne-metz-de-place-prefecture.jpg',
  'item': 'http://www.wikidata.org/entity/Q671066',
  'itemDescription': 'cathédrale située à Metz en Moselle, en France',
  'merimee': 'PA00106817'}]

In [133]:
# Load every record of the 'osm' table into a list and print how many there are.
dataOSM = osm.all()
print( len( dataOSM ) )

32246


In [134]:
# Inspect one OSM record; its 'ref:mhs' field is the code used for grouping below.
dataOSM[1]

{'idOSM': 26691547,
 'name': 'Bailleul',
 'name:fr': 'Bailleul',
 'name:nl': 'Belle',
 'name:vls': 'Belle',
 'place': 'town',
 'population': '14480',
 'ref:FR:SIREN': '215900432',
 'ref:INSEE': '59043',
 'ref:mhs': 'PA00107358',
 'source:population': 'INSEE 2013',
 'typeOSM': 'node',
 'wikidata': 'Q2481155'}

In [135]:
# Group the OSM entries by Mérimée code: each code maps to the list of
# positions in dataOSM of the entries whose 'ref:mhs' equals that code.
dicoMerimeeIndice = {}
for i, d in enumerate(dataOSM):
    merimeeCode = d['ref:mhs']  # Mérimée code of this entry
    dicoMerimeeIndice.setdefault(merimeeCode, []).append(i)

# A code is a duplicate when more than one OSM entry references it.
listOfDuplicates = [
    (code, positions)
    for code, positions in dicoMerimeeIndice.items()
    if len(positions) > 1
]

print( len(listOfDuplicates) )
listOfDuplicates[:2]

488


[('PA00109812', [7952, 13346, 13347]), ('PA00081815', [13962, 13972])]

In [136]:
def getWikiCode(itemUrl):
    """Return a Markdown link whose label is the last '/'-separated part of itemUrl.

    The link target is itemUrl itself, unchanged.
    """
    lastSegment = itemUrl.rsplit('/', 1)[-1]
    return '[%s](%s)' % (lastSegment, itemUrl)

def formatOSMInfo( info ):
    """Render one OSM record as a single Markdown line.

    The line holds a link to the element on openstreetmap.org (built from
    'typeOSM' and 'idOSM'), an optional '.wd.' link when the record has a
    'wikidata' key, and the record's 'name' ('...' when it has none).
    """
    osmtype, idOSM = info['typeOSM'], info['idOSM']

    osmUrl = 'https://www.openstreetmap.org/%s/%s' % (osmtype, idOSM)
    osmLink = '[%s%s](%s)' % (osmtype, idOSM, osmUrl)

    # The Wikidata link stays empty unless the record references an item.
    wikidataLink = ''
    if 'wikidata' in info:
        wikidataUrl = 'https://www.wikidata.org/wiki/%s' % info['wikidata']
        wikidataLink = '- [.wd.](%s)' % wikidataUrl

    return '%s %s - %s   ' % ( osmLink, wikidataLink, info.get('name', '...') )


## To GitHub issues

In [137]:
# Load the GitHub personal access token from a local text file.
# See https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/
# The `with` block closes the file handle as soon as the token has been read.
with open('githubtoken.txt', 'r') as tokenFile:
    token = tokenFile.read().strip()

In [138]:
from IPython.display import display, Markdown

In [139]:
# Keys left out of the per-element tag list. 'typeOSM' and 'idOSM' already
# appear in each element heading and 'ref:mhs' is the code shown in the page
# heading. 'OSMurl' is kept so that records carrying such a key (e.g. ones
# annotated in place by an earlier run in the same kernel) are not listed twice.
blacklist = ['ref:mhs', 'OSMurl', 'idOSM', 'typeOSM']

def createMarkdown( merimee, indices ):
    """Build the title and Markdown body describing one duplicated Mérimée code.

    Parameters
    ----------
    merimee : str
        The Mérimée code referenced by several OSM elements.
    indices : iterable of int
        Positions in ``dataOSM`` of the OSM elements carrying that code.

    Returns
    -------
    (title, body) : tuple of str
        ``title`` is the code, followed by the Wikidata description when one
        is available. ``body`` is a Markdown page: a heading with the code,
        the Wikidata information (article link, description, image) when
        available, then one section per OSM element listing its tags.

    Notes
    -----
    Reads the module-level ``wikidata`` table and ``dataOSM`` list. The
    records of ``dataOSM`` are only read, never modified.
    """
    title = merimee
    body = "# %s  \n" % merimee

    matches = wikidata.search(tdb.where('merimee') == merimee)
    if len(matches) > 1:
        body += 'doublon dans wikidata \n'

    # A code may have no Wikidata record at all: fall back to an empty record
    # so the Wikidata part is simply omitted instead of raising IndexError.
    wd = matches[0] if matches else {}

    if 'article' in wd:
        body += '[page wikipedia](%s) \n' % wd['article']

    if 'itemDescription' in wd:
        body += wd['itemDescription']
        title += ' %s' % wd['itemDescription']

    body += '\n'

    if 'image' in wd:
        # An HTML <img> tag is used so that the display width can be set.
        body += '<img src="%s" width="300"  /> \n' % wd['image']

    for i in indices:
        info = dataOSM[i]
        osmType, osmId = info['typeOSM'], info['idOSM']
        # The URL is kept in a local variable rather than stored in `info`,
        # so the shared OSM record is left untouched.
        osmUrl = 'https://www.openstreetmap.org/{}/{}'.format( osmType, osmId )
        body += '### [{} {}:]({}) \n'.format( osmType, osmId, osmUrl )

        # One bullet per remaining tag, in alphabetical key order.
        for k in sorted( info ):
            if k in blacklist:
                continue

            v = info[k]
            if k == 'wikidata':
                wikidataUrl = 'https://www.wikidata.org/wiki/%s' % v
                body += '- [**wd.{}**]({})  \n'.format( k, wikidataUrl )
            else:
                body += '- **{}**: {}  \n'.format( k, v )

        body += '\n\n'

    return title, body

In [140]:
def formatOSMInfoIssue( merimee, indices ):
    """Return the JSON payload of a GitHub issue for one duplicated Mérimée code.

    The title and body come from createMarkdown( merimee, indices ); the issue
    carries the single label 'doublons_dans_OSM'.
    """
    issueTitle, issueBody = createMarkdown( merimee, indices )
    return json.dumps( {
        "title": issueTitle,
        "body": issueBody,
        "labels": ['doublons_dans_OSM'],
    } )

In [141]:
# Preview the report generated for one duplicate group: render the Markdown
# body in the notebook and print the title.
# NOTE(review): this rebinds the global name `merimee` (the TinyDB table
# handle created earlier) to a Mérimée code string.
merimee, indices = listOfDuplicates[40]

title, body = createMarkdown( merimee, indices )
display(Markdown( body ))
print( title )

# PA00086599  
[page wikipedia](https://fr.wikipedia.org/wiki/Campo-Formio_(m%C3%A9tro_de_Paris)) 
station du métro de Paris
<img src="http://commons.wikimedia.org/wiki/Special:FilePath/Campo%20Formio%20Sortie.JPG" width="300"  /> 
### [node 2527916266:](https://www.openstreetmap.org/node/2527916266) 
- **heritage**: 3  
- **heritage:operator**: mhs  
- **mhs:inscription_date**: 1978-05-29  
- **name**: Campo-Formio  
- **railway**: subway_entrance  
- **source:heritage**: data.gouv.fr:Ministère de la Culture - 04/2015  
- **wheelchair**: no  
- [**wd.wikidata**](https://www.wikidata.org/wiki/Q683208)  
- **wikipedia**: fr:Campo-Formio (métro de Paris)  


### [node 5313588578:](https://www.openstreetmap.org/node/5313588578) 
- **artist_name**: Hector Guimard  
- **heritage**: 3  
- **heritage:operator**: mhs  
- **mhs:inscription_date**: 2016-02-12  
- **source:heritage**: Base Mérimée 2016  
- **tourism**: artwork  
- **wikipedia**: fr:Édicule Guimard  




PA00086599 station du métro de Paris


In [142]:
# Open one GitHub issue per duplicated Mérimée code, for the first 35 groups.
# NOTE: the loop posts unconditionally, so re-running this cell creates the
# same issues again. It also rebinds the global name `merimee` to a code string.
for merimee, indices in listOfDuplicates[:35]:
    newIssue = formatOSMInfoIssue( merimee, indices )

    # create an issue; the timeout keeps a stalled connection from blocking
    # the notebook indefinitely
    r = requests.post('https://api.github.com/repos/xdze2/OSM_et_Wikidata/issues',
                     headers={'Authorization': 'token %s'%token},
                     data=newIssue,
                     timeout=30)
    print( r.ok )
    if not r.ok:
        # Show which code failed and why, instead of a bare False.
        print( merimee, r.status_code, r.reason )

True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
True
