In [3]:
import json
from pprint import pprint
import tinydb as tdb
import requests

# Cherche les doublons dans OpenStreetMap

c.-à-d. les nodes/ways référençant le même code Mérimée

In [4]:
# Open the TinyDB JSON store; the path is relative to the current working directory.
db = tdb.TinyDB('data/tinyDB.json')

In [5]:
# Handles on the three tables held in the database.
osm = db.table('osm')
merimee = db.table('merimee')
wikidata = db.table('wikidata')

In [6]:
# Peek at a single Wikidata record to see which fields are stored.
wikidata.all()[4]

{'article': 'https://fr.wikipedia.org/wiki/Th%C3%A9%C3%A2tre_de_la_Madeleine',
 'image': 'http://commons.wikimedia.org/wiki/Special:FilePath/Th%C3%A9%C3%A2tre%20de%20la%20Madeleine.JPG',
 'item': 'http://www.wikidata.org/entity/Q386604',
 'itemDescription': 'théâtre à Paris',
 'merimee': 'PA00132981'}

In [7]:
# Look up the Wikidata record(s) carrying a given Mérimée code.
wikidata.search(tdb.where('merimee') == 'PA00106817')

[{'article': 'https://fr.wikipedia.org/wiki/Cath%C3%A9drale_Saint-%C3%89tienne_de_Metz',
  'image': 'http://commons.wikimedia.org/wiki/Special:FilePath/Cathedrale-saint-etienne-metz-de-place-prefecture.jpg',
  'item': 'http://www.wikidata.org/entity/Q671066',
  'itemDescription': 'cathédrale située à Metz en Moselle, en France',
  'merimee': 'PA00106817'}]

In [8]:
# Load every OSM record in memory (later cells index into dataOSM by position)
# and report how many records there are.
dataOSM = osm.all()
print( len( dataOSM ) )

32246


In [9]:
# Sample OSM record: the Mérimée code is stored under the 'ref:mhs' tag.
dataOSM[1]

{'idOSM': 26691547,
 'name': 'Bailleul',
 'name:fr': 'Bailleul',
 'name:nl': 'Belle',
 'name:vls': 'Belle',
 'place': 'town',
 'population': '14480',
 'ref:FR:SIREN': '215900432',
 'ref:INSEE': '59043',
 'ref:mhs': 'PA00107358',
 'source:population': 'INSEE 2013',
 'typeOSM': 'node',
 'wikidata': 'Q2481155'}

In [12]:
# Group the OSM entries by Mérimée code: code -> list of positions in dataOSM.
dicoMerimeeIndice = {}
for position, record in enumerate(dataOSM):
    # 'ref:mhs' holds the Mérimée code of the OSM element.
    dicoMerimeeIndice.setdefault(record['ref:mhs'], []).append(position)

# A code referenced by more than one OSM element is a duplicate candidate.
listOfDuplicates = [
    (code, positions)
    for code, positions in dicoMerimeeIndice.items()
    if len(positions) > 1
]

print(len(listOfDuplicates))
listOfDuplicates[:2]

488


[('PA00103673', [24381, 24382]),
 ('PA00107625', [1381, 1385, 1390, 1393, 1396])]

In [13]:
def getWikiCode(itemUrl):
    """Return a Markdown link to ``itemUrl`` labelled with its last path segment.

    For ``'http://www.wikidata.org/entity/Q42'`` the result is
    ``'[Q42](http://www.wikidata.org/entity/Q42)'``.
    """
    label = itemUrl.rsplit('/', 1)[-1]
    return '[%s](%s)' % (label, itemUrl)

def formatOSMInfo( info ):
    """Format one OSM record as a single Markdown line.

    Parameters
    ----------
    info : dict
        OSM record. Must contain ``'typeOSM'`` and ``'idOSM'``; ``'name'`` and
        ``'wikidata'`` are optional.

    Returns
    -------
    str
        ``"<osm link> <wikidata link> - <name>   "``. The wikidata link is
        empty when the record has no ``'wikidata'`` key, and the name is
        ``'...'`` when the record has no ``'name'`` key. The three trailing
        spaces are part of the returned string.
    """
    kind = info['typeOSM']
    label = info.get('name', '...')
    ident = info['idOSM']

    url = 'https://www.openstreetmap.org/%s/%s' % (kind, ident)
    osm_link = '[%s%s](%s)' % (kind, ident, url)

    # Default to no wikidata link; fill it in only when the tag is present.
    wd_link = ''
    if 'wikidata' in info:
        wd_url = 'https://www.wikidata.org/wiki/%s' % info['wikidata']
        wd_link = '- [.wd.](%s)' % wd_url

    return '%s %s - %s   ' % (osm_link, wd_link, label)


## To Github issues

Créer automatiquement une 'issue' avec l'API de GitHub

In [16]:
# Load the GitHub personal access token from a local file.
# See https://help.github.com/articles/creating-a-personal-access-token-for-the-command-line/
# The context manager closes the file handle as soon as the token has been read.
with open('githubtoken.txt', 'r') as token_file:
    token = token_file.read().strip()

In [17]:
from IPython.display import display, Markdown

In [18]:
import osmandwiki as ow

In [100]:
# OSM record keys that are already shown in a heading of the generated page
# and are therefore skipped in the per-element tag list.
blacklist = ['ref:mhs', 'OSMurl', 'idOSM', 'typeOSM']

def createMarkdown( merimee, indices ):
    ''' Return the Markdown formatted issue for one Mérimée code.

    Reads the module-level ``wikidata`` table, the ``dataOSM`` list, the
    ``ow`` helper module and ``blacklist``. ``dataOSM`` is not modified.

    Parameters
    ----------
    merimee : str
        Mérimée code shared by the duplicated OSM elements.
    indices : iterable of int
        Positions in ``dataOSM`` of the OSM elements carrying that code.

    Returns
    -------
    (title, body) : tuple of str
        ``title`` is the Mérimée code, followed by the Wikidata description
        when one is available. ``body`` is the Markdown page: a header with
        the Wikidata information, then one section per OSM element listing
        its tags.
    '''
    wd_results = wikidata.search(tdb.where('merimee') == merimee)
    has_wikidata = len(wd_results) > 0

    # Only the first matching record supplies the image, article and
    # description. Fall back to an empty record when nothing matched, so the
    # "no wikidata" branches below are used instead of raising IndexError.
    wd = wd_results[0] if has_wikidata else {}

    title = merimee
    parts = ["# %s  \n" % merimee]

    if 'image' in wd:
        # NOTE(review): presumably maps the Commons file URL to a thumbnail
        # URL -- confirm in the osmandwiki module.
        imagename = ow.imgUrlToFilename( wd['image'] )
        imageUrl = ow.getThumburl(imagename)
        parts.append('<img src="%s"   />  \n\n' % imageUrl)

    if has_wikidata:
        # One link per matching Wikidata record, labelled with the entity id
        # (last path segment of the item URL).
        wikidatacodes = [ r['item'] for r in wd_results ]
        wikidataLinks = [ '[%s](%s)' % (url.split('/')[-1], url) for url in wikidatacodes ]
        parts.append('**Wikidata:** %s   \n' % ', '.join(wikidataLinks))
    else:
        parts.append('`pas de fiche wikidata`  \n')

    if 'article' in wd:
        parts.append('[page wikipedia](%s)  \n' % wd['article'])
    else:
        parts.append('`pas de page wikipedia`   \n')

    if 'itemDescription' in wd:
        parts.append('**Description:** ' + wd['itemDescription'])
        title += ' %s' % wd['itemDescription']

    parts.append('\n\n')

    for i in indices:
        # Work on a copy: the helper keys added below must not leak into the
        # shared dataOSM records, which other cells read.
        info = dict(dataOSM[i])
        info['OSMurl'] = 'https://www.openstreetmap.org/{typeOSM}/{idOSM}'.format( **info )
        info.setdefault('name', '-')

        parts.append('### [{typeOSM} {idOSM}:]({OSMurl}) {name}\n'.format( **info ))

        # Tags are listed in alphabetical key order.
        for k, v in sorted( info.items(), key=lambda x: x[0] ):
            if k in blacklist:
                continue

            if k == 'wikidata':
                wikidataUrl = 'https://www.wikidata.org/wiki/%s' % v
                parts.append('- [**wd.{}**]({})  \n'.format(v, wikidataUrl))
            else:
                parts.append('- **{}**: {}  \n'.format(k, v))

        parts.append('\n\n')

    return title, ''.join(parts)

In [101]:
def formatOSMInfoIssue( merimee, indices ):
    """Build the JSON payload used to create the issue for one Mérimée code.

    The title and body come from ``createMarkdown(merimee, indices)``; the
    issue is always tagged with the ``doublons_dans_OSM`` label.

    Returns
    -------
    str
        JSON-encoded object with the keys ``title``, ``body`` and ``labels``.
    """
    issue_title, issue_body = createMarkdown( merimee, indices )

    return json.dumps({
        "title": issue_title,
        "body": issue_body,
        "labels": ['doublons_dans_OSM'],
    })

In [102]:
# Smoke test: render one duplicate group inline before posting anything.
# The code is bound to `merimeeCode` rather than `merimee` so that it does not
# overwrite the `merimee` table handle created when the tables were opened.
merimeeCode, indices = listOfDuplicates[40]

title, body = createMarkdown( merimeeCode, indices )
display(Markdown( body ))
print( title )

# PA00107649  
<img src="https://upload.wikimedia.org/wikipedia/commons/thumb/b/bb/Lille_120-122_rue_paris.jpg/300px-Lille_120-122_rue_paris.jpg"   />  

**Wikidata:** [Q22960840](http://www.wikidata.org/entity/Q22960840)   
`pas de page wikipedia`   
**Description:** immeuble à Lille (Nord)

### [node 2063752819:](https://www.openstreetmap.org/node/2063752819) Immeubles
- **description**: Façades et toitures sur rue : inscription par arrêté du 14 mars 1944  
- **heritage**: 3  
- **mhs:inscription_date**: 1944-03-14  
- **name**: Immeubles  
- **tourism**: attraction  
- [**wd.Q22960840**](https://www.wikidata.org/wiki/Q22960840)  


### [node 2063752821:](https://www.openstreetmap.org/node/2063752821) Immeubles
- **description**: Façades et toitures sur rue : inscription par arrêté du 14 mars 1944  
- **heritage**: 3  
- **mhs:inscription_date**: 1944-03-14  
- **name**: Immeubles  




PA00107649 immeuble à Lille (Nord)


In [99]:
# --- Run ---
# Creates one GitHub issue per duplicate group in the selected slice.
# Every execution posts new issues, so re-running this cell creates duplicates.
# The loop variable is `merimeeCode` (not `merimee`) so the `merimee` table
# handle is not overwritten.
for merimeeCode, indices in listOfDuplicates[100:102]:
    newIssue = formatOSMInfoIssue( merimeeCode, indices )

    # Create the issue. The timeout keeps the cell from hanging forever if
    # the API does not answer.
    r = requests.post('https://api.github.com/repos/xdze2/OSM_et_Wikidata/issues',
                     headers={'Authorization': 'token %s'%token},
                     data=newIssue,
                     timeout=30)
    print( r.ok )
    if not r.ok:
        # Show which code failed and why, instead of a bare False.
        print( merimeeCode, r.status_code, r.text )

True
True
