# Аналіз дадзеных тэгу name у ОСМ Беларусі

## Зьмест

- Праблематыка
- Спампуем дамп ОСМ
- Усталёўваем залежанасьцьі
- Пошук сувязей дзеля падтрыманьня спасылачнай цэласнасьці

## Праблематыка

У беларускім ОСМ шырока выкарыстоўваюцца беларуская і расейская мова, для іх ёсьць адпаведнікі `name:be` і `name:ru`, таксама мовы выкарыстоўваюцца ў агульных тэгах як `name`, `addr:*` і іншых. Праблематка выкарыстоўваньня аднае, ці іншае, ці абедзьвух моваў апісанае тут https://wiki.openstreetmap.org/wiki/BE:Belarus_language_issues. Незалежна ад варыянту выкарыстоўваньня мовы павінны вытрымлівацца наступныя правілы: пошук на любое мове мусіць працаваць, павінна быць магчымасьць паказываць подпісы на любой мове (ці ў арыгінале, але гэтае правіла зараз не выконваецца), павінна захоўвацца спасылкавая цэласнасьць (што можа ўплываць на папярэднія два пункты).

Гэты аналіз ставіць мэтаю знайсьці адпаведныя катэгорыі і тэгі якія ўтрымліваюць кірылічныя значэньні тэгу name і падлічыць запаўняльнасьць тэгаў name:be, name:ru.

## Спампуем дамп ОСМ

In [37]:
!wget --backups=1 -N https://download.geofabrik.de/europe/belarus-latest.osm.pbf

--2022-04-02 07:50:03--  https://download.geofabrik.de/europe/belarus-latest.osm.pbf
Resolving download.geofabrik.de (download.geofabrik.de)... 116.202.112.212, 95.216.28.113
Connecting to download.geofabrik.de (download.geofabrik.de)|116.202.112.212|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 256893316 (245M) [application/octet-stream]
Saving to: ‘belarus-latest.osm.pbf’


2022-04-02 07:50:50 (5.21 MB/s) - ‘belarus-latest.osm.pbf’ saved [256893316/256893316]



## Загрузім дамп у postgis
- патрэбна толькі калі хочам атрымаць больш дакладныя дадзеныя, але можа не ўтрымліваць некаторыя дачыненьні

In [None]:
!PGPASSWORD=$POSTGRES_PASSWORD psql -h $POSTGRES_HOST -p $POSTGRES_POST -U $POSTGRES_USER -d $POSTGRES_DB  -c "CREATE EXTENSION IF NOT EXISTS hstore"
!PGPASSWORD=$POSTGRES_PASSWORD osm2pgsql -H $POSTGRES_HOST -P $POSTGRES_POST -U $POSTGRES_USER -d $POSTGRES_DB -v -m -j -G -x --latlong --hstore-add-index -C $OSM2PGSQL_CACHE -S /usr/share/osm2pgsql/default.style belarus-latest.osm.pbf

NOTICE:  extension "hstore" already exists, skipping
CREATE EXTENSION
2022-04-02 07:51:03  osm2pgsql version 1.6.0
2022-04-02 07:51:03  [0] Database version: 14.2
2022-04-02 07:51:03  [0] PostGIS version: 3.2
2022-04-02 07:51:03  [0] Reading file: belarus-latest.osm.pbf
2022-04-02 07:51:03  [0] Started pool with 4 threads.
2022-04-02 07:51:03  [0] Using projection SRS 4326 (Latlong)
2022-04-02 07:51:03  [0] Using built-in tag transformations
2022-04-02 07:51:03  [0] Middle 'ram' options:
2022-04-02 07:51:03  [0]   locations: true
2022-04-02 07:51:03  [0]   way_nodes: true
2022-04-02 07:51:03  [0]   nodes: false
2022-04-02 07:51:03  [0]   untagged_nodes: true
2022-04-02 07:51:03  [0]   ways: false
2022-04-02 07:51:03  [0]   relations: false
2022-04-02 07:51:03  [0] Setting up table 'planet_osm_point'
2022-04-02 07:51:03  [0] Setting up table 'planet_osm_line'
2022-04-02 07:51:03  [0] Setting up table 'planet_osm_polygon'
2022-04-02 07:51:04  [0] Setting up table 'planet_osm_roads'
2022-

## Усталюем залежнасьці

In [2]:
!pip install pandas matplotlib psycopg2-binary https://github.com/lechup/imposm-parser/archive/python3.zip

Collecting https://github.com/lechup/imposm-parser/archive/python3.zip
  Using cached https://github.com/lechup/imposm-parser/archive/python3.zip
  Preparing metadata (setup.py) ... [?25ldone
[0m[?25h

## Вызначым катэгорыі

In [1]:
import os
from collections import defaultdict, Counter

from imposm.parser import OSMParser
import psycopg2

import pandas as pd
pd.set_option('display.max_rows', None)


cirylic_chars = frozenset('абвгдеёжзіийклмнопрстуўфхцчшщьыъэюяАБВГДЕЁЖЗІИІЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯ')


In [2]:
categories_rules = {
    'admin': [
        ['boundary', 'administrative'],
#         ['admin_level', '2'],
#         ['admin_level', '4'],
#         ['admin_level', '6'],
#         ['admin_level', '8'],
#         ['admin_level', '9'],
#         ['admin_level', '10'],
        ['admin_level', None],
    ],
    'place': [
        ['place', 'city'],
        ['place', 'town'],
        ['place', 'village'],
        ['place', 'hamlet'],
        ['place', 'isolated_dwelling'],
    ],
    'locality': [
        ['place', 'allotments'],
        ['place', 'locality'],
        ['abandoned:place', None],
    ],
    'suburb': [
        ['landuse', 'commercial'],
        ['landuse', 'construction'],
        ['landuse', 'education'],
        ['landuse', 'industrial'],
        ['landuse', 'residential'],
        ['landuse', 'retail'],
        ['landuse', 'allotments'],
        ['place', None],
        ['residential', None],
        ['industrial', None],
    ],
    'highway': [
        ['highway', 'motorway'],
        ['highway', 'trunk'],
        ['highway', 'primary'],
        ['highway', 'secondary'],
        ['highway', 'tertiary'],
        ['highway', 'unclassified'],
        ['highway', 'residential'],
        ['highway', 'service'],
        ['highway', 'track'],
        ['highway', None],
        ['type', 'associatedStreet'],
    ],
    'public_transport': [
        ['highway', 'bus_stop'],
        ['public_transport', None],
        ['route', None],
        ['type', 'route'],
        ['railway', None],
        ['route_master', None],
    ],
    'infrastructure': [
        ['tunnel', None],
        ['barrier', None],
        ['power', None],
        ['bridge', None],
        ['substation', None],
        ['emergency', None],
        ['ele', None],
        ['man_made', None],
        ['embankment', None],
    ],
    'amenity': [
        ['amenity', 'place_of_worship'],
        ['amenity', 'school'],
        ['amenity', 'kindergarten'],
#         ['amenity', 'cafe'],
#         ['amenity', 'atm'],
#         ['amenity', 'pharmacy'],
#         ['amenity', 'bank'],
#         ['amenity', 'post_office'],
#         ['amenity', 'fast_food'],
#         ['amenity', 'fuel'],
#         ['amenity', 'community_centre'],
#         ['amenity', 'hospital'],
#         ['amenity', 'police'],
#         ['amenity', 'restaurant'],
#         ['amenity', 'clinic'],
#         ['amenity', 'doctors'],
#         ['amenity', 'library'],
#         ['amenity', 'bar'],
        ['amenity', None],
        ['shop', None],
        ['leisure', None],
        ['sport', None],
        ['craft', 'shoemaker'],
        ['clothes', None],
    ],
    'government': [
        ['healthcare', None],
        ['office', 'government'],
        ['government', None],
        ['military', None],
    ],
    'office': [
        ['office', None],
    ],
    'building': [
        ['building', 'industrial'],
        ['building', 'service'],
        ['building', 'retail'],
        ['building', 'school'],
        ['building', 'kindergarten'],
        ['building', 'commercial'],
        ['building', 'church'],
        ['building', 'warehouse'],
        ['building', 'public'],
        ['building', 'dormitory'],
        ['building', 'hospital'],
        ['building', 'warehouse'],
        ['building', None],
    ],
    'tourism': [
        ['tourism', None],
        ['historic', None],
        ['memorial', None],
        ['ruins', None],
        ['information', None],
        ['attraction', None],
        ['resort', None],
        ['artwork_type', None],
    ],
    'water': [
        ['waterway', 'drain'],
        ['waterway', 'ditch'],
        ['waterway', 'stream'],
        ['waterway', 'river'],
        ['waterway', 'canal'],
        ['waterway', None],
        ['type', 'waterway'],
        ['water', None],
        ['natural', 'water'],
        ['natural', 'spring'],
    ],
    'natural': [
        ['natural', None],
        ['place', 'island'],
        ['place', 'islet'],
        ['landuse', None],
    ],
}

usage = defaultdict(set)
categories_rules2 = {}
for category, group in categories_rules.items():
    if category not in categories_rules2:
        categories_rules2[category] = []
    for tag, value in group:
        if value is not None:
            categories_rules2[category].append([tag, True, {value}])
            usage[tag].add(value)                          
for category, group in categories_rules.items():
    if category not in categories_rules2:
        categories_rules2[category] = []
    for tag, value in group:
        if value is None:
            categories_rules2[category].append([tag, False, usage[tag]])

## Падлічам статыстыку для дампу
- дамп падліча ўсе дадзеныя, але можа быць трошкі недакладным таму што не ўлічвае грубую абрэзку Беларусі

In [3]:
key_counter = defaultdict(lambda: defaultdict(list))

categories_tags = {}
categories_rules_tags_set = {}
for category, group in categories_rules2.items():
    for tag, eq, values in group:
        if tag not in categories_tags:
            categories_tags[tag] = {category}
        else:
            categories_tags[tag].add(category)

            
def process(params):
    for _, tags, _ in params:
        if 'name' not in tags:
            continue
        if not (frozenset(tags['name']) & cirylic_chars):
            continue
        categories = {category for tag in categories_tags.keys() & tags.keys() for category in categories_tags[tag]}
        for tag in ['name', 'name:be', 'name:ru']:
            if tag not in tags:
                continue
            value = tags[tag]
            cyr = frozenset(value) & cirylic_chars
            if not cyr:
                continue
            match = False
            for category in categories:
                group = categories_rules2[category]
                category_match = False
                for i, (k, eq, vv) in enumerate(group):
                    if k not in tags:
                        continue
                    if eq:
                        if tags[k] in vv:
                            if not category_match:
                                key_counter[(category,)][tag].append(value)
                                match = category_match = True
                            key_counter[(category, i)][tag].append(value)   
                    else:
                        if tags[k] not in vv:
                            if not category_match:
                                key_counter[(category,)][tag].append(value)
                                match = category_match = True
                            key_counter[(category, i)][tag].append(value)   
            if not match:
                key_counter[('other',)][tag].append(value)
                

OSMParser(
    nodes_callback=process,
    ways_callback=process,
    relations_callback=process,
).parse('belarus-latest.osm.pbf')


# cirylic tag values count
data = []
for c in list(categories_rules) + ['other']:
    name_cnt = len(key_counter[(c,)]['name'])
    name_uniq = len(set(key_counter[(c,)]['name']))
    name_be_cnt = len(key_counter[(c,)]['name:be'])
    name_be_uniq = len(set(key_counter[(c,)]['name:be']))
    name_ru_cnt = len(key_counter[(c,)]['name:ru'])
    name_ru_uniq = len(set(key_counter[(c,)]['name:ru']))
    data.append([
        '#', c, 
        name_cnt, name_be_cnt, name_ru_cnt, name_be_cnt/name_cnt, name_ru_cnt/name_cnt,
        name_uniq, name_be_uniq, name_ru_uniq, name_be_uniq/name_uniq, name_ru_uniq/name_uniq,
    ])
    if c == 'other':
        continue
    for i, (k, eq, vv) in enumerate(categories_rules2[c]):
        if eq:
            tag = f'{k} = {list(vv)[0]}'
        else:
            tag = f'{k} = *'
        name_cnt = len(key_counter[(c, i)]['name'])
        name_uniq = len(set(key_counter[(c, i)]['name']))
        name_be_cnt = len(key_counter[(c, i)]['name:be'])
        name_be_uniq = len(set(key_counter[(c, i)]['name:be']))
        name_ru_cnt = len(key_counter[(c, i)]['name:ru'])
        name_ru_uniq = len(set(key_counter[(c, i)]['name:ru']))
        data.append([
            '', tag, 
            name_cnt, name_be_cnt, name_ru_cnt, name_be_cnt/name_cnt, name_ru_cnt/name_cnt,
            name_uniq, name_be_uniq, name_ru_uniq, name_be_uniq/name_uniq, name_ru_uniq/name_uniq,
        ])

In [4]:
df = pd.DataFrame(data, columns=[
    'lvl', 'category', 
    'all name', 'all name:be', 'all name:ru', 'all name:be%', 'all name:ru%',
    'uniq name', 'uniq name:be', 'uniq name:ru', 'uniq name:be%', 'uniq name:ru%',
])
df.to_csv('dump.csv')
df.style.set_properties(**{'text-align': 'left'}).background_gradient('YlOrRd', subset=[
    'all name:be%', 'all name:ru%', 'uniq name:be%', 'uniq name:ru%',
]).apply(lambda row: [("font-weight: bold" if row.loc['lvl'] == '#' else '') for _ in row], axis=1)

Unnamed: 0,lvl,category,all name,all name:be,all name:ru,all name:be%,all name:ru%,uniq name,uniq name:be,uniq name:ru,uniq name:be%,uniq name:ru%
0,#,admin,25056,24696,24663,0.985632,0.984315,16989,16891,16818,0.994232,0.989935
1,,boundary = administrative,25054,24694,24661,0.985631,0.984314,16989,16891,16818,0.994232,0.989935
2,,admin_level = *,25048,24691,24658,0.985747,0.98443,16983,16887,16814,0.994347,0.990049
3,#,place,45406,45039,45165,0.991917,0.994692,15621,15628,15514,1.000448,0.99315
4,,place = city,31,31,31,1.0,1.0,16,16,16,1.0,1.0
5,,place = town,274,273,274,0.99635,1.0,138,138,138,1.0,1.0
6,,place = village,5031,4987,5015,0.991254,0.99682,2227,2208,2215,0.991468,0.994612
7,,place = hamlet,38799,38562,38607,0.993892,0.995051,13577,13621,13503,1.003241,0.99455
8,,place = isolated_dwelling,1271,1186,1238,0.933124,0.974036,644,595,621,0.923913,0.964286
9,#,locality,13816,10631,13117,0.76947,0.949406,9652,6877,9154,0.712495,0.948404


## Падлічам статыстыку для выгрузкі ў postgis
- вынік будзе больш дакладным, але можа ня ўлічываць дачыненьні што не пераносяцца ў postgis

In [5]:
# query_template = """
# SELECT '{category}' AS category, {num} AS num, g.tags->'name' AS name, g.tags->'name:be' AS name_be, g.tags->'name:ru' AS name_ru
# FROM {table} g
# INNER JOIN planet_osm_polygon p
# ON ST_Intersects(g.way, p.way)
# WHERE p.osm_id = -59065
# AND g.tags->'name' ~ '({cyr})'
# AND {condition}
# """
query_template = """
SELECT '{category}' AS category, {num} AS num, g.tags->'name' AS name, g.tags->'name:be' AS name_be, g.tags->'name:ru' AS name_ru
FROM {table} g
WHERE {condition}
-- ({cyr})
"""
# tables = ['planet_osm_line', 'planet_osm_point', 'planet_osm_polygon']
tables = ['planet_osm_data']
cyr = '|'.join(cirylic_chars)

queries = []
exclude = []
for category, group in categories_rules2.items():
    conditions = []
    for i, (k, eq, vv) in enumerate(group):
        if vv:
            eq_str = 'IN' if eq else 'NOT IN'
            vv_str = ','.join(f"'{v}'" for v in vv)
            condition = f"g.tags->'{k}' {eq_str} ({vv_str})"
        elif not eq:
            condition = f"g.tags->'{k}' IS NOT NULL"
        else:
            raise ValueError()
        conditions.append(condition)
        exclude.append(condition)
        for table in tables:
            query = query_template.format(category=category, num=i, table=table, cyr=cyr, condition=condition)
            queries.append(query)
    condition = ' OR '.join(f'({c})' for c in conditions)
    for table in tables:
        query = query_template.format(category=category, num=-1, table=table, cyr=cyr, condition=condition)
        queries.append(query)
condition = ' OR '.join(f'({c})' for c in exclude)
for table in tables:
    query = query_template.format(category='other', num=-1, table=table, cyr=cyr, condition=f'NOT ({condition})')
    queries.append(query)
query = ' UNION ALL '.join(queries)

print(len(queries))


110


In [6]:
key_counter = defaultdict(lambda: defaultdict(list))

view_creation = """
CREATE MATERIALIZED VIEW IF NOT EXISTS planet_osm_data AS

SELECT
    g.osm_id AS osm_id,
    'node' AS osm_type,
    g.tags AS tags
FROM planet_osm_point g
INNER JOIN planet_osm_polygon p
ON ST_Contains(p.way, g.way)
WHERE p.osm_id = -59065
AND g.tags->'name' ~ '({cyr})'

UNION ALL

SELECT
    g.osm_id AS osm_id,
    'way' AS osm_type,
    g.tags AS tags
FROM planet_osm_line g
INNER JOIN planet_osm_polygon p
ON ST_Contains(p.way, g.way)
WHERE p.osm_id = -59065
AND g.tags->'name' ~ '({cyr})'

UNION ALL

SELECT
    ABS(g.osm_id) AS osm_id,
    CASE WHEN g.osm_id < 0 THEN 'relation' ELSE 'way' END AS osm_type,
    g.tags AS tags
FROM planet_osm_polygon g
INNER JOIN planet_osm_polygon p
ON ST_Contains(p.way, g.way)
WHERE p.osm_id = -59065
AND g.tags->'name' ~ '({cyr})'
""".format(cyr=cyr)
index_creation_id_type = """
CREATE INDEX IF NOT EXISTS "planet_osm_data_osm_id_type_idx" ON planet_osm_data (osm_id, osm_type)
"""
index_creation_tags = """
CREATE INDEX IF NOT EXISTS "planet_osm_data_tags_idx" ON planet_osm_data USING GIN (tags)
"""
index_creation_geom = """
CREATE INDEX IF NOT EXISTS "planet_osm_data_way_idx" ON planet_osm_data USING GIST (way)
"""

conn = psycopg2.connect(
    host=os.environ['POSTGRES_HOST'],
    dbname=os.environ['POSTGRES_DB'],
    user=os.environ['POSTGRES_USER'],
    password=os.environ['POSTGRES_PASSWORD'],
)
cur = conn.cursor()
cur.execute(view_creation)
# cur.execute(index_creation_id_type)
cur.execute(index_creation_tags)
# cur.execute(index_creation_geom)
for i, query in enumerate(queries, 1):
    cur.execute(query)
    records = cur.fetchall()
    for category, num, name, name_be, name_ru in records:
        key = (category,) if num == -1 else (category, num)
        key_counter[key]['name'].append(name)
        if name_be is not None:
            key_counter[key]['name:be'].append(name_be)
        if name_ru is not None:
            key_counter[key]['name:ru'].append(name_ru)
cur.close()
conn.close()

In [7]:
data = []
for c in list(categories_rules) + ['other']:
    name_cnt = len(key_counter[(c,)]['name'])
    name_uniq = len(set(key_counter[(c,)]['name']))
    name_be_cnt = len(key_counter[(c,)]['name:be'])
    name_be_uniq = len(set(key_counter[(c,)]['name:be']))
    name_ru_cnt = len(key_counter[(c,)]['name:ru'])
    name_ru_uniq = len(set(key_counter[(c,)]['name:ru']))
    data.append([
        '#', c, 
        name_cnt, name_be_cnt, name_ru_cnt, name_be_cnt/(name_cnt or 1), name_ru_cnt/(name_cnt or 1),
        name_uniq, name_be_uniq, name_ru_uniq, name_be_uniq/(name_uniq or 1), name_ru_uniq/(name_uniq or 1),
    ])
    if c == 'other':
        continue
    for i, (k, eq, vv) in enumerate(categories_rules2[c]):
        if eq:
            tag = f'{k} = {list(vv)[0]}'
        else:
            tag = f'{k} = *'
        name_cnt = len(key_counter[(c, i)]['name'])
        name_uniq = len(set(key_counter[(c, i)]['name']))
        name_be_cnt = len(key_counter[(c, i)]['name:be'])
        name_be_uniq = len(set(key_counter[(c, i)]['name:be']))
        name_ru_cnt = len(key_counter[(c, i)]['name:ru'])
        name_ru_uniq = len(set(key_counter[(c, i)]['name:ru']))
        data.append([
            '', tag, 
            name_cnt, name_be_cnt, name_ru_cnt, name_be_cnt/(name_cnt or 1), name_ru_cnt/(name_cnt or 1),
            name_uniq, name_be_uniq, name_ru_uniq, name_be_uniq/(name_uniq or 1), name_ru_uniq/(name_uniq or 1),
        ])


In [8]:
df = pd.DataFrame(data, columns=[
    'lvl', 'category', 
    'all name', 'all name:be', 'all name:ru', 'all name:be%', 'all name:ru%',
    'uniq name', 'uniq name:be', 'uniq name:ru', 'uniq name:be%', 'uniq name:ru%',
])
df.to_csv('postgis.csv')
df.style.set_properties(**{'text-align': 'left'}).background_gradient('YlOrRd', subset=[
    'all name:be%', 'all name:ru%', 'uniq name:be%', 'uniq name:ru%',
]).apply(lambda row: [("font-weight: bold" if row.loc['lvl'] == '#' else '') for _ in row], axis=1)

Unnamed: 0,lvl,category,all name,all name:be,all name:ru,all name:be%,all name:ru%,uniq name,uniq name:be,uniq name:ru,uniq name:be%,uniq name:ru%
0,#,admin,58342,56195,56469,0.9632,0.967896,16740,16806,16725,1.003943,0.999104
1,,boundary = administrative,58340,56193,56467,0.963198,0.967895,16740,16806,16725,1.003943,0.999104
2,,admin_level = *,58150,56182,56456,0.966156,0.970868,16735,16802,16721,1.004004,0.999163
3,#,place,75015,74777,74949,0.996827,0.99912,15489,15602,15468,1.007296,0.998644
4,,place = city,75,75,75,1.0,1.0,16,16,16,1.0,1.0
5,,place = town,474,474,474,1.0,1.0,137,138,137,1.007299,1.0
6,,place = village,8586,8574,8582,0.998602,0.999534,2198,2206,2196,1.00364,0.99909
7,,place = hamlet,63805,63697,63774,0.998307,0.999514,13485,13602,13481,1.008676,0.999703
8,,place = isolated_dwelling,2075,1957,2044,0.943133,0.98506,634,589,615,0.929022,0.970032
9,#,locality,14477,11009,13731,0.760448,0.94847,9604,6879,9144,0.716264,0.952103
