<a href="https://colab.research.google.com/github/z-gard/analysis/blob/main/notebooks/OpenStreetMap.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [96]:
# Mount Google Drive at /content/drive (Colab-only helper) so later cells can
# write their output files underneath that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
!pip install -q pyrosm　osmread

In [50]:
import os
# Set before geopandas is imported so the flag is already in the environment at
# import time (presumably selects Shapely instead of PyGEOS as the geometry
# backend — confirm against the installed geopandas version).
os.environ["USE_PYGEOS"] = "0"
import geopandas
import pandas as pd
# Allow up to 100 rows / 100 columns when a DataFrame is displayed; the tag
# tables built below have several hundred columns.
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)

In [3]:
from pyrosm import OSM, get_data
from osmread import parse_file, Node

In [98]:
# Directory the exported CSV files are written to; it sits under the Google
# Drive mount point (/content/drive), so the drive must be mounted first.
DATA_DIR = '/content/drive/MyDrive/z-gard/data/osm'

### OSMファイルをダウンロード

In [20]:
# Local working directory for the downloaded OSM extract.
osm_dir = './osm'
os.makedirs(osm_dir, exist_ok=True)

# OSM data for the Kanto region. The value returned by get_data is passed to
# parse_file() later, so it is expected to be the path of the downloaded .pbf
# file — confirm against the pyrosm documentation.
kanto_pbf = get_data('kanto', directory=osm_dir)

### OSMファイルをparseしてDataFrameに変換
- https://wiki.openstreetmap.org/wiki/Map_features

In [156]:
def get_category_data(pbf_file, cat):
    """Collect every OSM node carrying the tag key ``cat`` into a DataFrame.

    The file is read in full with ``parse_file`` on every call, so each call
    costs a complete pass over the extract. Only ``Node`` entities are kept;
    other entity types yielded by the parser are ignored.

    Args:
        pbf_file: Path of the OSM file handed to ``parse_file``.
        cat: Tag key to look for (e.g. ``'shop'``, ``'amenity'``).

    Returns:
        DataFrame with one row per matching node. It contains ``id``, ``lon``,
        ``lat``, one ``tags.<key>`` column per tag key seen, plus two
        convenience columns: ``name`` (copy of ``tags.name``) and ``category``
        (copy of ``tags.<cat>``). These columns are always present, even when
        no node matched or no matching node has a ``name`` tag.
    """
    records = [
        {
            'id': entity.id,
            'tags': entity.tags,
            'lon': entity.lon,
            'lat': entity.lat,
        }
        for entity in parse_file(pbf_file)
        if isinstance(entity, Node) and cat in entity.tags
    ]
    frame = pd.json_normalize(records)

    # json_normalize only creates columns for keys it actually saw. Without
    # this guard, an empty result or a result where no node has a `name` tag
    # would raise KeyError on the lookups below.
    for required in ('id', 'lon', 'lat', 'tags.name', f'tags.{cat}'):
        if required not in frame.columns:
            frame[required] = None

    frame['name'] = frame['tags.name']
    frame['category'] = frame[f'tags.{cat}']
    return frame

In [164]:
def extract_columns(df_data, cat, min_ratio=0.1):
    """Return the well-populated tag columns for the rows of one category.

    Rows whose ``category`` equals ``cat`` are selected. Every column other
    than ``id``, ``lon``, ``lat``, ``category`` and ``name`` is kept when its
    number of non-null values in that subset is strictly greater than
    ``min_ratio`` times the subset size. Each kept column is printed together
    with its non-null count.

    Args:
        df_data: DataFrame containing a ``category`` column.
        cat: Category value used to filter the rows.
        min_ratio: Exclusive lower bound on the fraction of non-null values a
            column needs in order to be kept. Defaults to 0.1, the threshold
            that used to be hard-coded.

    Returns:
        The filtered rows restricted to the kept columns (possibly no columns).
    """
    fixed_columns = {'id', 'lon', 'lat', 'category', 'name'}
    subset = df_data[df_data['category'] == cat]
    threshold = len(subset) * min_ratio

    kept = []
    for column in subset.columns:
        if column in fixed_columns:
            continue
        filled = int(subset[column].notna().sum())
        if filled > threshold:
            print(column, filled)
            kept.append(column)
    return subset[kept]

### shop

In [70]:
%%time
# Build the table of nodes tagged with `shop`. This re-reads the whole extract
# and takes several minutes (see the timing output below).
df_shop = get_category_data(kanto_pbf, 'shop')
print(df_shop.shape)

(68499, 567)
CPU times: user 5min 6s, sys: 1.61 s, total: 5min 8s
Wall time: 5min 8s


In [112]:
# Where the plain `name` tag is missing, fall back to the Japanese name first
# and then to the English name.
df_shop['name'] = (
    df_shop['name']
    .fillna(df_shop['tags.name:ja'])
    .fillna(df_shop['tags.name:en'])
)

In [113]:
# Export only the core columns of the shop table; utf_8_sig writes a BOM.
file_name = os.path.join(DATA_DIR, 'osm_shop.csv')
df_shop.to_csv(
    file_name,
    columns=['id', 'lon', 'lat', 'category', 'name'],
    encoding='utf_8_sig',
    index=False,
)

In [114]:
# The 20 most frequent `shop` values among the collected nodes.
df_shop['category'].value_counts().head(20)

convenience      13772
hairdresser       6490
supermarket       4663
clothes           3816
car               2531
dry_cleaning      2140
massage           2113
confectionery     1642
bakery            1605
variety_store     1332
mobile_phone      1318
yes               1229
books             1212
alcohol           1157
bicycle           1097
florist           1060
laundry           1056
electronics        929
deli               882
optician           805
Name: category, dtype: int64

### amenity

In [107]:
%%time
# Build the table of nodes tagged with `amenity`. This is another full pass
# over the extract and takes several minutes (see the timing output below).
df_amenity = get_category_data(kanto_pbf, 'amenity')
print(df_amenity.shape)

(179801, 941)
CPU times: user 6min 12s, sys: 5.35 s, total: 6min 17s
Wall time: 6min 17s


In [151]:
# The 20 most frequent `amenity` values among the collected nodes.
df_amenity['category'].value_counts().head(20)

restaurant          27245
vending_machine     10354
bench                9874
fast_food            8833
place_of_worship     8327
pub                  7332
kindergarten         7181
cafe                 6789
social_facility      6385
toilets              6380
pharmacy             6290
post_box             5795
parking              4850
dentist              4274
telephone            4193
doctors              4035
post_office          3953
drinking_water       3543
school               3152
police               3014
Name: category, dtype: int64

In [154]:
# Export the amenity table without the listed street-furniture categories,
# keeping only the core columns.
file_name = os.path.join(DATA_DIR, 'osm_amenity.csv')
df_amenity.loc[
    lambda frame: ~frame['category'].isin(
        ['vending_machine', 'bench', 'toilets', 'post_box', 'parking', 'telephone', 'drinking_water']
    )
].to_csv(
    file_name,
    columns=['id', 'lon', 'lat', 'category', 'name'],
    encoding='utf_8_sig',
    index=False,
)

In [155]:
# extract_columns(df_amenity, 'public_bath')

### building

In [158]:
%%time
df_building = get_category_data(kanto_pbf, 'building')
print(df_building.shape)

(1894, 181)
CPU times: user 4min 47s, sys: 857 ms, total: 4min 48s
Wall time: 4min 47s


In [160]:
# The 20 most frequent `building` values among the collected nodes.
df_building['category'].value_counts().head(20)

yes                  730
public               530
apartments           171
detached              55
industrial            45
retail                40
office                35
warehouse             33
house                 32
residential           29
entrance              21
shrine                18
train_station         17
school                15
roof                  15
civic                 11
wayside_shrine         9
commercial             8
church                 8
transformer_tower      7
Name: category, dtype: int64

In [171]:
# Export only the core columns of the building table; utf_8_sig writes a BOM.
file_name = os.path.join(DATA_DIR, 'osm_building.csv')
df_building.to_csv(
    file_name,
    columns=['id', 'lon', 'lat', 'category', 'name'],
    encoding='utf_8_sig',
    index=False,
)

### landuse

In [159]:
%%time
df_landuse = get_category_data(kanto_pbf, 'landuse')
print(df_landuse.shape)

(194, 86)
CPU times: user 4min 48s, sys: 808 ms, total: 4min 49s
Wall time: 4min 48s


In [172]:
# The 20 most frequent `landuse` values among the collected nodes.
df_landuse['category'].value_counts().head(20)

residential          73
military             22
cemetery             22
farmland             12
industrial           11
recreation_ground    10
construction          5
landfill              5
grass                 4
commercial            4
retail                4
forest                3
orchard               3
railway               2
basin                 2
flowerbed             2
reservoir             1
institutional         1
village_green         1
allotments            1
Name: category, dtype: int64