In [1]:
import numpy as np
import pandas as pd
import requests as req
import shapely as shp

from pandana.loaders import osm
from time import sleep

### We need to give OSM an area to consider for the query. We do this by creating a bounding box around the area we are interested in; in this case, Portland

In [2]:
# save the bounding box--PDX
bbox = {'xmax': -122.2575, 'ymax': 45.7859, 'xmin': -123.1095, 'ymin': 45.1453}
nw = shp.geometry.Point(bbox['xmin'], bbox['ymax'])
sw = shp.geometry.Point(bbox['xmin'], bbox['ymin'])
se = shp.geometry.Point(bbox['xmax'], bbox['ymin'])
ne = shp.geometry.Point(bbox['xmax'], bbox['ymax'])

In [4]:
pd.set_option('max_columns', None)

### OSM has a lot of data and we don't want to be overwhelmed. We can use the taginfo api to get useful meta data from OSM

In [5]:
# get the most frequently used amenity tag in OSM from the taginfo api
r = req.get('https://taginfo.openstreetmap.org/api/4/tags/popular?query=amenity')
r.status_code

200

#### Convert the json to a dataframe for easy use

In [6]:
tags = pd.concat(pd.DataFrame.from_dict(d, orient='index').transpose() for d in r.json()['data'])

In [7]:
tags.sort_values(by='count_nodes', ascending=False)

Unnamed: 0,value,projects,count_ways_fraction,count_relations_fraction,count_relations,count_ways,count_nodes,count_all,in_wiki,count_all_fraction,count_nodes_fraction,key
0,bench,11,0,0,21,8367,784816,793204,1,0.0002,0.0063,amenity
0,restaurant,15,0.0003,0.0002,856,149529,623979,774364,1,0.0002,0.005,amenity
0,place_of_worship,15,0.0011,0.0009,4666,465134,439363,909163,1,0.0002,0.0036,amenity
0,school,15,0.001,0.0027,13988,442028,403748,859764,1,0.0002,0.0033,amenity
0,parking,11,0.0049,0.0047,24794,2161368,303495,2489657,1,0.0005,0.0025,amenity
0,cafe,15,0.0001,0,210,39225,257569,297004,1,0.0001,0.0021,amenity
0,fuel,16,0.0002,0.0003,1344,101883,245377,348604,1,0.0001,0.002,amenity
0,post_box,13,0,0,2,100,230804,230906,1,0.0001,0.0019,amenity
0,waste_basket,9,0,0,4,391,224749,225144,1,0,0.0018,amenity
0,fast_food,14,0.0001,0,248,53883,224115,278246,1,0.0001,0.0018,amenity


In [8]:
# the loader for some reason can't take a list of tags so we will have to iterate and concat
tag_pairs = tags.apply(lambda row: row['key'] + '=' + row['value'], axis=1).tolist()

In [9]:
tag_pairs

['amenity=arts_centre',
 'amenity=atm',
 'amenity=bank',
 'amenity=bar',
 'amenity=bbq',
 'amenity=bench',
 'amenity=bicycle_parking',
 'amenity=bicycle_rental',
 'amenity=bus_station',
 'amenity=cafe',
 'amenity=car_rental',
 'amenity=car_wash',
 'amenity=charging_station',
 'amenity=childcare',
 'amenity=cinema',
 'amenity=clinic',
 'amenity=clock',
 'amenity=college',
 'amenity=community_centre',
 'amenity=courthouse',
 'amenity=dentist',
 'amenity=doctors',
 'amenity=drinking_water',
 'amenity=driving_school',
 'amenity=embassy',
 'amenity=fast_food',
 'amenity=ferry_terminal',
 'amenity=fire_station',
 'amenity=fountain',
 'amenity=fuel',
 'amenity=grave_yard',
 'amenity=grit_bin',
 'amenity=hospital',
 'amenity=hunting_stand',
 'amenity=ice_cream',
 'amenity=kindergarten',
 'amenity=library',
 'amenity=marketplace',
 'amenity=mobile_money_agent',
 'amenity=nightclub',
 'amenity=nursing_home',
 'amenity=parking',
 'amenity=parking_entrance',
 'amenity=parking_space',
 'amenity=pha

In [10]:
# we are only interested in some of these; let's filter now to spare the api
keepers = ['restaurant', 'cafe', 'pub', 'bar', 'theater', 'college', 'cinema', 'arts_centre', 'nightclub', 'university']
new_ls = []
for keeper in keepers:
    new_ls.append('amenity={}'.format(keeper))

In [11]:
new_ls

['amenity=restaurant',
 'amenity=cafe',
 'amenity=pub',
 'amenity=bar',
 'amenity=theater',
 'amenity=college',
 'amenity=cinema',
 'amenity=arts_centre',
 'amenity=nightclub',
 'amenity=university']

### Generally, the OSM api is not meant to be used over such a large area. In thise case since we are interested in such a small subset of data it will do. Still, the sleep time (t) might need adjusting in order to get the data back. Too long and you risk timing out, too short you might get throttled

In [16]:
def pdx_amenities(pairs):
    t = 4
    dfs = []
    for pair in pairs:
        try:
            df = osm.node_query(bbox['ymin'], bbox['xmin'], bbox['ymax'], bbox['xmax'], tags=pair).reset_index()
            keep_cols = ['id', 'lat', 'lon']
            if 'name'  in df.columns.tolist():
                keep_cols.append('name')
            df = df[keep_cols]
            df['amenity'] = pair.split('=')[1]
            dfs.append(df)
        except RuntimeError as e:
            print(pair + ': ')
            print(e)
        # try to avoid auto throttle
        sleep(t)
    return pd.concat(dfs, ignore_index=True)

In [17]:
test = pdx_amenities(new_ls)

amenity=theater: 
OSM query results contain no data.


In [18]:
test.head(20)

Unnamed: 0,id,lat,lon,name,amenity
0,287206333,45.534615,-122.916619,Swagath,restaurant
1,287207155,45.550412,-122.904555,Pho Tango,restaurant
2,323588282,45.431159,-122.564075,Gustav's,restaurant
3,324193662,45.504988,-122.619285,New Thai Blues,restaurant
4,324193669,45.497759,-122.614058,Pho Hung,restaurant
5,324207464,45.523454,-122.697037,Thai Orchid,restaurant
6,324207465,45.523463,-122.697231,Pizza Oasis,restaurant
7,324207469,45.524147,-122.69824,Uptown Billiards,restaurant
8,324207483,45.529349,-122.698433,Pepinos Mexican food,restaurant
9,324207484,45.529458,-122.698413,Little Big Burger,restaurant


In [19]:
test.groupby('amenity').size().sort_values(ascending=False)

amenity
restaurant     648
cafe           365
pub             95
bar             84
cinema          16
college         12
arts_centre      8
nightclub        6
university       2
dtype: int64

In [21]:
test.id.size

1236

In [22]:
# create some useful grouping
test['alias'] = test.apply(lambda row: 'food_drink' if row['amenity'] in ('restaurant', 'pub', 'bar') else 'entertainment' if row['amenity'] in ('cinema', 'arts_centre', 'nightclub', 'theater') else 'college' if row['amenity'] in ('college', 'university') else row['amenity'], axis=1)

In [23]:
test

Unnamed: 0,id,lat,lon,name,amenity,alias
0,287206333,45.534615,-122.916619,Swagath,restaurant,food_drink
1,287207155,45.550412,-122.904555,Pho Tango,restaurant,food_drink
2,323588282,45.431159,-122.564075,Gustav's,restaurant,food_drink
3,324193662,45.504988,-122.619285,New Thai Blues,restaurant,food_drink
4,324193669,45.497759,-122.614058,Pho Hung,restaurant,food_drink
5,324207464,45.523454,-122.697037,Thai Orchid,restaurant,food_drink
6,324207465,45.523463,-122.697231,Pizza Oasis,restaurant,food_drink
7,324207469,45.524147,-122.698240,Uptown Billiards,restaurant,food_drink
8,324207483,45.529349,-122.698433,Pepinos Mexican food,restaurant,food_drink
9,324207484,45.529458,-122.698413,Little Big Burger,restaurant,food_drink


### Write the data to a csv for later use

In [32]:
test.to_csv('./data/osm.csv', index=False)