In [3]:
# The idea of this script is to calculate size distributions of OSM features.
# Of interest is a distribution for each key-value pair that is
# listed on the wiki pages referenced in osm_groups.txt.
# For this we randomly query the whole OSM dataset to collect samples.

In [4]:
# https://gis.stackexchange.com/questions/127427/transforming-shapely-polygon-and-multipolygon-objects

In [5]:
import parse_osm
import tag_handler
from shapely.geometry import Polygon, LineString, Point, MultiPolygon, MultiLineString, MultiPoint
from shapely.ops import transform
import random
import pandas as pd
import geopandas as gpd
import pyproj
import numpy as np
import statistics

In [6]:
def agg_len_area(x):
    """Collect the usable area and length samples of one key/value group.

    Written to be passed to ``DataFrame.groupby(...).apply``.

    Parameters
    ----------
    x : pandas.DataFrame
        The rows of a single group; must provide the columns ``'area'`` and
        ``'length'``.

    Returns
    -------
    pandas.Series
        Two entries, ``'area'`` and ``'length'``, each holding a plain Python
        list of the group's values with missing (``None``/``NaN``) and zero
        entries removed.
    """
    def _usable(column):
        # NaN is truthy, so a bare truthiness test would let it through and
        # distort any statistic computed from the list. Testing for missing
        # values first also keeps pd.NA from being evaluated as a boolean.
        return [el for el in column.tolist() if pd.notna(el) and el]

    area = _usable(x['area'])
    length = _usable(x['length'])

    return pd.Series({'area': area, 'length': length}, index=['area', 'length'])


def select_ways_and_relations(df):
    """Restrict a feature table to its 'way' and 'relation' entries.

    Rows are looked up by index label. Whichever of the labels 'way' and
    'relation' occur in ``df.index`` are selected (ways first); if neither
    occurs, an empty frame with the same columns is returned.
    """
    wanted = [label for label in ('way', 'relation') if label in df.index]

    if not wanted:
        # neither ways nor relations present: hand back an empty slice of df
        return df.iloc[0:0]

    return df.loc[wanted]


def calc_median(values):
    """Return the median of ``values``; ``None`` if ``values`` is empty or ``None``."""
    return statistics.median(values) if values else None

In [7]:
# Setup parameters

# Number of iterations, i.e. how many random bounding boxes are sampled.
n_iter = 10


# We can define an area in which we look for key-value combinations.
# If this is not supplied the whole world will be queried.
# The polygon can be arbitrary (must be in degrees), but only the lower left
# corner of the random bounding box is guaranteed to lie within the polygon.
# q_poly = Polygon([(52.450425727741,13.286182880402),(52.458323725344,13.286182880402),(52.458323725344,13.299744129181),(52.450425727741,13.299744129181)])

level_1_admin = gpd.read_file("../data/shapes/level_1_admin/ne_10m_admin_1_states_provinces.shp")
# In this case we select Vienna as the polygon.
q_poly = level_1_admin.loc[level_1_admin['name'] == 'Wien', 'geometry'].iloc[0]


# Alternative: use a whole country (here Austria) as the polygon.
#level_0_admin = gpd.read_file("../data/shapes/level_0_admin/ne_10m_admin_0_countries.shp")
#q_poly  = level_0_admin[level_0_admin['NAME']=='Austria']['geometry'].iloc[0]
# x,y need to be flipped because in the loaded shape x == lon and y == lat,
# but overpass wants this to be the other way around.
# q_poly

# Size of the random bounding box in degrees (0.001° is roughly 111 m).
bb_size = 0.005

# Load the predefined tags from pickle (only unpickle files you trust).
tag_df = pd.read_pickle('../data/spatial_semantics/kv_df_just_eng.pickle')
filtertags_handler = tag_handler.filtertags(tag_df)

# tag_list = tag_handler.generate_tag_list(tag_df)

# results_df collects one row per sampled key-value occurrence.
results_df = pd.DataFrame(columns=['key', 'value', 'area', 'length'])

In [8]:
# Sample n_iter random bounding boxes, query OSM for each of them and record
# the area/length of every feature carrying one of the wanted tags.
#
# Rows are gathered in a plain list and turned into a DataFrame once at the
# end: growing a DataFrame row by row is quadratic, and DataFrame.append no
# longer exists in pandas 2.x. Rows already present in results_df are carried
# over, so re-running this cell keeps extending the sample as before.
result_records = results_df.to_dict('records')

for i in range(n_iter):
    r_bb = tag_handler.random_bb(bb_size, q_poly)
    # Fixed bounding box for debugging:
#     r_bb = Polygon([(16.366605567347907, 48.145976748430115),
#  (16.371605567347906, 48.145976748430115),
#  (16.371605567347906, 48.15097674843012),
#  (16.366605567347907, 48.15097674843012),
#  (16.366605567347907, 48.145976748430115)])

    # use the generated polygon to query OSM for all features within the bounding box
    osm_handle = parse_osm.disect_osm(parse_osm.json_from_osm(r_bb))

    # keep features that a) carry a tag accepted by the tag filter and
    # b) are ways or relations, i.e. not just a point
    ways_relations_df = select_ways_and_relations(osm_handle.feature_df)
    selected_tags_df = ways_relations_df[ways_relations_df.apply(filtertags_handler.filter_them, axis=1)]
    type_id_list = selected_tags_df.index.tolist()

    # compute the geometry of every selected feature; the call is made for its
    # effect on osm_handle, the return value is not used
    for f_type, osm_id in type_id_list:
        osm_handle.get_geometry(f_type, osm_id)

    # reselect to pick up the geometries generated above
    ways_relations_df = select_ways_and_relations(osm_handle.feature_df)
    selected_tags_df = ways_relations_df[ways_relations_df.apply(filtertags_handler.filter_them, axis=1)]

    # for the selected features calculate, where applicable, area, length or
    # both, and split the result into one record per key-value pair
    if len(selected_tags_df) > 0:
        sizes_per_feature = selected_tags_df[['geometry', 'tags']].apply(
            filtertags_handler.calcualte_size_for_tags, axis=1
        ).tolist()

        for element in sizes_per_feature:
            for key_val, sizes in element.items():
                # split on the first blank only, so a value that itself
                # contains blanks no longer breaks the unpacking
                # (assumes the key never contains a blank)
                key, value = key_val.split(' ', 1)
                result_records.append({
                    'key': key,
                    'value': value,
                    'area': sizes.get('area'),
                    'length': sizes.get('length'),
                })

    print(f'finished iteration {i} so far we grabbed {len(result_records)} features')

# Build the frame once. Casting to object first guarantees that missing
# entries really become Python None (in a float column None would be turned
# back into NaN).
results_df = pd.DataFrame(result_records, columns=['key', 'value', 'area', 'length']).astype(object)
results_df = results_df.where(results_df.notna(), None)

# group the different key-value pairs together with the values fetched from OSM
results_grouped_df = results_df.groupby(['key', 'value']).apply(agg_len_area).reset_index()

finished iteration 0 so far we grabbed 110 features
finished iteration 1 so far we grabbed 112 features
next open slot in 50 seconds waiting till then
finished iteration 2 so far we grabbed 116 features
finished iteration 3 so far we grabbed 411 features
finished iteration 4 so far we grabbed 426 features
next open slot in 8 seconds waiting till then
finished iteration 5 so far we grabbed 644 features
next open slot in 62 seconds waiting till then
finished iteration 6 so far we grabbed 748 features
next open slot in 10 seconds waiting till then
finished iteration 7 so far we grabbed 752 features
next open slot in 30 seconds waiting till then
finished iteration 8 so far we grabbed 938 features
next open slot in 8 seconds waiting till then
finished iteration 9 so far we grabbed 1368 features


In [None]:
# For easy access we calculate the median of the area and of the length
# samples for each key-value pair.
results_grouped_df = results_grouped_df.assign(
    median_area=results_grouped_df['area'].apply(calc_median),
    median_length=results_grouped_df['length'].apply(calc_median),
)
results_grouped_df

In [None]:
# Persist the grouped result table to disk as a pickle, then display it.
results_grouped_df.to_pickle('../data/spatial_semantics/tag_sizes_median_df.pickle')
results_grouped_df

In [None]:
# Inspect the index entries of the features selected in the last loop
# iteration (selected_tags_df is left over from the sampling loop).
selected_tags_df.index.tolist()

In [None]:
# next todos:
# construct a vector for one bounding box
#    - fetch all the features within a bb (already works) [x]
#    - weigh the features according to their median size in the sample dataset
#    - feed words according to their weights into a document
#    - generate a vector for this document

In [None]:
# fetch all features and filter