In [1]:
# next todos:
# construct a vector for one bounding box
#    - fetch all the features within a bb (allready works) [x]
#    - wheig the features according to their median size in sample dataset [x]
#    - feed words acording to weiths into a document  [x]
#    - generate a vector for this document [x]
# need to clip feautres with the boundingbox [x]

In [2]:
import parse_osm
import tag_handler
from shapely.geometry import Polygon
import pandas as pd
import geopandas as gpd
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [3]:
class Spatial_Semantic_Vector:
    
    def __init__(self, tag_df_path, tag_median_size_df_path):
        self.tag_df =  pd.read_pickle(tag_df_path)
        self.tag_median_size_df = pd.read_pickle(tag_median_size_df_path).set_index(['key','value']).sort_index()
        # replace pd nan with None
        self.tag_median_size_df = self.tag_median_size_df.where(pd.notnull(self.tag_median_size_df), None)
        self.filtertags_handler = tag_handler.filtertags(self.tag_df)
        # from here on out we want key value in the tag_df as the index because it makes
        # the look up faster
        # we sort the index to handle the "1: PerformanceWarning: indexing past lexsort 
        # depth may impact performance." Warning
        self.tag_df = self.tag_df.set_index(['key','value']).sort_index()
        self.doc2vec_model = Doc2Vec.load("../data/en_wikipedia_corpus/doc2vec_eng.pickel")


    def _compose_document(self, element):
        # example element:
        # {'leisure pitch': {'area': 373.27528143301606}, 'sport basketball': {'area': 373.27528143301606}}
        doc_shard = ''
        for key in element.keys():
            group, sub_type =  key.split(' ')
            # we lookup the combination in the tag_df
            row = self.tag_df.loc[group, sub_type]
            # and retrieve both the wikidata and the wikipedia text from it
            wikidata_desc = row.iloc[0].wikidata_desc
            wikipedia_desc = row.iloc[0].en_text

            # now we test if we aktualy have wikipedia text
            if wikipedia_desc:
                text = wikipedia_desc
            # if not we take the wikidata description instead
            else:
                text = wikidata_desc

            # now we test if we have an area or a lenght for the group sub_type combo
            if element[key]:
                # this is uqly and needs restructuring:
                if element[key].get('area'):
                    area = element[key].get('area')
                    # now we need to get the median area for this group sub_type combo
                    try:
                        m_area = self.tag_median_size_df.loc[group,sub_type].median_area
                    except:
                        m_area = None

                    if m_area:
                        doc_shard += self._text_weigher(text,area/m_area)
                    else:
                        # if we dont have an median we wiegh with 0
                        doc_shard += self._text_weigher(text,0)  

                if element[key].get('length'):
                    length = element[key].get('length')
                    # now we need to get the median area for this group sub_type combo
                    try:
                        m_length = self.tag_median_size_df.loc[group,sub_type].median_length
                    except:
                        m_length = None

                    if m_length:
                        doc_shard += self._text_weigher(text,length/m_length)
                    else:
                        # if we dont have an median we wiegh with 0
                        doc_shard += self._text_weigher(text,0)                

            else:
                # else is the case when the element is just a node thus it does not
                # have an area or length
                doc_shard += self._text_weigher(text,0)

        return doc_shard

    def _text_weigher(self, text, weight):
        # for the time beeing we just round the weight to the next intiger
        weight = round(weight)

        if weight <= 1 :
            # we want the text atleast once 
            w_text = text

        else:
            text += ' '
            w_text = text*weight

        return w_text

    def _clip_2d_features(self, feature):
        feature_cliped = {}
        for geometry in ['line','polygon', 'multipolygon', 'multiline', 'multipoint']:
            if feature.get(geometry):
                # if we found a geometry we clip it to the extent of the bounding box
                intersection = self.boundingbox.intersection(feature.get(geometry).buffer(0))
                if intersection:
                    feature_cliped[geometry] = intersection
        if 'point' in feature:
            # points are simply copied over
            feature_cliped['point'] = feature['point']

        return feature_cliped

    def generate_vec(self, boundingbox):
        self.boundingbox = boundingbox
        # first we fetch for the given boundingbox features from OSM:
        osm_handle = parse_osm.disect_osm(parse_osm.json_from_osm(boundingbox))
        # from these features we filter out the features with tags that we have documents for 
        selected_tags_df = osm_handle.feature_df[osm_handle.feature_df.apply(self.filtertags_handler.filter_them, axis = 1)]
        # we then cast these features to a list so that in the next step we generate thier geometry
        type_id_list = selected_tags_df.index.tolist()
        # for the selected features we calculate the geometry:
        [osm_handle.get_geometry(f_type,osm_id) for f_type, osm_id in type_id_list]
        #then we have to reslect the dataframe and make a copy of it for further proccesing
        selected_tags_df = osm_handle.feature_df[osm_handle.feature_df.apply(self.filtertags_handler.filter_them, axis = 1)].copy()
        # we need to clip the 2 dimensional feautres to the extent of the $boundingbox
        selected_tags_df['geometry'] = selected_tags_df['geometry'].apply(self._clip_2d_features)
        # filter out feauters who no longer have a geometry after clipping
        selected_tags_df = selected_tags_df.loc[selected_tags_df['geometry']!={}]
        # lastly for the found featres with tags that intresst us we wiegh thier size 
        # (if ther is one) and combine them into a meta document
        combined_document = ''
        for element in selected_tags_df[['geometry','tags']].apply(self.filtertags_handler.calcualte_size_for_tags, axis=1).tolist():
            # and then we add the document shard for a given element
            combined_document += self._compose_document(element)
            # some padding
            combined_document += ' '
        # the combinded document we feed into ou doc2vec model and generate a vector
        vec = self.doc2vec_model.infer_vector(combined_document.split(' '))

        return vec

In [4]:
spatial_semantic_vector_obj = Spatial_Semantic_Vector('../data/spatial_semantics/kv_df_just_eng.pickle','../data/spatial_semantics/tag_sizes_median_df.pickle')
q_poly = Polygon([(16.36896371841431,48.20063653233946),(16.373598575592045,48.19960677385028),(16.371034383773807,48.19771882952509),(16.36772990226746,48.19816221664037)])
spatial_semantic_vector_obj.generate_vec(q_poly)

array([-1.24275461e-01, -1.30907357e-01, -6.22762716e-04, -5.50206602e-01,
        1.39365107e-01, -3.68225366e-01,  5.71599901e-01,  1.99213296e-01,
        3.69874805e-01,  6.50141761e-02,  3.07978004e-01, -2.23174728e-02,
        2.33467370e-01, -4.96434838e-01,  3.07873845e-01,  1.13946892e-01,
       -1.01813659e-01,  2.72812188e-01, -5.24157226e-01, -5.86416006e-01,
        2.76123941e-01,  4.62491930e-01,  4.43891555e-01, -1.99367613e-01,
        3.48979294e-01,  7.27404933e-03,  3.55110914e-01,  3.96709144e-02,
       -2.61587173e-01,  3.71331833e-02,  1.22273289e-01, -7.08876550e-02,
        3.53142351e-01, -2.90880948e-02, -2.13600039e-01, -6.20221347e-02,
       -4.36727703e-02,  5.20475924e-01, -2.60960221e-01,  3.42150852e-02,
       -2.39130139e-01,  2.94416785e-01, -4.03456837e-01, -9.16286930e-02,
       -2.48920530e-01, -2.21156955e-01, -1.37600034e-01, -2.51091868e-01,
        8.15836936e-02,  7.24368542e-03,  3.92527990e-02, -3.04578602e-01,
        5.76640032e-02, -

In [None]:
tag_df =  pd.read_pickle('../data/spatial_semantics/kv_df_just_eng.pickle')
tag_median_size_df = pd.read_pickle('../data/spatial_semantics/tag_sizes_median_df.pickle').set_index(['key','value']).sort_index()
#replace pd nan with None
tag_median_size_df = tag_median_size_df.where(pd.notnull(tag_median_size_df), None)
filtertags_handler = tag_handler.filtertags(tag_df)
# from here on out we want key value in the tag_df as the index because it makes
# the look up faster
# we sort the index to handle the "1: PerformanceWarning: indexing past lexsort 
# depth may impact performance." Warning
tag_df = tag_df.set_index(['key','value']).sort_index()
# load doc2vec model
doc2vec_model = Doc2Vec.load("../data/en_wikipedia_corpus/doc2vec_eng.pickel")

# TU Wien polygon
q_poly = Polygon([(16.36896371841431,48.20063653233946),(16.373598575592045,48.19960677385028),(16.371034383773807,48.19771882952509),(16.36772990226746,48.19816221664037)])

In [None]:
osm_handle = parse_osm.disect_osm(parse_osm.json_from_osm(q_poly))
osm_handle.feature_df

In [None]:
selected_tags_df = osm_handle.feature_df[osm_handle.feature_df.apply(filtertags_handler.filter_them, axis = 1)]
type_id_list = selected_tags_df.index.tolist()
# for the selected features we calculate the geometry:
[osm_handle.get_geometry(f_type,osm_id) for f_type, osm_id in type_id_list]
# reselect to fetch the generated geometries
selected_tags_df = osm_handle.feature_df[osm_handle.feature_df.apply(filtertags_handler.filter_them, axis = 1)].copy()
selected_tags_df

In [None]:
# first we need to clip the 2 dimensional feautres 
selected_tags_df['geometry'] = selected_tags_df['geometry'].apply(clip_2d_features)
# filter out feauters who no longer have a geometry after clipping
selected_tags_df = selected_tags_df.loc[selected_tags_df['geometry']!={}]
selected_tags_df

In [None]:
combined_document = ''
for element in selected_tags_df[['geometry','tags']].apply(filtertags_handler.calcualte_size_for_tags, axis=1).tolist():
    # and then we add the document shard for a given element
    combined_document += compose_document(element)
    # some padding
    combined_document += ' '

In [None]:
import ipyleaflet
from ipyleaflet import Map, basemaps, basemap_to_tiles, GeoJSON, WKTLayer
from ipywidgets import Label




m = Map(
    basemap=basemaps.CartoDB.Positron,
    # for some reason lat lon are switch for centering the map
    center=((q_poly.centroid.coords[0][1],q_poly.centroid.coords[0][0])),
    #center=((test_multi['multipolygon'].centroid.coords[0][1],test_multi['multipolygon'].centroid.coords[0][0])),
    zoom=14
)

wlayer = WKTLayer(
    wkt_string=q_poly.wkt,
    #hover_style={"fillColor": "red"},
    fill_color="red",
    color="red",
)

wlayer2 = WKTLayer(
    wkt_string=selected_tags_df_test.loc['relation',22494].geometry_clipped['multiline'].wkt,
    #hover_style={"fillColor": "red"},
    fill_color="blue",
    color="blue",
)
m.add_layer(wlayer)
m.add_layer(wlayer2)


# m.add_layer(geo_json)


m