In [52]:
import pandas as pd
import geopandas as gpd
import numpy as np

import sys
sys.path.append('..')

from operations import match

## Creation of test set

In [77]:
df = pd.read_pickle('random_sample_nona_matched.pkl') # import of matched footprints
df = df.sample(n=10, random_state=8).reset_index() # selection of 10 random rows, with seed to be reproducible
df = df[['geometry', 'rand_geometry', 'locid', 'rand_locid']] # selection of important columns
df

Unnamed: 0,geometry,rand_geometry,locid,rand_locid
0,"POLYGON ((24.94617 60.17794, 24.94619 60.17793...","POLYGON ((24.94618 60.17770, 24.94599 60.17773...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ..."
1,"POLYGON ((24.95856 60.17206, 24.95892 60.17207...","POLYGON ((24.95899 60.17179, 24.95896 60.17195...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 8, 1, ...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 8, 1, ..."
2,"POLYGON ((-157.82492 21.30654, -157.82484 21.3...","POLYGON ((-157.82503 21.30669, -157.82496 21.3...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 3, 1, ...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 3, 1, ..."
3,"POLYGON ((-157.81974 21.29966, -157.81974 21.2...","POLYGON ((-157.81956 21.29964, -157.81956 21.2...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 1, ...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 1, ..."
4,"POLYGON ((24.94562 60.17816, 24.94516 60.17814...","POLYGON ((24.94585 60.17814, 24.94539 60.17813...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ..."
5,"POLYGON ((24.96072 60.17848, 24.96088 60.17848...","POLYGON ((24.96123 60.17871, 24.96139 60.17871...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 0, 3, 0, ...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 0, 3, 0, ..."
6,"POLYGON ((-157.82166 21.29953, -157.82166 21.2...","POLYGON ((-157.82162 21.29971, -157.82161 21.2...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 8, 3, ...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 8, 3, ..."
7,"POLYGON ((-157.81378 21.29004, -157.81369 21.2...","POLYGON ((-157.81382 21.29024, -157.81374 21.2...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 8, 1, 0, ...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 8, 1, 0, ..."
8,"POLYGON ((-157.82466 21.30767, -157.82463 21.3...","POLYGON ((-157.82463 21.30754, -157.82466 21.3...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 8, 1, 0, 1, 3, ...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 3, 3, 1, ..."
9,"POLYGON ((-157.81525 21.29204, -157.81520 21.2...","POLYGON ((-157.81540 21.29207, -157.81534 21.2...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 2, ...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 2, ..."


In [78]:
# Seperation into two datasets, the randomized ones and the original OSM footprints
# Randomize order of both data frames
# Create a df copy without geometric information
gdf1 = gpd.GeoDataFrame(data=df[['geometry', 'locid']])
gdfx1 = gdf1.sample(frac=1).reset_index(drop=True)
df1= pd.DataFrame(gdf1['locid'])
df1['dataset'] = 1

gdf2 = gpd.GeoDataFrame(geometry = df['rand_geometry'],data=df[['rand_locid']])
gdfx2 = gdf2.sample(frac=1).reset_index(drop=True)
df2 = pd.DataFrame()
df2['locid'] = gdf2['rand_locid']
df2['dataset'] = 2

# Matching

### LocID Matching

In [79]:
%%timeit
df = pd.concat([df1, df2])
sortedf = df.sort_values('locid')
dfs1 = sortedf.iloc[::2].reset_index()
dfs2 = sortedf.iloc[1::2].reset_index()
dfs2.columns = [f'{col}_2' for col in dfs2.columns]
matched = pd.concat([dfs1, dfs2], axis=1)
matched['match_result'] = matched.apply(lambda row: match(row['locid'], row['locid_2']), axis=1)

2.27 ms ± 33.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [91]:
df = pd.concat([df1, df2])
sortedf = df.sort_values('locid')
dfs1 = sortedf.iloc[::2].reset_index()
dfs2 = sortedf.iloc[1::2].reset_index()
df2.columns = [f'{col}_2' for col in df2.columns]
matched = pd.concat([df1, df2], axis=1)


In [92]:
# the columns with the two times the same dataset number could be omitted
matched['match_result'] = matched.apply(lambda row: match(row['locid'], row['locid_2']), axis=1)
matched

Unnamed: 0,locid,dataset,locid_2,dataset_2,match_result
0,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...",1,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...",2,"(0.9404761904761905, b'\x01\x01\x00\x02\x03\x0..."
1,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 8, 1, ...",1,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 8, 1, ...",2,"(0.0, b'\x01\x01\x00\x02\x03\x01\x02\x01\x00\x..."
2,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 3, 1, ...",1,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 3, 1, ...",2,"(0.0, b'\x03\x02\x00\x02\x00\x01\x01\x02\x02\x..."
3,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 1, ...",1,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 1, ...",2,"(0.02857142857142857, b'\x03\x02\x00\x02\x00\x..."
4,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...",1,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...",2,"(0.37566137566137564, b'\x01\x01\x00\x02\x03\x..."
5,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 0, 3, 0, ...",1,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 0, 3, 0, ...",2,"(0.0, b'\x01\x01\x00\x02\x03\x01\x02\x01\x00\x..."
6,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 8, 3, ...",1,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 8, 3, ...",2,"(0.34615384615384615, b'\x03\x02\x00\x02\x00\x..."
7,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 8, 1, 0, ...",1,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 8, 1, 0, ...",2,"(0.1111111111111111, b'\x03\x02\x00\x02\x00\x0..."
8,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 8, 1, 0, 1, 3, ...",1,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 3, 3, 1, ...",2,"(0.0, b'\x03\x02\x00\x02\x00\x01\x01\x02\x02\x..."
9,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 2, ...",1,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 2, ...",2,"(0.0, b'\x03\x02\x00\x02\x00\x01\x01\x02\x02\x..."


### GeoPandas Matching

In [61]:
def overlap(row):
    geom1 = row['geometry_x']
    geom2 = row['geometry_y']
    intersection = geom1.intersection(geom2)
    return intersection.area / (geom1.area + geom2.area - intersection.area)

In [62]:
gdf1 = gdfx1
gdf2 = gdfx2

In [94]:
%%timeit

joined_gdf = gpd.sjoin(gdf1, gdf2, predicate='intersects', how='inner')
gdf2['index_2'] = gdf2.index
joined = joined_gdf.merge(gdf2, on='index_2')
joined['overlap'] = joined.apply(overlap, axis=1)

22.2 ms ± 1.67 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [64]:
gdf1 = gdfx1
gdf2 = gdfx2

joined_gdf = gpd.sjoin(gdf1, gdf2, predicate='intersects', how='inner')
gdf2['index_2'] = gdf2.index
joined = joined_gdf.merge(gdf2, on='index_2')
joined['overlap'] = joined.apply(overlap, axis=1)
joined

Unnamed: 0,geometry_x,locid,index_right,rand_locid_x,index_2,rand_locid_y,geometry_y,overlap
0,"POLYGON ((24.94617 60.17794, 24.94619 60.17793...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...",9,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...",9,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...","POLYGON ((24.94618 60.17770, 24.94599 60.17773...",0.754981
1,"POLYGON ((24.95856 60.17206, 24.95892 60.17207...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 8, 1, ...",6,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 8, 1, ...",6,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 8, 1, ...","POLYGON ((24.95899 60.17179, 24.95896 60.17195...",0.204181
2,"POLYGON ((24.94562 60.17816, 24.94516 60.17814...","[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...",2,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...",2,"[1, 1, 0, 2, 3, 1, 2, 1, 0, 2, 3, 2, 1, 3, 3, ...","POLYGON ((24.94585 60.17814, 24.94539 60.17813...",0.195016
3,"POLYGON ((-157.82166 21.29953, -157.82166 21.2...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 8, 3, ...",1,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 8, 3, ...",1,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 8, 3, ...","POLYGON ((-157.82162 21.29971, -157.82161 21.2...",0.155331
4,"POLYGON ((-157.82466 21.30767, -157.82463 21.3...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 8, 1, 0, 1, 3, ...",7,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 3, 3, 1, ...",7,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 0, 1, 3, 3, 1, ...","POLYGON ((-157.82463 21.30754, -157.82466 21.3...",0.971519
5,"POLYGON ((-157.81974 21.29966, -157.81974 21.2...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 1, ...",4,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 1, ...",4,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 1, ...","POLYGON ((-157.81956 21.29964, -157.81956 21.2...",0.262878
6,"POLYGON ((-157.81525 21.29204, -157.81520 21.2...","[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 2, ...",8,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 2, ...",8,"[3, 2, 0, 2, 0, 1, 1, 2, 2, 1, 3, 1, 3, 2, 2, ...","POLYGON ((-157.81540 21.29207, -157.81534 21.2...",0.109708
