# 0. Shapely Validation and Checking Geometry Types

Two main issues in input data
- Invalid Geometry
- Shapes of type Linestring, Multilinestring, Multipolygon and GeometryCollection

In [79]:
import geopandas as gpd
from shapely.validation import explain_validity

Raw data found here https://drive.google.com/drive/folders/1v7xLIkI-Va7H_dUAJ1z9yBJgV3YuYp6T

In [80]:
buildings=gpd.read_parquet('buildings_berlin.parquet',columns=['geometry'])

In [81]:
buildings

Unnamed: 0,geometry
0,"POLYGON ((13.30277 52.51964, 13.30332 52.51964..."
1,"POLYGON ((13.35264 52.53331, 13.35276 52.53322..."
2,"POLYGON ((13.35397 52.52302, 13.35395 52.52295..."
3,"POLYGON ((13.35522 52.52727, 13.35512 52.52739..."
4,"POLYGON ((13.36014 52.53152, 13.35987 52.53184..."
...,...
1057114,"POLYGON ((13.4585 52.50118, 13.45827 52.50122,..."
1057115,"POLYGON ((13.11685 52.39058, 13.1174 52.39058,..."
1057116,"POLYGON ((13.2772 52.53379, 13.27694 52.53323,..."
1057117,"POLYGON ((13.43921 52.43791, 13.43934 52.43803..."


In [82]:
buildings = buildings.to_crs(31468)

## Make valid

In [83]:
buildings['validity'] = buildings.apply(lambda row: explain_validity(row.geometry), axis=1)

In [84]:
buildings['validity'].value_counts()

validity
Valid Geometry                                               1057112
Self-intersection[4551345.80642101 5853834.63242366]               1
Self-intersection[4551349.67275136 5853420.89689289]               1
Self-intersection[4612124.234194 5789767.12477346]                 1
Self-intersection[4606844.52017001 5840214.06887203]               1
Ring Self-intersection[4574435.09528988 5803938.14170872]          1
Ring Self-intersection[4574160.59604384 5804459.11605548]          1
Ring Self-intersection[4607184.70404971 5810069.99237562]          1
Name: count, dtype: int64

In [85]:
buildings_valid=buildings.make_valid()

In [86]:
buildings=gpd.GeoDataFrame({'geometry':buildings_valid})

In [87]:
buildings

Unnamed: 0,geometry
0,"POLYGON ((4588533.734 5821521.726, 4588570.786..."
1,"POLYGON ((4591889.97 5823105.866, 4591898.467 ..."
2,"POLYGON ((4592002.064 5821961.774, 4592000.684..."
3,"POLYGON ((4592077.946 5822437.286, 4592070.997..."
4,"POLYGON ((4592403.073 5822915.869, 4592383.652..."
...,...
1057114,"POLYGON ((4599145.094 5819669.998, 4599129.491..."
1057115,"POLYGON ((4576136.11 5806949.507, 4576173.534 ..."
1057116,"POLYGON ((4586770.205 5823065.392, 4586753.742..."
1057117,"POLYGON ((4597976.059 5812604.354, 4597984.496..."


## Check Geometry Type

In [88]:
buildings['geometry'].geom_type.value_counts()

Polygon               1056951
MultiPolygon               82
LineString                 49
MultiLineString            33
GeometryCollection          4
Name: count, dtype: int64

In [89]:
buildings[buildings.geom_type=='MultiPolygon'].explore()

### Explode dataframe to turn multipolygons into polygons and remove linestrings

In [90]:
buildings=buildings.explode(ignore_index=True)

`geo_col_name = gdf.active_geometry_name; gdf.set_geometry(new_geo_col).drop(columns=geo_col_name).rename_geometry(geo_col_name)`.
  return gf.set_geometry(col, drop=drop, inplace=False, crs=crs)


In [91]:
buildings

Unnamed: 0,geometry
0,"POLYGON ((4588533.734 5821521.726, 4588570.786..."
1,"POLYGON ((4591889.97 5823105.866, 4591898.467 ..."
2,"POLYGON ((4592002.064 5821961.774, 4592000.684..."
3,"POLYGON ((4592077.946 5822437.286, 4592070.997..."
4,"POLYGON ((4592403.073 5822915.869, 4592383.652..."
...,...
1057412,"POLYGON ((4599145.094 5819669.998, 4599129.491..."
1057413,"POLYGON ((4576136.11 5806949.507, 4576173.534 ..."
1057414,"POLYGON ((4586770.205 5823065.392, 4586753.742..."
1057415,"POLYGON ((4597976.059 5812604.354, 4597984.496..."


In [92]:
buildings['geometry'].geom_type.value_counts()

Polygon            1057241
LineString             169
MultiLineString          4
MultiPolygon             3
Name: count, dtype: int64

In [93]:
buildings=buildings[buildings['geometry'].geom_type=='Polygon']

In [94]:
buildings['geometry'].geom_type.value_counts()

Polygon    1057241
Name: count, dtype: int64

### Save to parquet

In [95]:
buildings.to_parquet('buildings_berlin_0.parquet')