In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point
from fiona.crs import from_epsg

In [None]:
# Load the OSM export into a DataFrame; later cells read its 'lon' and 'lat' columns.
csv = "osm_data.csv"
df_csv = pd.read_csv(csv)
df_csv

In [None]:
# Load the shapefile that is used below as the study-area boundary for the spatial filter.
shapefile_path = 'hildesheim_merged.shp'
gdf_shape = gpd.read_file(shapefile_path)
gdf_shape

# Filter OSM data to the study area

In [None]:
# Create Point geometries from the 'lon' / 'lat' columns.
# gpd.points_from_xy builds all points in one vectorized call instead of running
# a Python lambda once per row through DataFrame.apply.
df_csv['location'] = gpd.points_from_xy(df_csv['lon'], df_csv['lat'])
df_csv

In [None]:
# Create a GeoDataFrame whose active geometry is the 'location' column.
# The CRS is passed as an authority string (the same form this notebook uses when
# tagging the cluster shapefile) instead of going through fiona.crs.from_epsg,
# which is deprecated in recent fiona releases.
gdf = gpd.GeoDataFrame(df_csv, geometry='location', crs='EPSG:4326')
gdf

In [None]:
# Reproject the study-area shapefile into the CRS of the OSM point GeoDataFrame
# so that both inputs of the spatial join below share one coordinate system.
gdf_shape = gdf_shape.to_crs(gdf.crs)

In [None]:
# Spatial inner join: keep only the OSM points that intersect a study-area polygon.
# 'predicate' is the current name of the keyword formerly called 'op'
# (deprecated in GeoPandas 0.10 and removed later).
gdf_inside = gpd.sjoin(gdf, gdf_shape, how='inner', predicate='intersects')
gdf_inside

In [None]:
# Remove the two columns contributed by the right-hand frame of the spatial join.
gdf_inside = gdf_inside.drop(columns=['index_right', 'id_right'])
gdf_inside

In [None]:
# Consolidated version of the step-by-step cells above:
# load the OSM points, load the study area, and keep only the points inside it.

# Input OSM data
csv = "osm_data.csv"
df_csv = pd.read_csv(csv)

# Input study area shapefile
shapefile_path = 'hildesheim_merged.shp'
gdf_shape = gpd.read_file(shapefile_path)

# Create Point geometries from the 'lon' / 'lat' columns of the OSM data
# (vectorized; replaces a per-row apply + lambda).
df_csv['location'] = gpd.points_from_xy(df_csv['lon'], df_csv['lat'])

# Convert the DataFrame to a GeoDataFrame via the Point location.
# The CRS is given as an authority string rather than via the deprecated
# fiona.crs.from_epsg helper.
gdf = gpd.GeoDataFrame(df_csv, geometry='location', crs='EPSG:4326')

# Reproject the shapefile to match the CRS of the GeoDataFrame
gdf_shape = gdf_shape.to_crs(gdf.crs)

# Perform spatial join to keep only rows that are inside the shapefile
# ('predicate' replaces the deprecated 'op' keyword).
gdf_inside = gpd.sjoin(gdf, gdf_shape, how='inner', predicate='intersects')

# Drop the join bookkeeping columns, exactly as the step-by-step cell does.
# Without this, re-creating gdf_inside here brings 'index_right' back, and that
# column name clashes with the one the later cluster join needs to create.
gdf_inside = gdf_inside.drop(columns=['index_right', 'id_right'])

# assign cluster

In [None]:
# Input cluster shapefile as GeoDataFrame
shapefile_path_cluster = 'desd-4-landkreis-hildesheim-1663168323029-shapes.shp'
gdf_shape_cluster = gpd.read_file(shapefile_path_cluster)

# Label the geometries as EPSG:4326 without transforming any coordinates.
# set_crs(..., allow_override=True) has the same effect as assigning to .crs,
# but is the supported way to replace a CRS that may already be set.
gdf_shape_cluster = gdf_shape_cluster.set_crs('EPSG:4326', allow_override=True)

# Reproject the shapefile to match the CRS of the filtered OSM points
gdf_shape_cluster = gdf_shape_cluster.to_crs(gdf_inside.crs)

# Calculate centroids and store them in a new 'centroid' column.
# NOTE(review): the OSM points are tagged EPSG:4326, so this is presumably computed
# in degrees; GeoPandas warns that such centroids can be inaccurate — confirm this
# is acceptable or compute them in a projected CRS.
gdf_shape_cluster['centroid'] = gdf_shape_cluster.centroid

gdf_shape_cluster

In [None]:
# Input cluster CSV
# NOTE(review): the CSV is read with gpd.read_file rather than pd.read_csv. The later
# cleanup of 'geometry_x' / 'geometry_y' suggests this frame carries its own
# 'geometry' column — confirm that reading it as a geo file is intended.
csv_cluster = 'desd-4-landkreis-hildesheim-1663168323029-shapes.csv'
gdf_csv_cluster = gpd.read_file(csv_cluster)
gdf_csv_cluster

In [None]:
# Merge the cluster shapes with the cluster CSV row-by-row on their indices
# (inner join: only index values present in both frames are kept).
# NOTE(review): this pairs rows purely by position/index, so it assumes the shapefile
# and the CSV list the clusters in the same order — verify.
gdf_cluster = gdf_shape_cluster.merge(gdf_csv_cluster, left_index=True, right_index=True, how='inner')
gdf_cluster

In [None]:
# Consolidated version of the cluster-loading cells above.

# Input cluster shapefile as GeoDataFrame
shapefile_path_cluster = 'desd-4-landkreis-hildesheim-1663168323029-' \
                         'shapes.shp'
gdf_shape_cluster = gpd.read_file(shapefile_path_cluster)

# Label the geometries as EPSG:4326 without transforming any coordinates.
# set_crs(..., allow_override=True) has the same effect as assigning to .crs,
# but is the supported way to replace a CRS that may already be set.
gdf_shape_cluster = gdf_shape_cluster.set_crs('EPSG:4326', allow_override=True)

# Reproject the shapefile to match the CRS of the filtered OSM points
gdf_shape_cluster = gdf_shape_cluster.to_crs(gdf_inside.crs)

# Calculate centroids and create new column 'centroid' in GeoDataFrame
gdf_shape_cluster['centroid'] = gdf_shape_cluster.centroid

# Input cluster CSV
csv_cluster = 'desd-4-landkreis-hildesheim-1663168323029-shapes.csv'
gdf_csv_cluster = gpd.read_file(csv_cluster)

# Merge the GeoDataFrame and DataFrame on their indices
gdf_cluster = gdf_shape_cluster.merge(gdf_csv_cluster, left_index=True,
                                      right_index=True, how='inner')

# The spatial join with the OSM data is intentionally NOT done in this cell.
# At this point the merged frame only has 'geometry_x' / 'geometry_y' and no active
# 'geometry' column, so gpd.sjoin cannot use it. The following cells restore the
# geometry column first and then perform the join that defines gdf_joined.

In [None]:
# Restore a single 'geometry' column after the merge: discard 'geometry_y',
# rename 'geometry_x' back to 'geometry', and wrap the result in a GeoDataFrame
# that uses that column as its geometry.
gdf_cluster = gpd.GeoDataFrame(
    gdf_cluster
    .drop(columns=['geometry_y'])
    .rename(columns={'geometry_x': 'geometry'}),
    geometry='geometry',
)

gdf_cluster

In [None]:
# Spatial join: attach to every OSM point the attributes of the cluster polygon it
# lies within. 'predicate' is the current name of the keyword formerly called 'op'
# (deprecated in GeoPandas 0.10 and removed later).
gdf_joined = gpd.sjoin(gdf_inside, gdf_cluster, predicate='within')
gdf_joined

# Clean Data

In [None]:
# Show the column labels of the joined frame.
gdf_joined.columns

## filter duplicates

In [None]:
# Flag every row whose 'id_left' has already appeared further up;
# keep='first' leaves the first occurrence of each id unflagged.
duplicates_mask = gdf_joined.duplicated(subset=['id_left'], keep='first')

# Rows to keep are exactly the ones that were not flagged.
non_duplicates = ~duplicates_mask

# Select the unflagged rows: one row per 'id_left'.
gdf_filtered = gdf_joined.loc[non_duplicates]

gdf_filtered

## go through each subcategory

### shops

supermarket

In [None]:
# Preview: rows whose subcategory is 'supermarket'.
gdf_filtered_supermarkets = gdf_filtered.loc[
    gdf_filtered['subcategory'].eq('supermarket')
]
gdf_filtered_supermarkets

In [None]:
# Number of supermarket rows per 'brand' value.
gdf_filtered_supermarkets.groupby('brand').size()

rest of shops

In [None]:
# Preview: rows in category 'shop' that are not supermarkets.
gdf_filtered_shop = gdf_filtered.loc[
    gdf_filtered['category'].eq('shop')
    & gdf_filtered['subcategory'].ne('supermarket')
]
gdf_filtered_shop

drop rows of insignificant data entries and/or subcategories

In [None]:
# Remove the rows with index labels 181 and 188.
# NOTE(review): these labels are hard-coded, so they presumably depend on the row
# order of the input CSV — re-check them whenever the input data changes.
gdf_filtered = gdf_filtered.drop(index=[181, 188])

map mall to supermarket, so there are fewer subcategories

In [None]:
# Set the subcategory of the row with index label 192 to 'supermarket'.
# NOTE(review): the label is hard-coded — verify it still points at the intended
# row when the input data changes.
gdf_filtered.at[192, 'subcategory'] = 'supermarket'

## amenities

In [None]:
# Preview: rows in category 'amenity'.
gdf_filtered_amenities = gdf_filtered.loc[gdf_filtered['category'].eq('amenity')]
gdf_filtered_amenities

In [None]:
# Number of amenity rows per subcategory.
gdf_filtered_amenities.groupby('subcategory').size()

kindergarten

In [None]:
# Preview: amenity rows whose subcategory is 'kindergarten'.
gdf_filtered_amenities_kg = gdf_filtered_amenities.loc[
    gdf_filtered_amenities['subcategory'].eq('kindergarten')
]
gdf_filtered_amenities_kg

drop some kindergarten

In [None]:
# Remove two rows by their hard-coded index labels ...
gdf_filtered = gdf_filtered.drop(index=[432, 360])

# ... then remove every kindergarten row that has no name.
unnamed_kindergartens = (
    gdf_filtered['subcategory'].eq('kindergarten')
    & gdf_filtered['name_left'].isna()
)
gdf_filtered = gdf_filtered.drop(index=gdf_filtered.loc[unnamed_kindergartens].index)

schools

In [None]:
# Preview: amenity rows whose subcategory is 'school'.
gdf_filtered_amenities_sh = gdf_filtered_amenities.loc[
    gdf_filtered_amenities['subcategory'].eq('school')
]
gdf_filtered_amenities_sh

drop some schools

In [None]:
# Remove a hard-coded list of rows by index label ...
gdf_filtered = gdf_filtered.drop(
    index=[272, 225, 222, 221, 223, 224, 227, 237, 259, 206, 280, 301, 293]
)

# ... then remove every school row that has no name.
unnamed_schools = (
    gdf_filtered['subcategory'].eq('school')
    & gdf_filtered['name_left'].isna()
)
gdf_filtered = gdf_filtered.drop(index=gdf_filtered.loc[unnamed_schools].index)

rest of amenities

In [None]:
# Preview: amenity rows that are neither kindergartens nor schools.
gdf_filtered_amenities_rest = gdf_filtered_amenities.loc[
    ~gdf_filtered_amenities['subcategory'].isin(['kindergarten', 'school'])
]
gdf_filtered_amenities_rest

In [None]:
# Remove four rows by their hard-coded index labels.
gdf_filtered = gdf_filtered.drop(index=[514, 518, 525, 506])

# Relabel every 'college' row as 'university'.
is_college = gdf_filtered['subcategory'].eq('college')
gdf_filtered.loc[is_college, 'subcategory'] = 'university'

### leisure

In [None]:
# Preview: rows in category 'leisure'.
gdf_filtered_leisure = gdf_filtered.loc[gdf_filtered['category'].eq('leisure')]
gdf_filtered_leisure

In [None]:
# Number of leisure rows per subcategory.
gdf_filtered_leisure.groupby('subcategory').size()

sports centre

In [None]:
# Preview: leisure rows whose subcategory is 'sports_centre'.
gdf_filtered_leisure_sc = gdf_filtered_leisure.loc[
    gdf_filtered_leisure['subcategory'].eq('sports_centre')
]
gdf_filtered_leisure_sc

In [None]:
# Drop sports centres whose name contains any of these keywords (case-insensitive).
keywords = ['reit', 'pferd', 'integrau', 'halle']
# Alternation pattern: matches if any one keyword occurs in the name.
pattern = '|'.join(keywords)

is_sports_centre = gdf_filtered['subcategory'] == 'sports_centre'
# na=False makes rows without a name count as "no match" explicitly, so the mask is
# strictly boolean instead of relying on NaN being coerced when combined with '&'.
name_matches = gdf_filtered['name_left'].str.contains(pattern, case=False, na=False)

indices_to_remove = gdf_filtered[is_sports_centre & name_matches].index
gdf_filtered = gdf_filtered.drop(indices_to_remove)

In [None]:
# Remove every sports-centre row that has no name.
unnamed_sports_centres = (
    gdf_filtered['subcategory'].eq('sports_centre')
    & gdf_filtered['name_left'].isna()
)
gdf_filtered = gdf_filtered.drop(index=gdf_filtered.loc[unnamed_sports_centres].index)

swimming pool

In [None]:
# Preview: leisure rows whose subcategory is 'swimming_pool'.
gdf_filtered_leisure_sp = gdf_filtered_leisure.loc[
    gdf_filtered_leisure['subcategory'].eq('swimming_pool')
]
gdf_filtered_leisure_sp

In [None]:
# Discard all swimming-pool rows.
gdf_filtered = gdf_filtered.loc[gdf_filtered['subcategory'].ne('swimming_pool')]

In [None]:
# Preview: leisure rows that are neither swimming pools nor sports centres.
gdf_filtered_leisure_rest = gdf_filtered_leisure.loc[
    ~gdf_filtered_leisure['subcategory'].isin(['swimming_pool', 'sports_centre'])
]
gdf_filtered_leisure_rest

In [None]:
# Remove rows by their hard-coded index labels.
# NOTE(review): the original list named label 622 twice. The duplicate had no effect
# and is removed here; if the second 622 was a typo for a different row, add that
# label to the list.
gdf_filtered = gdf_filtered.drop([609, 604, 622, 606])

# Relabel every 'fitness_centre' row as 'sports_centre'.
gdf_filtered.loc[gdf_filtered['subcategory'] == 'fitness_centre', 'subcategory'] = 'sports_centre'

### public transport

In [None]:
# Preview: rows in category 'public_transport'.
gdf_filtered_pt = gdf_filtered.loc[gdf_filtered['category'].eq('public_transport')]
gdf_filtered_pt

In [None]:
# Number of public-transport rows per subcategory.
gdf_filtered_pt.groupby('subcategory').size()

stop_position

In [None]:
# Preview: public-transport rows whose subcategory is 'stop_position'.
gdf_filtered_pt_sp = gdf_filtered_pt.loc[
    gdf_filtered_pt['subcategory'].eq('stop_position')
]
gdf_filtered_pt_sp

In [None]:
# Drop stop_position rows whose 'name_left' has already appeared earlier in the frame.
# NOTE(review): duplicated() is evaluated over ALL rows of gdf_filtered, not only the
# stop_position rows, so a stop is also dropped when an earlier row of any other
# subcategory carries the same name — confirm this is intended.
indices_to_remove= gdf_filtered[((gdf_filtered['subcategory'] == 'stop_position') & gdf_filtered.duplicated(subset='name_left', keep='first'))].index
gdf_filtered = gdf_filtered.drop(indices_to_remove)

In [None]:
# Remove every stop_position row that has no name.
unnamed_stops = (
    gdf_filtered['subcategory'].eq('stop_position')
    & gdf_filtered['name_left'].isna()
)
gdf_filtered = gdf_filtered.drop(index=gdf_filtered.loc[unnamed_stops].index)

station

In [None]:
# Preview: public-transport rows whose subcategory is 'station'.
gdf_filtered_pt_st = gdf_filtered_pt.loc[gdf_filtered_pt['subcategory'].eq('station')]
gdf_filtered_pt_st

In [None]:
# Remove every station row that has no name ...
unnamed_stations = (
    gdf_filtered['subcategory'].eq('station')
    & gdf_filtered['name_left'].isna()
)
gdf_filtered = gdf_filtered.drop(index=gdf_filtered.loc[unnamed_stations].index)

# ... then remove one more row by its hard-coded index label.
gdf_filtered = gdf_filtered.drop(index=[1327])

# result

In [None]:
# Display the frame after all manual cleaning steps above.
gdf_filtered

In [None]:
# Number of remaining rows per subcategory.
size_subcat = gdf_filtered.groupby('subcategory').size()
size_subcat

map mall to supermarket

In [None]:
# Relabel every 'mall' row as 'supermarket'.
is_mall = gdf_filtered['subcategory'].eq('mall')
gdf_filtered.loc[is_mall, 'subcategory'] = 'supermarket'

In [None]:
# Recount rows per subcategory after the mall -> supermarket remapping.
size_subcat = gdf_filtered.groupby('subcategory').size()
size_subcat

In [None]:
# Inspect the rows whose subcategory is 'wholesale'.
gdf_filtered.loc[gdf_filtered['subcategory'].eq('wholesale')]

In [None]:
# Remove the row with the hard-coded index label 196.
gdf_filtered = gdf_filtered.drop(index=[196])

In [None]:
# Recount rows per subcategory after dropping row 196.
size_subcat = gdf_filtered.groupby('subcategory').size()
size_subcat

map university to school

In [None]:
# Relabel every 'university' row as 'school'.
is_university = gdf_filtered['subcategory'].eq('university')
gdf_filtered.loc[is_university, 'subcategory'] = 'school'

map department_store and diy to supermarket

In [None]:
# Relabel every 'department_store' and 'doityourself' row as 'supermarket'.
folds_into_supermarket = gdf_filtered['subcategory'].isin(
    ['department_store', 'doityourself']
)
gdf_filtered.loc[folds_into_supermarket, 'subcategory'] = 'supermarket'

map station to stop_position

In [None]:
# Relabel every 'station' row as 'stop_position'.
is_station = gdf_filtered['subcategory'].eq('station')
gdf_filtered.loc[is_station, 'subcategory'] = 'stop_position'

In [None]:
# Final count of rows per subcategory after all remappings.
size_subcat = gdf_filtered.groupby('subcategory').size()
size_subcat

# aggregate subcats to cluster

In [None]:
# Distinct values of 'name_right', the column used below as the cluster key.
gdf_filtered['name_right'].unique()

In [None]:
# Group by 'name_right' and 'subcategory', and calculate the size of each group
grouped = gdf_filtered.groupby(['name_right', 'subcategory']).size().reset_index(name='count')

# Pivot the table to create the desired structure
pivot_table = grouped.pivot_table(index='name_right', columns='subcategory', values='count', fill_value=0)

# Reset the index of the pivot table
pivot_table.reset_index(inplace=True)

# Display the pivot table
pivot_table

In [None]:
# Total of the 'kindergarten' column across all clusters.
pivot_table['kindergarten'].sum()

In [None]:
# Total of the 'supermarket' column across all clusters.
pivot_table['supermarket'].sum()

In [None]:
# Reset the index of the pivot table
pivot_table.reset_index(inplace=True)

# Convert the pivot table back into a DataFrame
df_result = pivot_table.copy()
df_result

In [None]:
# Remove the leftover index columns.
# columns= is required: without it drop() looks for ROW labels named 'level_0' and
# 'index' and raises KeyError. errors='ignore' tolerates either column being absent
# ('level_0' only appears when the index was reset more than once).
df_result = df_result.drop(columns=['level_0', 'index'], errors='ignore')
df_result

In [None]:
# Write the per-cluster counts to disk without the DataFrame index.
df_result.to_csv('osm_data_cleaned.csv', index=False)

In [None]:
# Read back the file written in the previous cell.
df = pd.read_csv('osm_data_cleaned.csv')
df

In [None]:
# Remove leftover index columns if they made it into the CSV.
# errors='ignore' keeps this working on a clean top-to-bottom run, where 'level_0'
# (and possibly 'index') is not present; previously that case raised KeyError.
df = df.drop(columns=['level_0', 'index'], errors='ignore')
df

In [None]:
# Give the columns their final export names.
df = df.rename(columns={
    'name_right': 'cluster',
    'kindergarten': 'kindergartens_count',
    'school': 'schools_count',
    'sports_centre': 'sportsCentres_count',
    'stop_position': 'publicTransportStops_count',
    'supermarket': 'supermarkets_count',
})
df

In [None]:
# Preview the ID extraction: take the second '-'-separated piece of the cluster
# name and strip its last character.
df['cluster'].map(lambda name: name.split('-')[1][:-1])

In [None]:
# Store the extracted ID (second '-'-separated piece of the cluster name, minus its
# last character) in a new 'clusterID' column.
df['clusterID'] = df['cluster'].map(lambda name: name.split('-')[1][:-1])
df

In [None]:
# Overwrite the cleaned CSV with the renamed columns and the added 'clusterID' column.
df.to_csv('osm_data_cleaned.csv', index=False)