In [1]:
# Reload edited project modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2
# Work from the project root so the relative './dbs/...' paths used below resolve.
%cd C:\MAD4AG
# Render matplotlib figures inline in the notebook.
%matplotlib inline

C:\MAD4AG


In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.cluster import DBSCAN
import geopandas
import seaborn as sns
import collections
import folium
from folium.plugins import HeatMapWithTime
import matplotlib.pyplot as plt
import warnings

# Silence every warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation and SettingWithCopy warnings.
warnings.filterwarnings('ignore')

# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and later
# releases removed the old names, so use whichever name this install provides.
plt.style.use(
    'seaborn-v0_8-white' if 'seaborn-v0_8-white' in plt.style.available else 'seaborn-white'
)

  shapely_geos_version, geos_capi_version_string


In [3]:
# Input parquet file, relative to the working directory set by %cd above.
file_name = './dbs/big_geodata_se/stops.parquet'

# Read the whole file into a single DataFrame.
df = pd.read_parquet(file_name)

In [4]:
# gdf = geopandas.GeoDataFrame(
#     df, geometry=geopandas.points_from_xy(df.lng, df.lat), crs="EPSG:4326")
#

# DBSCAN


In [5]:
def add_individual_clusters(data, eps_km=0.1, min_samples=1):
    """Label each row of ``data`` with a spatial DBSCAN cluster id.

    Rows are clustered on haversine (great-circle) distance computed from the
    ``lat`` and ``lng`` columns, which are converted from degrees to radians.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to cluster; must contain ``lat`` and ``lng`` columns in degrees.
    eps_km : float, default 0.1
        DBSCAN neighbourhood radius in kilometres.
    min_samples : int, default 1
        DBSCAN ``min_samples``. With the default of 1 every point is a core
        point, so no row is labelled as noise.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with an added integer ``cluster`` column holding the
        DBSCAN labels shifted by one, so clusters start at 1 (DBSCAN's noise
        label -1, only possible when ``min_samples > 1``, becomes 0).
    """
    # Work on a copy: pandas documents that mutating the group handed to
    # ``groupby(...).apply`` can produce unexpected results.
    labelled = data.copy()

    # DBSCAN cannot be fitted on zero samples; return an empty label column.
    if len(labelled) == 0:
        labelled['cluster'] = np.empty(0, dtype=np.intp)
        return labelled

    # represent points consistently as (lat, lon), the order haversine expects
    coords = labelled[['lat', 'lng']].to_numpy()

    # define the number of kilometers in one radian (mean Earth radius)
    kms_per_radian = 6371.0088

    # haversine distances are expressed in radians, so convert epsilon as well
    epsilon = eps_km / kms_per_radian

    db = DBSCAN(
        eps=epsilon,
        min_samples=min_samples,
        algorithm='ball_tree',
        metric='haversine',
    ).fit(np.radians(coords))

    # shift the 0-based DBSCAN labels so that cluster ids start at 1
    labelled['cluster'] = db.labels_ + 1
    return labelled


In [6]:
# Register ``progress_apply`` on pandas objects so the per-user loop shows a bar.
tqdm.pandas()

# Cluster every user's rows independently. ``group_keys=False`` keeps the
# original row index; otherwise pandas >= 2.0 prepends 'uid' as an index level,
# which clashes with the existing 'uid' column in the later groupby/merge calls.
df = df.groupby('uid', group_keys=False).progress_apply(add_individual_clusters)

  0%|          | 0/322919 [00:00<?, ?it/s]

## Intersect MAD data and DeSO data

In [7]:
# Give every row an EPSG:4326 point geometry built from its lng/lat columns.
gdf = geopandas.GeoDataFrame(
    df,
    crs="EPSG:4326",
    geometry=geopandas.points_from_xy(df['lng'], df['lat']),
)

In [8]:
# DeSO zone polygons; the first print shows the CRS the shapefile is stored in.
DeSO = geopandas.read_file('C:/Synthetic_population_new/caglar/synthetic_sweden/input/deso_statistik_shp/Bef_Kon_region.shp')

print(DeSO.crs)

# Reproject to EPSG:4326 so the zones share the CRS of the point layer `gdf`.
DeSO = DeSO.to_crs(4326)
print(DeSO.crs)

PROJCS["SWEREF99 TM",GEOGCS["SWEREF99",DATUM["SWEREF99",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6619"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",15],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","3006"]]
epsg:4326


In [9]:
# Left spatial join: every point keeps its row and receives the code of the
# DeSO zone it matches; the helper index column added by sjoin is not needed.
gdf = gdf.sjoin(DeSO[['Deso', 'geometry']], how="left").drop(columns="index_right")

# Points without a matching zone have no DeSO code, i.e. they lie outside Sweden.
outside_sweden = gdf['Deso'].isna()
print('Data points outside Sweden: ', int(outside_sweden.sum()))

# Keep only the matched points and go back to a plain, geometry-free DataFrame.
gdf = gdf[~outside_sweden]

df = pd.DataFrame(gdf.drop('geometry', axis=1))


Data points outside Sweden:  115534


## Calculate mean of clusters and snap to the closest building

In [13]:

# Centre of each (uid, cluster) pair: the mean latitude and longitude of its rows.
df_cluster = (
    df.groupby(by=['uid', 'cluster'])[['lat', 'lng']]
    .mean()
    .reset_index()
)


## Snap the cluster centre to the closest building.

In [15]:
# Building point layer read from the project geodatabase.
building_point = geopandas.read_file(
    'C:/Synthetic_population_new/caglar/synthetic_sweden/input/Geodatabase.gdb',
    layer='Buildings_SpatJ_SU_1km_DESO_poi_wgs84',
)

# Keep only the attribute columns (no geometry) and expose POINT_X / POINT_Y
# under the same 'lng' / 'lat' names that the cluster table uses.
building_point = pd.DataFrame(
    building_point[['TARGET_FID', 'ANDAMAL_1', 'Deso', 'KNKOD',
                    'POINT_X_sweref99', 'POINT_Y_sweref99', 'POINT_X', 'POINT_Y']]
).rename(columns={'POINT_Y': 'lat', 'POINT_X': 'lng'})

In [14]:
# Cache both inputs of the nearest-building search so that step can be rerun
# from disk (see the commented-out read_parquet cell that follows).
# NOTE(review): a later cell writes the joined cluster table to this same
# df_cluster.parquet path, replacing this version of the file.
df_cluster.to_parquet('./dbs/intermediate/df_cluster.parquet')
building_point.to_parquet('./dbs/intermediate/building_point.parquet')


In [15]:
# df_cluster = pd.read_parquet(f'./dbs/intermediate/df_cluster.parquet')
# building_point = pd.read_parquet(f'./dbs/intermediate/building_point.parquet')


In [16]:
from lib.nearest_point import nearest_neighbor

In [17]:
# Match every cluster centre to its closest building point. The 'distance'
# column selected in the next cell presumably comes from return_dist=True —
# TODO confirm its unit and the row order/index against lib.nearest_point.
closest_build = nearest_neighbor(df_cluster, building_point, return_dist=True)


In [18]:
# Keep the matched building's attributes and prefix its coordinates with
# 'building_' so they do not collide with the cluster-centre lat/lng on join.
# rename() returns a new frame here: an in-place rename on a column slice
# triggers pandas' SettingWithCopyWarning.
closest_build = closest_build[
    ['TARGET_FID', 'ANDAMAL_1', 'Deso', 'KNKOD', 'POINT_X_sweref99',
     'POINT_Y_sweref99', 'lat', 'lng', 'distance']
].rename(columns={'lat': 'building_lat', 'lng': 'building_lng'})

# Index-aligned join — assumes nearest_neighbor returns one row per cluster with
# the same index as df_cluster (TODO confirm in lib.nearest_point).
df_cluster = df_cluster.join(closest_build)


In [19]:
# Final column layout of the cluster table (the SWEREF99 coordinates are
# dropped); the centre coordinates get a 'cluster_' prefix. rename() returns a
# new frame: an in-place rename on a column slice triggers pandas'
# SettingWithCopyWarning.
df_cluster = df_cluster[
    ['uid', 'cluster', 'lat', 'lng', 'TARGET_FID', 'ANDAMAL_1', 'Deso', 'KNKOD',
     'building_lng', 'building_lat', 'distance']
].rename(columns={'lat': 'cluster_lat', 'lng': 'cluster_lng'})

# NOTE(review): this replaces the df_cluster.parquet written before the
# nearest-building search with a table that has a different set of columns.
df_cluster.to_parquet('./dbs/intermediate/df_cluster.parquet')

### If the distance between the closest building and the cluster centre is more than 200 m, keep the coordinates of the cluster centre; otherwise use the coordinates of the building.

In [20]:
# Largest cluster-to-building distance at which a cluster is snapped to the
# building. 0.2 corresponds to the 200 m stated in the heading above, so the
# 'distance' column is presumably in kilometres — TODO confirm.
MAX_SNAP_DISTANCE = 0.2

within_reach = df_cluster['distance'] <= MAX_SNAP_DISTANCE
out_of_reach = df_cluster['distance'] > MAX_SNAP_DISTANCE

# Within the threshold (inclusive): replace the cluster centre by the building.
df_cluster['cluster_lat'] = np.where(within_reach, df_cluster['building_lat'], df_cluster['cluster_lat'])

df_cluster['cluster_lng'] = np.where(within_reach, df_cluster['building_lng'], df_cluster['cluster_lng'])

# Strictly beyond the threshold: the cluster keeps its own coordinates, so its
# ANDAMAL_1 is reset to 0. This used to be `>=`, which also zeroed ANDAMAL_1 for
# clusters at exactly the threshold even though they had just been snapped.
df_cluster['ANDAMAL_1'] = np.where(out_of_reach, 0, df_cluster['ANDAMAL_1'])

In [21]:

# Attach the cluster coordinates and ANDAMAL_1 to every row of df through its
# (uid, cluster) key; the left join keeps all rows of df.
df = df.merge(
    df_cluster[['uid', 'cluster', 'cluster_lat', 'cluster_lng', 'ANDAMAL_1']],
    how='left',
    on=['uid', 'cluster'],
)

In [23]:
# Persist the rows together with their cluster id, DeSO code, cluster
# coordinates and ANDAMAL_1.
df.to_parquet('./dbs/intermediate/stops.parquet')
