In [1]:
# Reload imported project modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2
# NOTE(review): machine-specific absolute path; the relative './dbs/...' paths used by later cells
# are resolved against this working directory.
%cd C:\MAD4AG
%matplotlib inline

C:\MAD4AG


In [2]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from sklearn.cluster import DBSCAN
import geopandas
import folium
from folium.plugins import HeatMapWithTime
import warnings

# NOTE(review): suppresses every warning for the rest of the session, not only known-noisy ones.
warnings.filterwarnings('ignore')


  shapely_geos_version, geos_capi_version_string


In [3]:
# Location of the stops table, relative to the working directory.
file_name = './dbs/big_geodata_se/stops.parquet'

# Read the stops file into a DataFrame.
df = pd.read_parquet(file_name)

# DBSCAN to detect jumps


In [4]:
def add_individual_clusters(data, eps_km=200, min_samples=2):
    """Label the rows of one group with a DBSCAN cluster id.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single group. Must contain 'lat' and 'lng' columns; they are
        passed through ``np.radians``, so they are assumed to be decimal degrees.
    eps_km : float, default 200
        DBSCAN neighbourhood radius in kilometres.
    min_samples : int, default 2
        DBSCAN ``min_samples``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with a 'cluster' column added.
        The value is the DBSCAN label + 1, so DBSCAN noise (label -1) becomes 0
        and real clusters are numbered from 1.
    """
    # represent points consistently as (lat, lon), in radians as the haversine metric requires
    coords = np.radians(data[['lat', 'lng']].values)

    # number of kilometers in one radian
    kms_per_radian = 6371.0088

    # convert the radius from kilometers to radians for use by haversine
    epsilon = eps_km / kms_per_radian

    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(coords)
    data['cluster'] = db.labels_ + 1
    return data




In [5]:
# Register DataFrame/GroupBy.progress_apply so the per-individual pass shows a progress bar.
tqdm.pandas()

# group_keys=False keeps the original row index. Without it, pandas >= 2.0 prepends 'uid' as an
# index level, and the later groupby('uid') then fails because 'uid' is both an index level and a column.
df = df.groupby('uid', group_keys=False).progress_apply(add_individual_clusters)

print('The number of people: ', df['uid'].nunique())
# cluster > 1 means DBSCAN found at least a second cluster for that individual.
print('The number of people having jump: ', df.loc[df['cluster'] > 1, 'uid'].nunique())

  0%|          | 0/322919 [00:00<?, ?it/s]

The number of people:  322919
The number of people having jump:  115685


In [6]:
# Treat every (individual, jump cluster) pair as its own id, "<uid>-<cluster>",
# then discard the jump label, which is no longer needed as a separate column.
df = (
    df.assign(uid=df['uid'] + '-' + df['cluster'].astype(str))
      .drop(columns=['cluster'])
)

# Remove holiday and weekend stops

In [7]:
# Keep only stops that are not flagged as a holiday and are flagged as a weekday.
df = df[(df['holiday_s'] != 1) & (df['weekday_s'] == 1)]

# DBSCAN to detect clusters

In [8]:
def add_individual_clusters(data, eps_km=0.1, min_samples=1):
    """Label the rows of one group with a DBSCAN cluster id.

    NOTE(review): this redefines the function of the same name used for jump
    detection earlier in the notebook, with a much smaller default radius.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows of a single group. Must contain 'lat' and 'lng' columns; they are
        passed through ``np.radians``, so they are assumed to be decimal degrees.
    eps_km : float, default 0.1
        DBSCAN neighbourhood radius in kilometres.
    min_samples : int, default 1
        DBSCAN ``min_samples``.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with a 'cluster' column added.
        The value is the DBSCAN label + 1, so clusters are numbered from 1
        (a DBSCAN noise label of -1 would become 0).
    """
    # represent points consistently as (lat, lon), in radians as the haversine metric requires
    coords = np.radians(data[['lat', 'lng']].values)

    # number of kilometers in one radian
    kms_per_radian = 6371.0088

    # convert the radius from kilometers to radians for use by haversine
    epsilon = eps_km / kms_per_radian

    db = DBSCAN(eps=epsilon, min_samples=min_samples, algorithm='ball_tree', metric='haversine').fit(coords)
    data['cluster'] = db.labels_ + 1
    return data


In [9]:
# Register DataFrame/GroupBy.progress_apply so the per-individual pass shows a progress bar.
tqdm.pandas()

# group_keys=False keeps the original row index. Without it, pandas >= 2.0 prepends 'uid' as an
# index level, and later groupby('uid') calls then fail because 'uid' is both an index level and a column.
df = df.groupby('uid', group_keys=False).progress_apply(add_individual_clusters)

  0%|          | 0/430111 [00:00<?, ?it/s]

In [10]:
# Stop clusters of one individual, looking at the id with the "-1" jump suffix.
df.loc[df['uid'].eq('9b0b36c9-b1c0-444e-8708-e2751915a587-1')]

Unnamed: 0,uid,localtime,lat,lng,loc,h_s,dur,holiday_s,weekday_s,seq,cluster
17673757,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-10-16 17:58:42,67.846368,20.257187,17,17.966667,179.983333,0,1,68,1
17673758,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-10-16 23:33:36,67.849996,20.229495,2,23.55,279.166667,0,1,69,2
17673759,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-10-17 10:55:08,67.85,20.2167,19,10.916667,419.5,0,1,70,3
17673762,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-10-21 10:49:35,67.85,20.2167,19,10.816667,32.066667,0,1,73,3
17673765,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-10-22 10:12:58,67.85,20.2167,19,10.2,30.55,0,1,76,3
17673767,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-10-22 12:30:29,67.85,20.2167,19,12.5,34.683333,0,1,78,3
17673768,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-10-23 15:46:57,67.85,20.2167,19,15.766667,61.516667,0,1,80,3
17673772,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-10-24 11:21:21,67.85,20.2167,19,11.35,306.45,0,1,84,3
17673779,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-11-06 18:40:52,67.85,20.2167,19,18.666667,179.983333,0,1,93,3
17673782,9b0b36c9-b1c0-444e-8708-e2751915a587-1,2019-11-25 10:07:32,67.85,20.2167,19,10.116667,378.533333,0,1,96,3


In [11]:
# Same individual, id with the "-2" jump suffix.
df.loc[df['uid'].eq('9b0b36c9-b1c0-444e-8708-e2751915a587-2')]

Unnamed: 0,uid,localtime,lat,lng,loc,h_s,dur,holiday_s,weekday_s,seq,cluster


## Drop individuals with only one cluster

In [12]:
# Number of distinct clusters observed for each individual.
cluster_count = (
    df.groupby('uid')['cluster']
      .nunique()
      .rename('cluster_count')
      .reset_index()
)

# Ids of individuals observed in more than one cluster.
ppl_with_more_cluster = cluster_count.loc[cluster_count['cluster_count'] > 1, 'uid'].tolist()

# Keep only the stops of those individuals.
df = df[df['uid'].isin(ppl_with_more_cluster)]

## Intersect MAD data and DeSO data

In [13]:
# Turn every stop into a point geometry (x = longitude, y = latitude) in EPSG:4326.
gdf = geopandas.GeoDataFrame(
    df,
    geometry=geopandas.points_from_xy(x=df['lng'], y=df['lat']),
    crs="EPSG:4326",
)

In [14]:
# Load the DeSO zone polygons and show the CRS they are stored in.
DeSO = geopandas.read_file('C:/Synthetic_population_new/caglar/synthetic_sweden/input/deso_statistik_shp/Bef_Kon_region.shp')
print(DeSO.crs)

# Reproject to EPSG:4326, the CRS of the stop points, and show the CRS again.
DeSO = DeSO.to_crs(4326)
print(DeSO.crs)

PROJCS["SWEREF99 TM",GEOGCS["SWEREF99",DATUM["SWEREF99",SPHEROID["GRS 1980",6378137,298.257222101,AUTHORITY["EPSG","7019"]],AUTHORITY["EPSG","6619"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",15],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","3006"]]
epsg:4326


In [15]:
# Left spatial join: attach the DeSO code of the zone each stop falls in,
# and discard the helper column that sjoin adds for the matched polygon's index.
gdf = gdf.sjoin(DeSO[['Deso', 'geometry']], how="left").drop(columns="index_right")

# Stops without a matching zone are counted as outside Sweden and removed.
print('Stops outside Sweden: ', gdf['Deso'].isna().sum())
gdf = gdf[gdf['Deso'].notna()]

# Back to a plain DataFrame without the geometry column.
df = pd.DataFrame(gdf.drop(columns='geometry'))


Stops outside Sweden:  26235


## Drop individuals with only one cluster (repeated after removing the stops outside Sweden)

In [16]:
# Recount the distinct clusters per individual on the current stops.
cluster_count = df.groupby('uid')['cluster'].nunique().rename('cluster_count').reset_index()

# Individuals that still have more than one cluster.
has_several = cluster_count['cluster_count'].gt(1)
ppl_with_more_cluster = cluster_count['uid'][has_several].tolist()

# Keep only the stops of those individuals.
df = df[df['uid'].isin(ppl_with_more_cluster)]

## Calculate mean of clusters and snap to the closest building

In [17]:

# Cluster centre = mean latitude and longitude of the stops in each (uid, cluster) pair.
df_cluster = df.groupby(['uid', 'cluster'], as_index=False)[['lat', 'lng']].mean()


## Snap the cluster centre to the closest building.

In [18]:
# building_point = geopandas.read_file(f'C:/Synthetic_population_new/caglar/synthetic_sweden/input/Geodatabase.gdb', layer= 'Buildings_SpatJ_SU_1km_DESO_poi_wgs84')
#
# building_point = pd.DataFrame(building_point[['TARGET_FID','ANDAMAL_1',
#        'Deso', 'KNKOD',  'POINT_X_sweref99',
#        'POINT_Y_sweref99', 'POINT_X', 'POINT_Y']])
# building_point = building_point[building_point.ANDAMAL_1 != 699]
# building_point.rename(columns={'POINT_Y': 'lat', 'POINT_X': 'lng'}, inplace=True)

In [19]:
#HERE
# df_cluster.to_parquet(f'./dbs/intermediate/df_cluster.parquet')
# building_point.to_parquet(f'./dbs/intermediate/building_point.parquet')


In [20]:
# df_cluster = pd.read_parquet(f'./dbs/intermediate/df_cluster.parquet')
# Load the cached building table from the intermediate folder.
building_point = pd.read_parquet('./dbs/intermediate/building_point.parquet')


In [21]:
from lib.nearest_point import nearest_neighbor

In [22]:
# Look up, for the cluster centres in df_cluster, the nearest entry of building_point.
# NOTE(review): nearest_neighbor comes from lib.nearest_point and is not visible here. The following
# cells assume it returns the matched building rows plus a 'distance' column, aligned row-for-row
# with df_cluster's index (they select 'distance' and index-join the result) — confirm.
closest_build = nearest_neighbor(df_cluster, building_point, return_dist=True)


In [23]:
# Keep the building attributes of interest and prefix the building coordinates
# so they do not clash with the cluster-centre 'lat'/'lng' columns.
building_columns = ['TARGET_FID', 'ANDAMAL_1', 'Deso', 'KNKOD', 'POINT_X_sweref99',
                    'POINT_Y_sweref99', 'lat', 'lng', 'distance']
closest_build = closest_build[building_columns].rename(
    columns={'lat': 'building_lat', 'lng': 'building_lng'}
)

# Index-based join: each cluster row receives the building row carrying the same index label.
df_cluster = df_cluster.join(closest_build)


In [24]:
# Reduce to the columns used downstream and mark the centre coordinates as cluster_lat / cluster_lng.
df_cluster = (
    df_cluster[['uid', 'cluster', 'lat', 'lng', 'TARGET_FID',
                'ANDAMAL_1', 'Deso', 'KNKOD', 'building_lng', 'building_lat', 'distance']]
    .rename(columns={'lat': 'cluster_lat', 'lng': 'cluster_lng'})
)



### If the closest building is more than 200 m from the cluster centre, keep the coordinates of the cluster centre; otherwise use the coordinates of the building.

In [25]:
#HERE

# Clusters whose closest building is within the threshold take the building's coordinates;
# all others keep their own centre.
# NOTE(review): 0.2 is presumably kilometres (200 m) — confirm the unit returned by nearest_neighbor.
within_threshold = df_cluster['distance'] <= 0.2

df_cluster['cluster_lat'] = np.where(within_threshold, df_cluster['building_lat'], df_cluster['cluster_lat'])
df_cluster['cluster_lng'] = np.where(within_threshold, df_cluster['building_lng'], df_cluster['cluster_lng'])

# Clusters that are too far from any building get ANDAMAL_1 = 0.
# Strict '>' (the original used '>=') so that a cluster at exactly the threshold, which IS snapped
# to the building above, also keeps that building's ANDAMAL_1 code.
df_cluster['ANDAMAL_1'] = np.where(df_cluster['distance'] > 0.2, 0, df_cluster['ANDAMAL_1'])

In [26]:
# Share of clusters per ANDAMAL_1 code (0 = no building close enough).
df_cluster['ANDAMAL_1'].value_counts(normalize=True)

133    0.230937
130    0.228191
499    0.160465
0      0.083065
399    0.060628
799    0.034647
299    0.034392
319    0.027255
304    0.017052
131    0.014822
132    0.014560
240    0.011195
318    0.009246
247    0.008453
135    0.008140
317    0.006458
313    0.006379
253    0.005812
310    0.005229
320    0.004017
307    0.003062
599    0.002518
199    0.002365
311    0.002234
242    0.002126
301    0.002123
249    0.001852
246    0.001851
321    0.001376
315    0.001289
303    0.001089
314    0.001072
309    0.000955
302    0.000936
243    0.000907
306    0.000652
324    0.000638
248    0.000454
252    0.000297
308    0.000266
316    0.000220
322    0.000197
305    0.000161
312    0.000160
251    0.000127
250    0.000086
245    0.000028
241    0.000012
244    0.000003
Name: ANDAMAL_1, dtype: float64

In [27]:

# Attach the cluster-level attributes to every stop of that (uid, cluster) pair.
df = df.merge(
    df_cluster[['uid', 'cluster', 'cluster_lat', 'cluster_lng',
                'building_lat', 'building_lng', 'ANDAMAL_1', 'distance']],
    on=['uid', 'cluster'],
    how='left',
)

In [28]:
#HERE
# Persist the enriched stops table to the intermediate folder.
df.to_parquet('./dbs/intermediate/stops_1_new.parquet')
