In [1]:
import pandas as pd
import scipy.spatial as spt
from datetime import datetime
from chinese_calendar import is_workday
from spatial_flow_clustering_fuc import init_bike_record_with_sfc_obj, get_near_record_uuid_list, \
    calculate_spatial_dissimilarity
from spatiotemporal_flow_clustering_fuc import init_bike_record_with_stfc_obj, calculate_temporal_similarity
from ruled_base_decision_tress_fuc import identify_candidate_commuting_flow, identify_transfer_commuting_flow, identify_user_commuting_category
from results_plot_fuc import plot_sfc_obj, plot_stfc_obj, plot_dcf_obj
from utils import get_distance

In [2]:
# Read sample_bike_records.csv.
# A forward-slash relative path is used so the file resolves on Windows, Linux and macOS alike
# and the literal contains no backslash escape sequence.
sample_bike_records_df = pd.read_csv('data/sample_bike_records.csv')

# Add the is_weekday field: the result of is_workday() applied to each record's parsed date
sample_bike_records_df['is_weekday'] = pd.to_datetime(sample_bike_records_df['date']).apply(is_workday)

# Get the uid for each user in sample_bike_records.csv
uid_list = sample_bike_records_df['uid'].unique()

# Calculate the number of activity weekdays for each user,
# i.e. the number of distinct dates on which the user has at least one weekday record
activity_weekdays_dict = {}
for uid in uid_list:
    each_user_weekday_record_df = sample_bike_records_df.query(f'uid == "{uid}" and is_weekday == True')
    activity_weekdays_dict[uid] = len(each_user_weekday_record_df['date'].unique())
activity_weekdays_dict

{'sample_user1': 94, 'sample_user2': 35}

In [3]:
# Extract the spatial flow clusters for each user, the spatial flow clustering process refers to Gao et al.(2020) https://doi.org/10.1109/ACCESS.2020.3040852
# Result layout: {uid: {sfc_id: cluster object}}, holding only the clusters that pass the size filter at the end of the loop body.
each_spatial_flow_cluster_dict = {}
for uid in uid_list:
    # Restrict to this user's weekday records and turn them into a list of per-row dicts
    each_user_weekday_record_df = sample_bike_records_df.query(f'uid == "{uid}" and is_weekday == True')
    each_user_weekday_dict = each_user_weekday_record_df.to_dict(orient='records')
    # Both returned mappings are indexed by record uuid below.
    # Presumably every record starts in its own single-record cluster -- TODO confirm in spatial_flow_clustering_fuc
    bike_record_dict, init_bike_record_with_sfc_dict = init_bike_record_with_sfc_obj(each_user_weekday_dict)
    bike_record_uuid_list = bike_record_dict.keys()

    for uuid in bike_record_uuid_list:
        # Cluster currently assigned to this record; earlier iterations may already have re-pointed it
        this_spatial_flow_cluster = init_bike_record_with_sfc_dict[uuid]
        near_record_uuid_list = get_near_record_uuid_list(bike_record_dict, uuid, this_spatial_flow_cluster.sfc_id)
        for near_uuid in near_record_uuid_list:
            near_spatial_flow_cluster = init_bike_record_with_sfc_dict[near_uuid]
            # Only compare records that currently belong to different clusters
            if this_spatial_flow_cluster.sfc_id != near_spatial_flow_cluster.sfc_id:
                flows_sd = calculate_spatial_dissimilarity(
                    this_spatial_flow_cluster, near_spatial_flow_cluster, _size_coefficient=0.3,
                    _max_circle_boundary_radius=200)
                # A spatial dissimilarity of at most 1 merges the neighbouring cluster into this one
                if flows_sd <= 1:
                    this_spatial_flow_cluster.add_flow(
                        near_spatial_flow_cluster.including_record_detail)
                    # Re-point every record of the absorbed cluster to the absorbing cluster and
                    # keep the per-record sfc_id in bike_record_dict in sync
                    for including_record_uuid in near_spatial_flow_cluster.including_record_detail.keys():
                        init_bike_record_with_sfc_dict[including_record_uuid] = this_spatial_flow_cluster
                        bike_record_dict[including_record_uuid]['sfc_id'] = this_spatial_flow_cluster.sfc_id

    # Keep one object per sfc_id, and only clusters whose record_num reaches
    # one fifth of the user's number of activity weekdays
    final_spatial_flow_cluster_dict = {}
    min_sfc_threshold = activity_weekdays_dict[uid] /5
    for uuid, sfc_obj in init_bike_record_with_sfc_dict.items():
        this_sfc_id = sfc_obj.sfc_id
        if this_sfc_id not in final_spatial_flow_cluster_dict.keys() and sfc_obj.record_num >= min_sfc_threshold:
            final_spatial_flow_cluster_dict[this_sfc_id] = sfc_obj
    each_spatial_flow_cluster_dict[uid] = final_spatial_flow_cluster_dict
each_spatial_flow_cluster_dict

{'sample_user1': {'sfc002': <spatial_flow_clustering_fuc.SpatialClusterFlow at 0x243dbccd580>,
  'sfc013': <spatial_flow_clustering_fuc.SpatialClusterFlow at 0x243dbbe13d0>,
  'sfc023': <spatial_flow_clustering_fuc.SpatialClusterFlow at 0x243dbbe1850>,
  'sfc027': <spatial_flow_clustering_fuc.SpatialClusterFlow at 0x243dbbe1670>},
 'sample_user2': {'sfc000': <spatial_flow_clustering_fuc.SpatialClusterFlow at 0x243dbccd820>,
  'sfc003': <spatial_flow_clustering_fuc.SpatialClusterFlow at 0x243dbd49880>,
  'sfc011': <spatial_flow_clustering_fuc.SpatialClusterFlow at 0x243dbd06f40>,
  'sfc014': <spatial_flow_clustering_fuc.SpatialClusterFlow at 0x243dbd06d90>}}

In [4]:
# Pick one spatial flow cluster of the chosen user and plot it
selected_uid = 'sample_user1'
sfc_obj = each_spatial_flow_cluster_dict[selected_uid]['sfc002']
plot_sfc_obj(sfc_obj, selected_uid)

In [5]:
# Read and visualize one of the user's spatial flow clusters.
# The uid is named once so the cluster lookup and the uid handed to the plot helper
# always refer to the same user (this cluster belongs to sample_user2).
selected_uid = 'sample_user2'
sfc_obj = each_spatial_flow_cluster_dict[selected_uid]['sfc003']
plot_sfc_obj(sfc_obj, selected_uid)

In [6]:
# Extract the spatiotemporal flow clusters for each user
# Result layout: {uid: {stfc_id: cluster object}}, built in three stages per user:
#   1. temporal grouping inside each spatial flow cluster,
#   2. merging of neighbouring clusters that come from different spatial flow clusters,
#   3. a size filter relative to the parent spatial flow cluster.
each_spatiotemporal_flow_cluster_dict = {}
for uid, spatial_flow_cluster_dict in each_spatial_flow_cluster_dict.items():
    # The spatiotemporal flow clustering process refers to Yao et al.(2018) https://doi.org/10.1109/ACCESS.2018.2864662
    unmerged_spatiotemporal_flow_cluster_dict = {}
    for spatial_flow_cluster_obj in spatial_flow_cluster_dict.values():
        # Mapping of record uuid -> spatiotemporal cluster object for the records of this spatial cluster
        init_bike_record_with_stfc_dict = init_bike_record_with_stfc_obj(spatial_flow_cluster_obj)
        # Pairwise comparison; values of the mapping are replaced while iterating (its keys never change)
        for uuid, stfc_obj in init_bike_record_with_stfc_dict.items():
            for another_uuid, another_stfc_obj in init_bike_record_with_stfc_dict.items():
                if uuid != another_uuid and stfc_obj.stfc_id != another_stfc_obj.stfc_id:
                    flows_ts = calculate_temporal_similarity(
                        stfc_obj.time_span,
                        another_stfc_obj.time_span, _expansion_coefficient=0.5)
                    # A temporal similarity of at least 0.5 merges the other cluster into this one
                    if flows_ts >= 0.5:
                        stfc_obj.add_flow(
                            another_stfc_obj.including_record_detail
                        )
                        # NOTE(review): only the entry for another_uuid is re-pointed here; other records that
                        # still reference another_stfc_obj keep doing so -- confirm this is intended
                        init_bike_record_with_stfc_dict[another_uuid] = stfc_obj
        
        # Collect the distinct cluster objects of this spatial cluster, first occurrence per stfc_id wins
        for uuid, stfc_obj in init_bike_record_with_stfc_dict.items():
            this_stfc_id = stfc_obj.stfc_id
            if this_stfc_id not in unmerged_spatiotemporal_flow_cluster_dict.keys():
                unmerged_spatiotemporal_flow_cluster_dict[this_stfc_id] = stfc_obj

    # Merging neighbourhood spatiotemporal flow clustering
    # Sort the unmerged stfc set in descending order of the number of included ride records so that the most representative cycling trajectories are traversed first
    sorted_unmerged_spatiotemporal_flow_cluster = sorted(unmerged_spatiotemporal_flow_cluster_dict.items(),
                                                         key=lambda item: item[1].stfc_record_num,
                                                         reverse=True)
    
    merged_spatiotemporal_flow_cluster_dict = {}
    has_traversed_stfc_id_list = []
    # NOTE(review): the outer loop never checks has_traversed_stfc_id_list, so a cluster that was already
    # absorbed by an earlier one is still visited here as this_stfc_obj -- confirm this is intended
    for this_stfc_id, this_stfc_obj in sorted_unmerged_spatiotemporal_flow_cluster:
        this_sfc_id = this_stfc_obj.sfc_id
        has_traversed_stfc_id_list.append(this_stfc_id)
        for another_stfc_id, another_stfc_obj in unmerged_spatiotemporal_flow_cluster_dict.items():
            # Candidates: not yet traversed and belonging to a different spatial flow cluster
            if another_stfc_id != this_stfc_id and another_stfc_id not in has_traversed_stfc_id_list and another_stfc_obj.sfc_id != this_sfc_id:
                this_stfc_flow_length = this_stfc_obj.flow.length
                another_stfc_flow_length = another_stfc_obj.flow.length
                # Distance threshold: 30% of the shorter flow length, capped at 200
                dist_threshold = min([this_stfc_flow_length, another_stfc_flow_length]) * 0.3
                dist_threshold = 200 if dist_threshold >= 200 else dist_threshold
                # coords[0] / coords[1] are compared as the origin / destination ends of the two flows
                origin_dist = get_distance(this_stfc_obj.flow.coords[0], another_stfc_obj.flow.coords[0])
                destination_dist = get_distance(this_stfc_obj.flow.coords[1], another_stfc_obj.flow.coords[1])
                flow_ts = calculate_temporal_similarity(this_stfc_obj.time_span,
                                                        another_stfc_obj.time_span,
                                                        _expansion_coefficient=0.5)
                # Merge when the time spans are similar and both ends lie within twice the distance threshold
                if flow_ts >= 0.5 and origin_dist < dist_threshold * 2 and destination_dist < dist_threshold * 2:
                    this_stfc_obj.merge_neighbor_tfc(another_stfc_obj)
                    has_traversed_stfc_id_list.append(another_stfc_id)
                    merged_spatiotemporal_flow_cluster_dict[another_stfc_id] = this_stfc_obj
                    merged_spatiotemporal_flow_cluster_dict[this_stfc_id] = this_stfc_obj
        # Clusters whose has_merged flag is still False are carried over under their own id
        if this_stfc_obj.has_merged is False:
            merged_spatiotemporal_flow_cluster_dict[this_stfc_id] = this_stfc_obj

    # De-duplicate by stfc_id and keep only clusters whose record count reaches
    # 30% of the record count of their spatial flow cluster
    final_spatiotemporal_flow_cluster_dict = {}
    for _, stfc_obj in merged_spatiotemporal_flow_cluster_dict.items():
        if stfc_obj.stfc_id not in final_spatiotemporal_flow_cluster_dict.keys():
            min_stfc_threshold = stfc_obj.sfc_record_num * 0.3
            if stfc_obj.stfc_record_num >= min_stfc_threshold:
                final_spatiotemporal_flow_cluster_dict[stfc_obj.stfc_id] = stfc_obj

    each_spatiotemporal_flow_cluster_dict[uid] = final_spatiotemporal_flow_cluster_dict
each_spatiotemporal_flow_cluster_dict

{'sample_user1': {'stfc000_sfc023': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbedbb50>,
  'stfc000_sfc013': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbf0b880>,
  'stfc000_sfc002': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbedb880>,
  'stfc000_sfc027': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbf077c0>},
 'sample_user2': {'stfc000_sfc000': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbea5b20>,
  'stfc000_sfc014': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbef5580>,
  'stfc004_sfc011': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbef5fd0>,
  'stfc011_sfc003': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbf07760>,
  'stfc000_sfc003': <spatiotemporal_flow_clustering_fuc.SpatioTemporalFlowCluster at 0x243dbd48c70>}}

In [7]:
# List, for every spatiotemporal flow cluster of sample_user2, the value of its has_merged attribute
[(stfc_id, cluster.has_merged)
 for stfc_id, cluster in each_spatiotemporal_flow_cluster_dict['sample_user2'].items()]

[('stfc000_sfc000', False),
 ('stfc000_sfc014', True),
 ('stfc004_sfc011', False),
 ('stfc011_sfc003', True),
 ('stfc000_sfc003', True)]

In [8]:
# Pick one spatiotemporal flow cluster of the chosen user and plot it
selected_uid = 'sample_user2'
stfc_obj = each_spatiotemporal_flow_cluster_dict[selected_uid]['stfc011_sfc003']
plot_stfc_obj(stfc_obj, selected_uid)

In [9]:
# Pick another spatiotemporal flow cluster of the chosen user and plot it
selected_uid = 'sample_user2'
stfc_obj = each_spatiotemporal_flow_cluster_dict[selected_uid]['stfc000_sfc003']
plot_stfc_obj(stfc_obj, selected_uid)

In [10]:
# Load the metro entrance table and index its (x_coord, y_coord) pairs in a KD tree for later nearest-entrance searches
metro_df = pd.read_csv(r'data/metro_entrance_2021.csv')
metro_entrance_points = list(zip(metro_df['x_coord'], metro_df['y_coord']))
metro_k_tree = spt.KDTree(metro_entrance_points)

# Sanity check: nearest entrance to one sample coordinate, displayed as (distance, row position)
probe_point = [12691853.378744228, 2576658.5064485003]
metro_k_tree.query(probe_point)

(138.26251498080254, 1017)

In [11]:
# Identify each user's commuting flows and determine if it transfers to public transport
# Result layout: {uid: {cf_id: commuting flow object}}
each_candidate_commuting_flow_dict = {}
for uid, spatiotemporal_flow_cluster_dict in each_spatiotemporal_flow_cluster_dict.items():
    candidate_commuting_flow_dict = {}
    # Sort the stfc set in descending order of the number of included ride records so that the most representative cycling trajectories are traversed first
    sorted_stfc = sorted(spatiotemporal_flow_cluster_dict.items(), key=lambda i: i[1].stfc_record_num, reverse=True)
    # NOTE(review): nothing in this cell ever appends to has_traversed_stfc_id_list, so both membership
    # checks below are always true and every ordered pair of clusters with different sfc_id is evaluated
    # -- confirm whether an append was intended
    has_traversed_stfc_id_list = []
    for this_stfc_id, stfc_obj in sorted_stfc:
        this_sfc_id = stfc_obj.sfc_id
        if this_stfc_id not in has_traversed_stfc_id_list:
            for another_stfc_id, another_stfc_obj in sorted_stfc:
                # Pair this cluster only with clusters that come from a different spatial flow cluster
                if this_stfc_id != another_stfc_id and another_stfc_id not in has_traversed_stfc_id_list and this_sfc_id != another_stfc_obj.sfc_id:
                    commuting_flow = identify_candidate_commuting_flow(stfc_obj, another_stfc_obj, _boundary_circle_radius=200,
                                                                       _working_hours_threshold=4)
                    # A falsy result means the pair is not a candidate commuting flow and is skipped
                    if commuting_flow:
                        # Run the transfer check against the metro entrance KD tree, then store the flow under its cf_id
                        commuting_flow = identify_transfer_commuting_flow(commuting_flow, metro_k_tree,metro_df,_transfer_distance_threshold=60)
                        candidate_commuting_flow_dict[commuting_flow.cf_id] = commuting_flow
    each_candidate_commuting_flow_dict[uid] = candidate_commuting_flow_dict
each_candidate_commuting_flow_dict

{'sample_user1': {'stfc000_sfc023_stfc000_sfc027': <ruled_base_decision_tress_fuc.SimplifiedCommutingFlow at 0x243dc6091c0>,
  'stfc000_sfc013_stfc000_sfc002': <ruled_base_decision_tress_fuc.SimplifiedCommutingFlow at 0x243dc6090d0>},
 'sample_user2': {'stfc000_sfc014_stfc011_sfc003': <ruled_base_decision_tress_fuc.SimplifiedCommutingFlow at 0x243dc609250>,
  'stfc000_sfc014_stfc000_sfc003': <ruled_base_decision_tress_fuc.SimplifiedCommutingFlow at 0x243dc609220>,
  'stfc000_sfc014_stfc004_sfc011': <ruled_base_decision_tress_fuc.SimplifiedCommutingFlow at 0x243dc609190>,
  'stfc000_sfc000_stfc011_sfc003': <ruled_base_decision_tress_fuc.SimplifiedCommutingFlow at 0x243dc609040>,
  'stfc000_sfc000_stfc000_sfc003': <ruled_base_decision_tress_fuc.SimplifiedCommutingFlow at 0x243dbf6b850>,
  'stfc000_sfc000_stfc004_sfc011': <ruled_base_decision_tress_fuc.SimplifiedCommutingFlow at 0x243dc609100>}}

In [12]:
# Derive the daily commuting flow of the chosen user from their candidate commuting flows, then plot it
selected_uid = 'sample_user1'
candidate_flows = each_candidate_commuting_flow_dict[selected_uid]
dcf = identify_user_commuting_category(candidate_flows)
plot_dcf_obj(dcf, selected_uid)

In [13]:
# Derive the daily commuting flow of the chosen user from their candidate commuting flows, then plot it
selected_uid = 'sample_user2'
candidate_flows = each_candidate_commuting_flow_dict[selected_uid]
dcf = identify_user_commuting_category(candidate_flows)
plot_dcf_obj(dcf, selected_uid)