In [1]:
import ast
import copy
import json
import os, sys
import random
import time
from functools import reduce

import geopandas as gpd
import numpy as np
import pandana
import pandas as pd

sys.path.append(os.path.abspath(".."))
from utils import load_motifs, sampling
from abm_configs import activities_code_to_name, activity_to_landuse

# Functions

In [2]:
def find_destination(simpop_df, 
                     network, 
                     activity_to_landuse, 
                     activity_full_name, 
                     landuse_to_polygons, 
                     external_zones,
                     sample_num=10, 
                     external_amenity_prob=0.3):
    """
    Loop-based destination assignment: walk through every person and every
    activity of that person, and pick a place for each activity.

    Arguments:
    --------------------------------------------
    simpop_df: DataFrame with one row per person; the columns read here are
        "activities", "home_geoid", "home_internal", "work_geoid" and
        "workplace_internal"
    network: object handed to get_distance as the network to measure impedance on
    activity_to_landuse: maps an activity full name to a {landuse: weight} dict
    activity_full_name: maps an activity code (e.g. "H") to its full name
    landuse_to_polygons: lookup handed to get_candidate_landuse
    external_zones: sequence of places used when an external amenity is chosen
    sample_num: maximum number of candidate places, passed to get_candidate_landuse
    external_amenity_prob: probability that a non-home/non-work activity is
        sent to a random external zone

    Returns:
    --------------------------------------------
    The same DataFrame object, modified in place, with two new list-valued
    columns: "locations" and "locations_internal" (one entry per activity).
    """
    for result_column in ("locations", "locations_internal"):
        simpop_df[result_column] = None

    for person_id, person in simpop_df.iterrows():
        visited, visited_internal = [], []
        schedule = person["activities"]
        for position, activity in enumerate(schedule):
            if activity == "H":
                destination, is_internal = person["home_geoid"], person["home_internal"]
            elif activity == "W":
                destination, is_internal = person["work_geoid"], person["workplace_internal"]
            # todo: a better way to decide whether choose external amenity
            elif external_amenity_prob > 0 and np.random.rand() <= external_amenity_prob:
                destination, is_internal = random.choice(external_zones), False
            else:
                # draw the destination landuse for this activity, then its candidate places
                lu_dist = activity_to_landuse.get(activity_full_name[activity])
                lu_options = np.array(list(lu_dist.keys()), dtype=object)
                des_lu = sampling(lu_options, list(lu_dist.values()), num=1)[0]
                candidates = get_candidate_landuse(des_lu, landuse_to_polygons, sample_num)
                num_candidates = len(candidates)
                if num_candidates == 0:
                    print(f"No candidate places for activity {activity} and landuse {des_lu}, switch to external amenities")
                    destination, is_internal = random.choice(external_zones), False
                elif num_candidates == 1:
                    destination, is_internal = candidates[0], True
                else:
                    # the trip starts from home for the first activity,
                    # otherwise from the place of the previous activity
                    starts_chain = position == 0
                    origin = person["home_geoid"] if starts_chain else visited[-1]
                    # this branch never handles "H", so only two impedances can apply
                    if starts_chain or schedule[position - 1] == "H":
                        imp_name = "from_resi_impedance"
                    else:
                        imp_name = "non_resi_impedance"
                    dist = get_distance(network, origin, candidates, imp_name)
                    _, chosen = huff_model(dist, beta=2, predict_y=True, topN=5,
                                           alt_names=np.array([candidates]))
                    destination, is_internal = chosen[0], True
            visited.append(destination)
            visited_internal.append(is_internal)
        simpop_df.at[person_id, "locations"] = visited
        simpop_df.at[person_id, "locations_internal"] = visited_internal
    return simpop_df


def find_destination_batch(simpop_df, 
                           network, 
                           activity_to_landuse, 
                           activity_full_name, 
                           landuse_to_polygons, 
                           external_zones,
                           sample_num=10, 
                           external_amenity_prob=0.3):
    """
    Batch destination assignment: instead of looping over persons, loop over
    activity steps and resolve the destinations of all persons at once per step.

    For each step:
      - persons doing "H" / "W" get their home / work place directly
        (step 0 is always treated as being at home);
      - a share `external_amenity_prob` of the remaining persons is sent to a
        random external zone;
      - the others sample a destination landuse, collect candidate places,
        get the network impedance from their previous place to every candidate
        (one shortest_path_lengths call per impedance name), and finally
        choose a place through huff_model;
      - persons whose landuse has no candidate place fall back to a random
        external zone, mirroring what find_destination does.

    Arguments:
    --------------------------------------------
    simpop_df: DataFrame with one row per person; the columns read here are
        "activities", "home_geoid", "home_internal", "work_geoid" and
        "workplace_internal". The index is used as person id.
    network: object exposing shortest_path_lengths(nodes_a, nodes_b, imp_name)
    activity_to_landuse: maps an activity full name to a {landuse: weight} dict
    activity_full_name: maps an activity code (e.g. "H") to its full name
    landuse_to_polygons: lookup handed to get_candidate_landuse
    external_zones: sequence of places used for external amenities
    sample_num: maximum number of candidate places per person and step
    external_amenity_prob: share of amenity trips assigned to external zones

    Returns:
    --------------------------------------------
    The same DataFrame object, modified in place, with two new list-valued
    columns: "locations" and "locations_internal" (one entry per activity).
    """
    simpop_lookup = dict(zip(simpop_df.index, simpop_df.to_dict("records")))
    activities = {pid: person["activities"] for pid, person in simpop_lookup.items()}
    steps = {pid: len(acts) for pid, acts in activities.items()}
    max_step = max(steps.values(), default=0)  # default keeps an empty population from crashing
    step_places = [{pid: None for pid, num_steps in steps.items() if step < num_steps}
                   for step in range(max_step)]
    step_places_internal = [dict(places) for places in step_places]

    for step in range(max_step):
        pids = list(step_places[step].keys())
        if step == 0:
            # every activity chain is treated as starting at home
            home_pids, work_pids, amenity_pids = pids, [], []
        else:
            home_pids = [pid for pid in pids if activities[pid][step] == "H"]
            work_pids = [pid for pid in pids if activities[pid][step] == "W"]
            amenity_pids = [pid for pid in pids if activities[pid][step] not in ("H", "W")]
        for pid in home_pids:
            step_places[step][pid] = simpop_lookup[pid]["home_geoid"]
            step_places_internal[step][pid] = simpop_lookup[pid]["home_internal"]
        for pid in work_pids:
            step_places[step][pid] = simpop_lookup[pid]["work_geoid"]
            step_places_internal[step][pid] = simpop_lookup[pid]["workplace_internal"]
        if not amenity_pids:
            continue

        # choose external amenities giving a priori probability
        num_external = min(int(round(len(amenity_pids) * external_amenity_prob)), len(amenity_pids))
        external_pids = []  # always (re)defined, so a step with no external trips cannot reuse stale ids
        if num_external > 0:
            # sample positions without replacement so exactly num_external distinct persons go external
            picked = np.random.choice(len(amenity_pids), size=num_external, replace=False)
            external_pids = [amenity_pids[i] for i in picked]
            external_amenities = np.random.choice(external_zones, size=num_external, replace=True).tolist()
            for pid, external_zone in zip(external_pids, external_amenities):
                step_places[step][pid] = external_zone
                step_places_internal[step][pid] = False

        external_set = set(external_pids)
        internal_pids = [pid for pid in amenity_pids if pid not in external_set]
        if not internal_pids:
            continue

        # sample a destination landuse per person, then collect its candidate places
        destination_landuse_distrib = {pid: activity_to_landuse[activity_full_name[activities[pid][step]]]
                                       for pid in internal_pids}
        destination_landuse = {pid: sampling(np.array(list(lu_distrib.keys()), dtype=object),
                                             list(lu_distrib.values()), num=1)[0]
                               for pid, lu_distrib in destination_landuse_distrib.items()}
        candidate_destinations = {pid: get_candidate_landuse(lu, landuse_to_polygons, sample_num)
                                  for pid, lu in destination_landuse.items()}

        # persons without any candidate place switch to external amenities
        no_candidate_pids = [pid for pid in internal_pids if len(candidate_destinations[pid]) == 0]
        if no_candidate_pids:
            print(f"No candidate places for {len(no_candidate_pids)} persons at step {step}, switch to external amenities")
            fallback_zones = np.random.choice(external_zones, size=len(no_candidate_pids), replace=True).tolist()
            for pid, external_zone in zip(no_candidate_pids, fallback_zones):
                step_places[step][pid] = external_zone
                step_places_internal[step][pid] = False
            internal_pids = [pid for pid in internal_pids if len(candidate_destinations[pid]) > 0]
            if not internal_pids:
                continue

        last_places = {pid: step_places[step - 1][pid] for pid in internal_pids}
        # the current activity is never "H" here, so only two impedance names can apply
        imp_names = {pid: "from_resi_impedance" if activities[pid][step - 1] == "H" else "non_resi_impedance"
                     for pid in internal_pids}

        # one shortest-path query per impedance name, covering all persons using that impedance
        dist_lookup = {}
        for target_imp in ("from_resi_impedance", "non_resi_impedance"):
            this_imp_pids = [pid for pid in internal_pids if imp_names[pid] == target_imp]
            if not this_imp_pids:
                continue
            nodes_a, nodes_b, candidate_idx_range = [], [], {}
            for pid in this_imp_pids:
                candidates = candidate_destinations[pid]
                candidate_idx_range[pid] = (len(nodes_b), len(nodes_b) + len(candidates))
                nodes_a.extend([last_places[pid]] * len(candidates))
                nodes_b.extend(candidates)
            this_imp_dist = np.array(network.shortest_path_lengths(nodes_a, nodes_b, target_imp))
            for pid, (start, stop) in candidate_idx_range.items():
                dist_lookup[pid] = this_imp_dist[start:stop]

        # pad to a rectangular matrix: NaN distances / "placeholder" names mark missing alternatives
        max_num_candidates = max(len(candidate_destinations[pid]) for pid in internal_pids)
        # restricting to the top 10 only matters when some person has more than 10 candidates
        topN = 10 if max_num_candidates > 10 else None
        dist_mat = np.full((len(internal_pids), max_num_candidates), np.nan)
        for row, pid in enumerate(internal_pids):
            dist_mat[row, :len(dist_lookup[pid])] = dist_lookup[pid]
        # dtype=object keeps the candidate ids in their original type next to the string placeholder
        alt_names = np.array([list(candidate_destinations[pid])
                              + ["placeholder"] * (max_num_candidates - len(candidate_destinations[pid]))
                              for pid in internal_pids], dtype=object)
        probs, chosen_amenities = huff_model(dist_mat, beta=2, predict_y=True, topN=topN, alt_names=alt_names)
        for pid, amenity in zip(internal_pids, chosen_amenities):
            step_places[step][pid] = amenity
            step_places_internal[step][pid] = True

    # unfold stepwise results
    simpop_df["locations"] = [[step_places[step][pid] for step in range(num_steps)]
                              for pid, num_steps in steps.items()]
    simpop_df["locations_internal"] = [[step_places_internal[step][pid] for step in range(num_steps)]
                                       for pid, num_steps in steps.items()]
    return simpop_df
    
   
            
def huff_model(dist, attract=None, alt_names=None, alpha=1, beta=2, predict_y=False, topN=None):
    """ 
    takes a distance matrix and an optional attraction matrix, calculates choice probabilities 
    and predicts choice outcomes by sampling according to probabilities
    prob = (attract**alpha / dist**beta) / sum_over_all_alternatives(attract**alpha / dist**beta)
    
    NaN entries in dist mark placeholder (non-existing) alternatives: they are ignored when
    normalizing probabilities and are never sampled.
    
    Arguments:
    --------------------------------------------
    dist: distance matrix, ncs(number of choice situations) * nalt(number of alternatives), or 1-d array
    attract: optional attraction matrix, ncs * nalt, or 1-d array
    alt_names: optional matrix of alternative names, ncs * nalt, or 1-d array (lists are accepted)
    alpha, beta: coefficients of attraction and distance
    predict_y: whether or not to predict choice outcomes via sampling
    topN: when predicting choice outcomes, only alternatives with top N probabilities will be considered
    
    Returns:
    --------------------------------------------
    prob: ncs * nalt array of choice probabilities (NaN for placeholder alternatives)
    y: list with one sampled alternative (name, or column index if alt_names is None) per
       choice situation, or None if predict_y is False
    
    Raises:
    --------------------------------------------
    ValueError: if predict_y is True and a choice situation has no valid (non-NaN) alternative
    """
    dist = np.asarray(dist, dtype=float)
    dist = np.maximum(dist, 0.01)    # avoid dist=0; NaN placeholders are propagated
    if attract is None:
        attract = np.ones_like(dist)
    else:
        attract = np.asarray(attract, dtype=float)
    if alt_names is not None:
        alt_names = np.asarray(alt_names)   # allows plain (nested) lists of names
    if dist.ndim == 1:
        dist = dist.reshape(1, -1)
        attract = attract.reshape(1, -1)
        if alt_names is not None:
            alt_names = alt_names.reshape(1, -1)
    ncs, nalt = dist.shape
    u = (attract ** alpha) / (dist ** beta)
    # nansum so that placeholder alternatives do not turn the whole row into NaN
    prob = u / np.nansum(u, axis=1, keepdims=True)
    if not predict_y:
        return prob, None

    if topN:
        # NaN sorts last, so valid alternatives always come before placeholders
        use_idx = np.argsort(-prob, axis=1)[:, :topN]
        use_prob = np.take_along_axis(prob, use_idx, axis=1)
        # renormalize over the valid alternatives only; a plain sum would yield NaN
        # for every row having fewer than topN valid alternatives
        use_prob = use_prob / np.nansum(use_prob, axis=1, keepdims=True)
        if alt_names is None:
            use_names = use_idx
        else:
            use_names = np.take_along_axis(alt_names, use_idx, axis=1)
    else:
        use_prob = prob
        if alt_names is None:
            use_names = np.tile(np.arange(nalt), (ncs, 1))
        else:
            use_names = alt_names

    y = []
    for i in range(ncs):
        # get rid of potential placeholder nan
        valid = ~np.isnan(use_prob[i, :])
        if not valid.any():
            raise ValueError(f"Choice situation {i} has no valid alternative to sample from")
        valid_names = use_names[i, :][valid]
        valid_probs = use_prob[i, :][valid]
        # sample a position and look the name up, so names keep their original type
        chosen_pos = np.random.choice(len(valid_names), p=valid_probs)
        y.append(valid_names[chosen_pos])
    return prob, y

    
def get_distance(net, origins, destinations, imp_name):
    """
    Network impedance between every origin and every destination.

    Arguments:
    --------------------------------------------
    net: object exposing shortest_path_lengths(nodes_a, nodes_b, imp_name)
    origins: a single node, or a list / tuple / numpy array of nodes
    destinations: a single node, or a list / tuple / numpy array of nodes
    imp_name: name of the impedance passed through to net.shortest_path_lengths

    Returns:
    --------------------------------------------
    numpy array of shape (len(origins), len(destinations)); entry [i, j] is the
    value returned for the pair (origins[i], destinations[j])
    """
    def as_node_list(nodes):
        # any sequence of nodes is accepted; anything else is treated as one node
        if isinstance(nodes, np.ndarray):
            return nodes.ravel().tolist()
        if isinstance(nodes, (list, tuple)):
            return list(nodes)
        return [nodes]

    origins = as_node_list(origins)
    destinations = as_node_list(destinations)
    # all (origin, destination) pairs in origin-major order, queried in one call
    nodes_a = [origin for origin in origins for _ in destinations]
    nodes_b = destinations * len(origins)
    dist = net.shortest_path_lengths(nodes_a, nodes_b, imp_name)
    return np.array(dist).reshape(len(origins), len(destinations))


BROAD_LANDUSE_TO_POLYGONS = {}  # this could be precooked
def get_candidate_landuse(landuse_type, landuse_to_polygons, sample_num):
    """
    Collect the candidate destinations for a destination landuse.

    landuse_type (destination landuse) could be 
        an exact-matching landuse string (tried first),
        or a broad-matching landuse string (prefix of the landuse keys), used when
            exact matching yields no destination,
        or a list / tuple / array of exact-matching landuse strings

    Arguments:
    --------------------------------------------
    landuse_type: see above
    landuse_to_polygons: dict mapping a landuse string to a list of destinations
    sample_num: if truthy and more candidates are found, a random sample of this
        size (without replacement) is returned instead of all candidates

    Returns:
    --------------------------------------------
    A new list of candidate destinations (possibly empty); the lists stored in
    landuse_to_polygons are never returned directly, so callers may modify the result.

    Raises:
    --------------------------------------------
    ValueError: if landuse_type is neither a string nor a list / tuple / array
    """
    if isinstance(landuse_type, str):  # also covers numpy string scalars
        # exact matching
        candidate_destinations = list(landuse_to_polygons.get(landuse_type, []))
        if not candidate_destinations:
            # broad matching: every landuse whose name starts with the given string
            candidate_destinations = [polygon
                                      for lu, polygons in landuse_to_polygons.items()
                                      if lu.startswith(landuse_type)
                                      for polygon in polygons]
    elif isinstance(landuse_type, (list, tuple, np.ndarray)):
        candidate_destinations = [polygon
                                  for lu in landuse_type
                                  for polygon in landuse_to_polygons.get(lu, [])]
    else:
        raise ValueError(f"Destination landuse type must be a string or iterable, not {type(landuse_type)}")
    if sample_num and len(candidate_destinations) > sample_num:
        candidate_destinations = np.random.choice(candidate_destinations, size=sample_num, replace=False).tolist()
    return candidate_destinations

# Activity Scheduling

In [3]:
# Load the sample motifs and turn each one into a chain of activities with start times.
motif_fpath = "../data/sample_motifs_nhts.csv"
motifs_df, activitie_classes = load_motifs(motif_fpath)
print("activity types: ", activitie_classes)

activities_list, start_times_list = [], []
for idx, data in motifs_df.iterrows():
    motif = data["motif"]
    # positions where the activity differs from the previous slot, i.e. a new activity begins
    change_points = [ia for ia in range(len(motif)) if ia == 0 or motif[ia] != motif[ia - 1]]
    activities = [motif[ia] for ia in change_points]
    # one slot = 3600 s; the jitter range (-1800, 1700) keeps consecutive start times
    # strictly increasing, because slots are at least 3600 s apart
    start_times = [ia * 3600 + random.randint(-1800, 1700) for ia in change_points]
    start_times[0] = 0  # the first activity always starts at time 0
    activities_list.append(activities)
    start_times_list.append(start_times)

# legend of the activity codes
for act in activitie_classes:
    print(f"{act}: {activities_code_to_name[act]}")

activity types:  ['C', 'D', 'E', 'G', 'H', 'P', 'R', 'S', 'V', 'W', 'X', 'Z']
C: College
D: Drop-off
E: Eat
G: Groceries
H: Home
P: Health
R: Recreation
S: Shopping
V: Visit
W: Work
X: Exercise
Z: Religion


In [4]:
sim_pop_df_raw = pd.read_csv("../../temp/checkpoints/sim_pop_df_0131.csv")
# The list-valued columns come back from the CSV as text. They are parsed with
# ast.literal_eval instead of eval: it only accepts Python literals, so the file
# contents can never execute code.
for list_column in ("activities", "start_times"):
    sim_pop_df_raw[list_column] = sim_pop_df_raw[list_column].map(ast.literal_eval)
sim_pop_df_raw.head()

Unnamed: 0,home_geoid,work_geoid,home_internal,workplace_internal,type,presence,activities,start_times
0,9_228,0_4,True,False,second_home_owner,True,"[H, W, X, H]","[0, 26303, 57223, 63805]"
1,3_307,0_223,True,True,resident,True,"[H, C, H, W, H]","[0, 26254, 49522, 64734, 80425]"
2,6_220,1_0,True,False,resident,True,"[H, W, D, R, G, D, H]","[0, 26148, 52460, 58400, 60721, 65362, 73021]"
3,2_528,1_212,True,True,resident,True,"[H, W, H]","[0, 23813, 59587]"
4,9_225,1_543,True,False,resident,True,"[H, G, H]","[0, 63037, 67359]"


In [5]:
%%time
polygons = gpd.read_file("../../graph/outputs/morph_polys_extended.geojson")
net = pandana.network.Network.from_hdf5("../../graph/outputs/morph_net_extended_p4.h5")
landuse_to_polygons = json.load(open("../data/landuses_lookup.json"))
external_zones = json.load(open("../data/external_zones.json"))

Wall time: 15.7 s


# Loop-based 

The loop-based approach loops through each simulated person, and then loops through each of his activities.  
- Destination of "Home" and "Work" will directly use his home location and workplace.
- Destinations of amenities other than "Home" and "Work" are chosen using the Huff model. Because the Huff model needs distances, and the distance to the next destination depends on the current destination, we have to use this double loop, which is time-consuming.

A detailed analysis of the CPU time of each line can be seen at https://github.com/l3cities/NEOM_L3/blob/cw_dev/ABM/profiles/loop_based_activitiy_destination.txt  
Interestingly, the most time-consuming part is calculating the network impedance.

### Check computing time

In [6]:
%%timeit
sim_pop_df = sim_pop_df_raw.copy()
sim_pop_df = find_destination(sim_pop_df,
                              net,
                              activity_to_landuse=activity_to_landuse, 
                              activity_full_name=activities_code_to_name, 
                              landuse_to_polygons=landuse_to_polygons, 
                              external_zones=external_zones,
                              sample_num=10,
                              external_amenity_prob=0.3)

43.8 s ± 2.67 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Check results

In [8]:
# Run the loop-based assignment once on a fresh copy so the result can be inspected.
sim_pop_df = find_destination(
    sim_pop_df_raw.copy(),
    net,
    activity_to_landuse=activity_to_landuse,
    activity_full_name=activities_code_to_name,
    landuse_to_polygons=landuse_to_polygons,
    external_zones=external_zones,
    sample_num=10,
    external_amenity_prob=0.3,
)
# Show the persons with the longest activity chains first.
sim_pop_df["act_len"] = sim_pop_df["activities"].map(len)
sim_pop_df = sim_pop_df.sort_values("act_len", ascending=False)
sim_pop_df.head()

Unnamed: 0,home_geoid,work_geoid,home_internal,workplace_internal,type,presence,activities,start_times,locations,locations_internal,act_len
489,3_380,2_68,True,True,resident,True,"[H, X, P, G, W, H, D, S, E, V, E, D, H]","[0, 28604, 33420, 37368, 37848, 52774, 56744, ...","[3_380, 0_398, 0_456, 0_356, 2_68, 3_380, 2_60...","[True, True, True, True, True, True, True, Fal...",13
674,6_189,1_549,True,False,resident,True,"[H, X, P, G, W, H, D, S, E, V, E, D, H]","[0, 28604, 33420, 37368, 37848, 52774, 56744, ...","[6_189, 0_68, 0_455, 0_361, 1_549, 6_189, 0_55...","[True, True, True, True, False, True, False, T...",13
855,5_300,1_555,True,False,resident,True,"[H, X, P, G, W, H, D, S, E, V, E, D, H]","[0, 28604, 33420, 37368, 37848, 52774, 56744, ...","[5_300, 0_393, 1_5, 1_201, 1_555, 5_300, 1_330...","[True, True, False, True, False, True, True, T...",13
13,0_313,1_548,True,False,tourist,True,"[H, P, H, P, G, H, P, G, H, Z, E, H]","[0, 23150, 29272, 31726, 38472, 45284, 50451, ...","[0_313, 1_145, 0_313, 0_457, 1_137, 0_313, 0_5...","[True, False, True, True, False, True, False, ...",12
2876,6_265,0_527,True,False,resident,True,"[H, P, H, P, G, H, P, G, H, Z, E, H]","[0, 23150, 29272, 31726, 38472, 45284, 50451, ...","[6_265, 0_513, 6_265, 0_456, 0_565, 6_265, 1_6...","[True, False, True, True, False, True, False, ...",12


# Batch

The batch approach aims to greatly reduce the computing time of loop-based approach.  
Assuming all simulated persons have a maximum number of {max_step} activities, then we will only loop {max_step} times. For each step:
- Collect all pids (person_id) that still have activities at this step
- Get those whose activities at this step are Home or Work and directly assign their home locations and workplaces as destinations
- Get those whose activities at this step are amenities other than Home and Work (amenity_pids), and then:
    - Randomly assign an external node to those who are assumed to choose external amenities (external_pids) according to a predefined ratio. There should be a better way to determine how people choose between internal and external amenities.
    - For the rest, who choose internal amenities (internal_pids):
        - Collect their last places
        - Collect their next activities to do, and sample corresponding landuse based on predefined {activity_to_landuse}, and get the candidate destinations (polygons/nodes) using predefined {landuse_to_polygons}
        - For different types of network weights, compute the network impedance for all corresponding persons from their last places to candidate destinations in batch.
        - Now that we have distance information for all persons, run the Huff model in batch and directly get the destinations of all persons at this step. Note that placeholders like "nan" are needed, as persons may have different numbers of alternatives.  

When all steps are finished, the stepwise results are unfolded to the format we want.  

### Check computing time

In [9]:
%%timeit
sim_pop_df = sim_pop_df_raw.copy()
sim_pop_df = find_destination_batch(sim_pop_df,
                                    net,
                                    activity_to_landuse=activity_to_landuse, 
                                    activity_full_name=activities_code_to_name, 
                                    landuse_to_polygons=landuse_to_polygons, 
                                    external_zones=external_zones,
                                    sample_num=10,
                                    external_amenity_prob=0.3)

1.6 s ± 17.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### Check results

In [10]:
# Run the batch assignment once on a fresh copy so the result can be inspected.
sim_pop_df = find_destination_batch(
    sim_pop_df_raw.copy(),
    net,
    activity_to_landuse=activity_to_landuse,
    activity_full_name=activities_code_to_name,
    landuse_to_polygons=landuse_to_polygons,
    external_zones=external_zones,
    sample_num=10,
    external_amenity_prob=0.3,
)
# Show the persons with the longest activity chains first.
sim_pop_df["act_len"] = sim_pop_df["activities"].map(len)
sim_pop_df = sim_pop_df.sort_values("act_len", ascending=False)
sim_pop_df.head()

Unnamed: 0,home_geoid,work_geoid,home_internal,workplace_internal,type,presence,activities,start_times,locations,locations_internal,act_len
489,3_380,2_68,True,True,resident,True,"[H, X, P, G, W, H, D, S, E, V, E, D, H]","[0, 28604, 33420, 37368, 37848, 52774, 56744, ...","[3_380, 1_501, 0_354, 0_361, 2_68, 3_380, 1_33...","[True, False, True, True, True, True, True, Tr...",13
674,6_189,1_549,True,False,resident,True,"[H, X, P, G, W, H, D, S, E, V, E, D, H]","[0, 28604, 33420, 37368, 37848, 52774, 56744, ...","[6_189, 2_116, 1_503, 0_361, 1_549, 6_189, 2_2...","[True, True, False, True, False, True, True, F...",13
855,5_300,1_555,True,False,resident,True,"[H, X, P, G, W, H, D, S, E, V, E, D, H]","[0, 28604, 33420, 37368, 37848, 52774, 56744, ...","[5_300, 1_502, 0_456, 0_358, 1_555, 5_300, 1_3...","[True, False, True, True, False, True, True, F...",13
13,0_313,1_548,True,False,tourist,True,"[H, P, H, P, G, H, P, G, H, Z, E, H]","[0, 23150, 29272, 31726, 38472, 45284, 50451, ...","[0_313, 0_558, 0_313, 0_461, 1_138, 0_313, 0_4...","[True, False, True, True, False, True, True, T...",12
2876,6_265,0_527,True,False,resident,True,"[H, P, H, P, G, H, P, G, H, Z, E, H]","[0, 23150, 29272, 31726, 38472, 45284, 50451, ...","[6_265, 0_459, 6_265, 0_352, 1_140, 6_265, 0_4...","[True, True, True, True, False, True, True, Tr...",12
