# Step 3: Calculate OD

### This notebook takes in the processed graph with assigned speeds from Step 2. It also takes origins and destinations as inputs. It assigns a group of destinations to each origin, and creates an Origin-Destination (OD) matrix for each group. From these OD matrices it generates the shortest paths, as well as tables of the edges that make up the shortest paths.

In [None]:
import os, sys, time, importlib
import osmnx

import geopandas as gpd
import pandas as pd
import networkx as nx
import numpy as np
sys.path.append("../../../GOSTnets")
import GOSTnets as gn

from shapely.geometry import LineString, Point

In [None]:
# This is a Jupyter Notebook extension which reloads all of the modules whenever you run the code
# This is optional but good if you are modifying and testing source code
%load_ext autoreload
%autoreload 2

In [None]:
from GOSTnets.load_traffic2 import *

In [None]:
# read graph
# nx.read_gpickle was removed in NetworkX 3.0. It was a thin wrapper around
# pickle.load, so loading with the standard library works on old and new versions.
# NOTE: unpickling can execute arbitrary code -- only load graph files you trust.
import pickle

with open('../mapbox_traffic/sri_lanka_clean_w_time_largest_max_speeds.pickle', 'rb') as graph_file:
    G = pickle.load(graph_file)

In [None]:
#len(G.edges)

In [None]:
gn.example_edge(G, 25)

## load origins
Depending on your analysis, you would load either your airports or your cities as origins

In [None]:
# Load the origin points. The commented-out line is the alternative origin set
# (international airports); the active line loads the top-10 cities layer.
#origins = gpd.read_file('./origins_destinations/intl_airport_updated.shp')
origins = gpd.read_file('../mapbox_traffic/origins_destinations/cities_top10_32644.shp')

In [None]:
origins

## load destinations
Depending on your analysis, you would load either your cities or your tourist points as destinations

In [None]:
# Load the destination points. The commented-out line is the alternative
# destination set (top-10 cities); the active line loads the tourism points layer.
#destinations = gpd.read_file('./origins_destinations/cities_top10.shp')
destinations = gpd.read_file('../mapbox_traffic/origins_destinations/tourism_on_land_32644.shp')

In [None]:
destinations

### Snap your origins and destinations to the graph

In [None]:
# Snap the origin points to the graph. The result carries an 'NN' column (read below)
# that identifies the graph node matched to each point -- presumably the nearest node; confirm in GOSTnets.
# Source and target CRS are both epsg:32644, so no reprojection is requested here.
origins_gdf = gn.pandana_snap_c(G, origins, source_crs = 'epsg:32644', target_crs = 'epsg:32644')

In [None]:
#origins_gdf

In [None]:
# Unique graph nodes the origins were snapped to. set() drops duplicates
# (several points can share a node); the resulting order is arbitrary.
origins_list = list(set(origins_gdf.NN))

In [None]:
# Snap the destination points to the graph, using the same CRS settings as for the origins.
# The 'NN' column of the result (read below) identifies the matched graph node.
destinations_gdf = gn.pandana_snap_c(G, destinations, source_crs = 'epsg:32644', target_crs = 'epsg:32644')

In [None]:
#destinations_gdf

In [None]:
# Unique graph nodes the destinations were snapped to. set() drops duplicates
# (several points can share a node); the resulting order is arbitrary.
destinations_list = list(set(destinations_gdf.NN))

In [None]:
#destinations_list

## Calculate OD

In [None]:
# Compute the full OD matrix between all snapped origins and destinations and
# print how long the calculation took, in seconds.
# It will use the default weight of 'time'
import time

start = time.time()

# fail_value is passed through to calculate_OD -- presumably the value stored
# for pairs without a route; confirm against the GOSTnets documentation.
OD = gn.calculate_OD(G, origins_list, destinations_list, fail_value = 9999999)

end = time.time()
print(end - start)

In [None]:
#OD

In [None]:
# Label the OD matrix: one row per origin node, one column per destination node.
OD_df = pd.DataFrame(OD, index = origins_list, columns = destinations_list)

In [None]:
OD_df

In [None]:
#OD_df.min(axis=0)

### Now we need to find the nearest origin (lowest travel time) for each destination point, and based on this assign a group of destination points to each of the origins

In [None]:
# idxmin(axis=0) returns, for every destination column, the origin (row label) with the smallest value.
# to_frame(0) stores those origins in a column named 0 (indexed by destination), and
# groupby(0)[0] groups that column by origin, so each group's index lists the
# destinations assigned to that origin.
groupby_obj = OD_df.idxmin(axis=0).to_frame(0).groupby(0)[0]

In [None]:
groupby_obj

In [None]:
# visualize groupby_obj
#groupby_obj.apply(list)

In [None]:
#type(groupby_obj.apply(list))

In [None]:
# a nice way to visualize the groupby_obj
#groupby_obj.describe()

### create a dictionary that maps each origin to the list of destination points assigned to it

In [None]:
# Map each origin node to the list of destination nodes assigned to it.
# Each group yielded by groupby_obj is a Series whose index holds destination
# nodes and whose values hold the origin node they were assigned to.
origin_destination_pt_dict = {}
for name, group in groupby_obj:
    # Series.iteritems() was removed in pandas 2.0; items() is the equivalent
    # and is available in older pandas versions as well.
    for destination_node, origin_node in group.items():
        origin_destination_pt_dict.setdefault(origin_node, []).append(destination_node)

In [None]:
origin_destination_pt_dict

## Loop through the dictionary in order to run calculate_OD from each origin to its assigned group of destinations

In [None]:
# Build one single-row OD matrix per origin, restricted to the destinations
# assigned to that origin, and keep both the raw matrix and a labelled DataFrame.
# Note: 'destination' is the LIST of destination nodes assigned to 'origin'.
OD = {}
OD_df_dict = {}
for origin, destination in origin_destination_pt_dict.items():
    origin_matrix = gn.calculate_OD(G, [origin], destination, fail_value = 9999999)
    OD[origin] = origin_matrix
    OD_df_dict[origin] = pd.DataFrame(
        origin_matrix,
        index = [origin],
        columns = destination,
    )

In [None]:
OD_df_dict

## Now work on generating routes and visualizing them

In [None]:
from shapely.ops import linemerge
from itertools import islice

### Speed dictionaries used to model bringing an improved road up to an average speed. Our default methodology is to import the graph in which edges without a traffic speed use the OSM max speeds as their default speeds, and to have the tabulate_edges function apply the mapbox_mean_speeds dictionary to all edges that do have traffic data, comparing each edge's Mapbox traffic speed to the Mapbox mean speed for its road class

In [None]:
# Maximum speed per road class; an alternative to mapbox_mean_speeds that can be
# passed as mean_speed_dict. Keys match the edge 'infra_type' values checked in tabulate_edges.
# Presumably km/h, since tabulate_edges divides an edge length assumed to be in km
# by these values and multiplies by 3600 to get seconds -- confirm.
max_speeds = {
    'secondary': 50,
    'secondary_link': 45,
    'tertiary': 40,
    'tertiary_link': 40,
    'residential': 25,
    'unclassified': 25,
}

In [None]:
# Mean speed per road class; this is the default mean_speed_dict of tabulate_edges and
# generate_complete_edges_and_routes. Keys match the edge 'infra_type' values checked in tabulate_edges.
# Presumably km/h, since tabulate_edges divides an edge length assumed to be in km
# by these values and multiplies by 3600 to get seconds -- confirm.
mapbox_mean_speeds = {
    'secondary': 34,
    'secondary_link': 9,
    'tertiary': 25,
    'tertiary_link': 13,
    'residential': 20,
    'unclassified': 20,
}

### The tabulate_edges function will get called by the generate_complete_edges_and_routes for each route. The tabulate edges function loops through each segment of the route and calculates various metrics into both an edge list and also for each route. One of these metrics is calculating the seconds saved if a road segment could be improved. It calculates this potential improvement using the input mean speed dictionary. If the road segment's speed could be improved to the mean speed then it will calculate a positive seconds saved value.

In [None]:
def tabulate_edges(route, mean_speed_dict = mapbox_mean_speeds):
    """Tabulate per-edge metrics and the merged geometry for one route.

    Walks consecutive node pairs of ``route`` in the module-level graph ``G``
    and, for each edge, records the time that would be saved if the edge were
    travelled at the speed given by ``mean_speed_dict`` for its road class.

    Parameters
    ----------
    route : sequence of node ids
        Ordered nodes of a path in ``G`` (e.g. the output of ``nx.dijkstra_path``).
    mean_speed_dict : dict
        Maps an edge ``infra_type`` to the speed used to compute the improved
        travel time. Must contain every road class listed in ``rural_roads``.
        The formula assumes edge ``length`` is in km and the speed is per hour,
        so the result is in seconds (assumption carried over from the original code).

    Returns
    -------
    tuple
        ``(edge_table, route_geometry, improved_time)`` where ``edge_table`` is a
        list of rows ``[from_node, to_node, sec_saved, imp_cost, length, time,
        infra_type, mean_speed, geometry]``, ``route_geometry`` is the union of
        the edge geometries, and ``improved_time`` is the route travel time using
        the improved time on improvable edges and the current time elsewhere.

    Raises
    ------
    KeyError
        If an edge lacks 'geometry', 'infra_type', 'length' or 'time', or if an
        improvable edge's ``infra_type`` is missing from ``mean_speed_dict``.
    """
    # road classes that are candidates for improvement
    rural_roads = ('residential', 'secondary', 'secondary_link', 'tertiary', 'tertiary_link', 'unclassified')

    edge_table = []
    route_geometry = LineString()
    improved_time = 0

    for from_node, to_node in zip(route[:-1], route[1:]):
        # G is a multigraph: get_edge_data returns {key: attribute_dict}. Weighted shortest
        # paths use the cheapest parallel edge, so tabulate that edge instead of always key 0
        # (which may be a slower parallel edge, or may not exist at all).
        parallel_edges = G.get_edge_data(from_node, to_node)
        edge_data = min(parallel_edges.values(), key = lambda data: data['time'])

        edge_geometry = edge_data['geometry']
        edge_infra_type = edge_data['infra_type']
        edge_length = edge_data['length']
        edge_time = edge_data['time']

        # optional attributes: edges without them are treated as having no traffic data / no cost
        mean_speed = edge_data.get('traffic_mean_speed', 0)
        edge_imp_cost = edge_data.get('imp_cost', 0)

        # defaults for edges that cannot be improved: sec_saved and improvement cost are 0
        new_time_s = None
        edge_savings = 0
        recorded_imp_cost = 0

        # only edges with traffic data and an improvable road class get a savings estimate
        if mean_speed > 0 and edge_infra_type in rural_roads:
            # assumes that current edge length is in km
            # use either the max_speeds dictionary or the mapbox_mean_speeds dictionary here
            new_time_s = (edge_length / mean_speed_dict[edge_infra_type]) * 3600
            edge_savings = edge_time - new_time_s
            recorded_imp_cost = edge_imp_cost

        edge_table.append([from_node, to_node, edge_savings, recorded_imp_cost, edge_length, edge_time, edge_infra_type, mean_speed, edge_geometry])

        route_geometry = route_geometry.union(edge_geometry)

        # sum the improved time where the road can be improved, otherwise the current edge time.
        # Compare against None explicitly: an improved time of 0.0 (zero-length edge) is still valid.
        improved_time += edge_time if new_time_s is None else new_time_s

    return (edge_table, route_geometry, improved_time)

### The generate_complete_edges_and_routes function loops through each origin and destination and gets the complete shortest path, meaning every intermediate node along the shortest path. This allows us to process each edge along each shortest path. The function returns a GeoDataFrame that contains each edge along each shortest path. An edge can be traversed more than once if more than one shortest path uses it. In this case the edge becomes weighted: its 'weighted_sec_saved' value is its 'sec_saved' multiplied by the number of times it is traversed. The generate_complete_edges_and_routes function also returns a GeoDataFrame of all of the shortest paths.

In [None]:
def generate_complete_edges_and_routes(input_df, mean_speed_dict = mapbox_mean_speeds, crs = 'epsg:4326', limit = None):
    """Route every origin/destination pair of an OD table and tabulate the edges used.

    For each (row label, column label) pair of ``input_df`` the shortest path by
    'time' is computed on the module-level graph ``G`` and passed to
    ``tabulate_edges``. Edges are then aggregated per (from, to) pair, counting
    how many routes traverse each edge.

    Parameters
    ----------
    input_df : pandas.DataFrame
        OD table: index holds origin nodes, columns hold destination nodes and
        the cells hold the travel time stored in the routes output as 'time'.
    mean_speed_dict : dict
        Speed per road class, forwarded to ``tabulate_edges``.
    crs : str, optional
        CRS label attached to both output GeoDataFrames. Defaults to the
        previously hard-coded 'epsg:4326'.
        NOTE(review): the rest of the notebook works in 'epsg:32644' and later
        overrides the CRS of the saved files -- confirm which CRS the edge
        geometries are really in and pass it here.
    limit : int or None, optional
        Process at most this many origins, and at most this many destinations
        per origin (useful for quick test runs). ``None`` processes everything.

    Returns
    -------
    list
        ``[edges_gdf, routes_gdf]``. ``edges_gdf`` has one row per distinct edge,
        with 'w' (times traversed) and 'weighted_sec_saved' (= w * sec_saved),
        sorted by 'weighted_sec_saved' in descending order. ``routes_gdf`` has
        one row per route.

    Raises
    ------
    networkx.NetworkXNoPath, networkx.NodeNotFound
        Propagated from ``nx.dijkstra_path`` when a pair cannot be routed.
    """
    complete_edges = []
    complete_routes = []

    for origin, row in islice(input_df.iterrows(), limit):
        # labels may arrive as strings or numpy integers; graph nodes are looked up
        # as plain ints when the label is convertible, otherwise the label is used as-is
        try:
            origin = int(origin)
        except (TypeError, ValueError, OverflowError):
            pass

        for destination, value in islice(row.items(), limit):
            try:
                destination = int(destination)
            except (TypeError, ValueError, OverflowError):
                pass

            route = nx.dijkstra_path(G, origin, destination, weight = 'time')

            edge_table, route_geometry, improved_time = tabulate_edges(route, mean_speed_dict = mean_speed_dict)
            complete_edges.extend(edge_table)

            # take the endpoints from the route itself: when origin and destination snap to
            # the same node the route has no edges and edge_table is empty
            complete_routes.append([route[0], route[-1], value, improved_time, route_geometry])

    # convert complete_edges to gdf
    complete = pd.DataFrame(complete_edges, columns = ['o', 'd', 'sec_saved', 'imp_cost', 'length', 'time', 'infra_type', 'mean_speed', 'geometry'])
    complete['w'] = 1
    # collapse repeated traversals of the same edge; 'w' becomes the traversal count
    complete_count = complete.groupby(['o','d']).agg(
        {
            'w':"count",
            'sec_saved': 'first',
            'imp_cost': 'first',
            'mean_speed': 'first',
            'length':'first',
            'time':'first',
            'infra_type':'first',
            'geometry':'first'
        }
    )
    complete_count.reset_index(inplace = True)
    complete_count['o'] = complete_count['o'].astype(str)
    complete_count['d'] = complete_count['d'].astype(str)
    complete_count['weighted_sec_saved'] = complete_count.w * complete_count.sec_saved
    # sort_values returns a new frame; keep it so the output really is sorted
    complete_count = complete_count.sort_values(by=['weighted_sec_saved'], ascending=False)
    complete_count_gdf = gpd.GeoDataFrame(complete_count, crs = crs)

    # convert complete_routes to gdf
    complete_routes_df = pd.DataFrame(complete_routes, columns = ['origin','destination','time','improved_time','geometry'])
    complete_routes_gdf = gpd.GeoDataFrame(complete_routes_df, crs = crs)

    return [complete_count_gdf, complete_routes_gdf]

### We will be calculating metrics for each origin and its group of destinations

In [None]:
OD_df_dict

In [None]:
for key in OD_df_dict:
    print(key)

In [None]:
#results = generate_complete_edges_and_routes(OD_df_dict[3935302581])

### Here we loop through each origin and its group of destinations and run the generate_complete_edges_and_routes using the appropriate speed dictionary

In [None]:
import time

# Run the edge/route tabulation once per origin, keyed by the origin node,
# and report the total elapsed time in seconds.
start = time.time()

results = {}
for key, od_frame in OD_df_dict.items():
    results[key] = generate_complete_edges_and_routes(od_frame, mean_speed_dict = mapbox_mean_speeds)

# number of origins processed
count = len(results)

print(time.time() - start)

### Save results into shapefiles

In [None]:
# Write one edges shapefile and one routes shapefile per origin.
# Create the output folders first so to_file does not fail on a fresh checkout.
os.makedirs("./output_edges", exist_ok = True)
os.makedirs("./output_routes", exist_ok = True)

for key, (edges_gdf, routes_gdf) in results.items():
    # NOTE: the shapefile format limits field names to 10 characters, so long column
    # names are truncated on write (e.g. 'weighted_sec_saved' is read back as 'weighted_s').
    file_name = f"./output_edges/cities_weighted_sec_saved_edges_{key}.shp"
    edges_gdf.to_file(driver = 'ESRI Shapefile', filename = file_name)
    routes_file_name = f"./output_routes/cities_weighted_sec_saved_routes_{key}.shp"
    routes_gdf.to_file(driver = 'ESRI Shapefile', filename = routes_file_name)

### Read the saved shapefiles for the edges back in as one GeoDataFrame

In [None]:
import glob
path = r'./output_edges' # use your path
# glob returns matches in arbitrary order; sort them so the merged rows come out
# in the same order on every run and every file system.
# NOTE: every .shp in the folder is merged, including files left over from earlier runs.
all_files = sorted(glob.glob(path + "/*.shp"))

# one GeoDataFrame per per-origin edges shapefile
li = [gpd.read_file(filename) for filename in all_files]

study_area = gpd.GeoDataFrame(pd.concat( li, ignore_index=True))

In [None]:
# Label the merged edges with the CRS used when snapping (epsg:32644). This only sets the
# CRS attribute; it does not reproject the geometries.
# NOTE(review): generate_complete_edges_and_routes builds its outputs with crs 'epsg:4326',
# so this overrides the CRS read from disk; newer geopandas versions recommend
# set_crs(..., allow_override=True) for that -- confirm against the installed version.
study_area.crs = 'epsg:32644'

### Read the saved shapefiles for the Routes back in as one GeoDataFrame

In [None]:
path = r'./output_routes' # use your path
# glob returns matches in arbitrary order; sort them so the merged rows come out
# in the same order on every run and every file system.
# NOTE: every .shp in the folder is merged, including files left over from earlier runs.
all_files = sorted(glob.glob(path + "/*.shp"))

# one GeoDataFrame per per-origin routes shapefile
li = [gpd.read_file(filename) for filename in all_files]

study_area_routes = gpd.GeoDataFrame(pd.concat( li, ignore_index=True))

In [None]:
# Label the merged routes with the CRS used when snapping (epsg:32644). This only sets the
# CRS attribute; it does not reproject the geometries.
# NOTE(review): generate_complete_edges_and_routes builds its outputs with crs 'epsg:4326',
# so this overrides the CRS read from disk; newer geopandas versions recommend
# set_crs(..., allow_override=True) for that -- confirm against the installed version.
study_area_routes.crs = 'epsg:32644'

In [None]:
# save merged routes
# create the output folder first so to_file does not fail when it is missing
os.makedirs("./output_merged", exist_ok = True)
study_area_routes.to_file(driver = 'ESRI Shapefile', filename = "./output_merged/post_step3_merged_results_routes.shp")

### The edges have varying lengths, so we can divide the weighted seconds by length (in km), in order to compare them 

In [None]:
# 'weighted_s' is the 'weighted_sec_saved' column: its name was truncated to the
# 10-character shapefile field-name limit when the edges were written to disk.
# NOTE: edges with a length of 0 produce inf/NaN here.
study_area['w_sec_per_km'] = study_area['weighted_s'] / study_area['length']

In [None]:
study_area[:5]

In [None]:
# rank the edges from the highest to the lowest weighted seconds saved per km
sorted_study_area = study_area.sort_values(by=['w_sec_per_km'], ascending=False)

In [None]:
# let's also add the time in min
# (divides 'time' by 60, i.e. assumes the edge 'time' attribute is in seconds)
sorted_study_area['time_min'] = sorted_study_area['time'] / 60

In [None]:
sorted_study_area[:5]

### Save the merged edges into a shapefile

In [None]:
# create the output folder first so to_file does not fail when it is missing
os.makedirs("./output_merged", exist_ok = True)
sorted_study_area.to_file(driver = 'ESRI Shapefile', filename = "./output_merged/post_step3_merged_results_edges.shp")

### We can generate some other metrics

In [None]:
# generate a subset for all edges that have a positive seconds saved if improved
# (uses the unweighted per-edge 'sec_saved', not the traversal-weighted value)
positive_subset = sorted_study_area.loc[sorted_study_area['sec_saved'] > 0]

In [None]:
positive_subset['sec_saved'].sum()

In [None]:
positive_subset['length'].sum()

In [None]:
positive_subset['imp_cost'].sum()

In [None]:
# generate a subset for all edges that would save more than 20 'weighted' seconds
# per km if improved (strictly greater than 20, on the per-km value)
w_sec_per_km_greater_than_20 = sorted_study_area.loc[sorted_study_area['w_sec_per_km'] > 20]

In [None]:
w_sec_per_km_greater_than_20['sec_saved'].sum()

In [None]:
w_sec_per_km_greater_than_20['length'].sum()

In [None]:
w_sec_per_km_greater_than_20['imp_cost'].sum()