### Step 2 - Network Cleaning

Having extracted the raw network from the .osm.pbf, we now topologically clean and simplify the network, following the standard GOSTnets process.

Import the usual suspects

In [33]:
import geopandas as gpd
import os, sys, time
import pandas as pd
sys.path.append(r'C:\Users\charl\Documents\GitHub\GOST_PublicGoods\GOSTNets\GOSTNets')
import GOSTnet as gn
import importlib
importlib.reload(gn)
import networkx as nx
import osmnx as ox
importlib.reload(ox)
from shapely.ops import unary_union
from shapely.wkt import loads
from shapely.geometry import LineString, MultiLineString, Point

peartree version: 0.6.1 
networkx version: 2.3 
matplotlib version: 3.0.3 
osmnx version: 0.9 


### Full Process

This read-in process is specific to Yemen, which used the originally developed 'extract from a combo .csv' process. Newer implementations (see: Sierra Leone, GOSTnets examples folder) read in directly from a saved gpickle. This is the preferred technique, as it saves a lot of faff. Nonetheless, the below works.

In [34]:
def InitialReadIn(fpath, country, wpath = None):
    """Build a networkx MultiDiGraph from a '<country>_combo.csv' edge table.

    The CSV is read from ``fpath`` and must contain the columns ``u``, ``v``,
    ``Wkt``, ``id``, ``infra_type``, ``osm_id``, ``country``, ``key`` and
    ``length``. Node labels in ``u`` / ``v`` are parsed as strings of the form
    '(x, y)': the first and last characters are stripped and the remainder is
    split on ',' to obtain the coordinates.

    As a side effect, the pre-processing node and edge tables are written as
    CSV files into ``wpath``.

    Parameters
    ----------
    fpath : str
        Folder containing the '<country>_combo.csv' file.
    country : str
        Country code used to build the input and output file names.
    wpath : str, optional
        Folder the pre-processing CSVs are written to. Defaults to
        ``os.path.join(fpath, 'output')``; the folder is created if missing.

    Returns
    -------
    networkx.MultiDiGraph
        Graph with integer node labels, 'x' / 'y' node attributes and the
        retained edge attributes (geometry stored under 'Wkt').
    """
    # The write path used to be picked up from a module-level global; make it an
    # explicit (optional) argument so the function works wherever it is called.
    if wpath is None:
        wpath = os.path.join(fpath, 'output')
    os.makedirs(wpath, exist_ok = True)

    # define the precis filename
    ffile = r'%s_combo.csv' % country

    # read in the combo dataframe
    edges = pd.read_csv(os.path.join(fpath, ffile))

    # the list of nodes will be all the unique values in the u and v columns, by definition (start and end of each edge)
    node_bunch = list(set(list(edges['u']) + list(edges['v'])))

    # Convert a row into an edge that a nx graph will accept (i.e. in format (u, v, data))
    def convert(x):
        # the attributes in the DataFrame we want to retain on the edges in the graph;
        # the WKT string is parsed into a shapely geometry here
        data = {'Wkt': loads(x['Wkt']),
                'id': x['id'],
                'infra_type': x['infra_type'],
                'osm_id': x['osm_id'],
                'country': x['country'],
                'key': x['key'],
                'length': x['length']}

        # return an object in the correct edge format
        return (x['u'], x['v'], data)

    # apply this function to every row in the edges DF
    edge_bunch = edges.apply(convert, axis = 1).tolist()

    # open a blank multidigraph
    G = nx.MultiDiGraph()

    # add the nodes first, then the edges
    G.add_nodes_from(node_bunch)
    G.add_edges_from(edge_bunch)

    # for each node, split out the x and y coordinates from its '(x, y)' label
    for u, data in G.nodes(data = True):
        q = tuple(float(x) for x in u[1:-1].split(','))
        data['x'] = q[0]
        data['y'] = q[1]

    # relabel node IDs as integers for simplicity's sake
    G = nx.convert_node_labels_to_integers(G)

    # make a GeoDataFrame of the nodes, save down
    gdfnodes = gn.node_gdf_from_graph(G)
    gdfnodes.to_csv(os.path.join(wpath, '%s_pre_processing_nodes.csv' % country))

    # make a GeoDataFrame of the edges, save down
    gdfedges = gn.edge_gdf_from_graph(G, geom_col = 'Wkt')
    gdfedges.to_csv(os.path.join(wpath, '%s_pre_processing_edges.csv' % country))

    # we shouldn't have lost any edges along the way. Check this here!
    print('These two should equal: A) length of final df: %s | B) length of original df: %s' % (len(gdfedges), len(edges)))

    return G

Having loaded the raw data into a nx MultiDiGraph type object, we can now proceed with the standard GOSTnets cleaning process

In [35]:
def CleanNetwork(G, wpath, country, UTM, WGS = None, junctdist = 50, verbose = False):
    """Run the GOSTnets cleaning sequence on a raw graph and save the result.

    The steps, in order: collapse junctions, add missing reflected edges,
    simplify, unbundle list-valued geometries, convert to a MultiDiGraph,
    remove duplicate edges, simplify again and add reflected edges again.
    The final graph is saved to ``wpath`` as '<country>_processed'.

    Parameters
    ----------
    G : graph
        Raw graph, e.g. as returned by InitialReadIn.
    wpath : str
        Folder that intermediate (if ``verbose``) and final graphs are saved to.
    country : str
        Country code used in the name of the saved output.
    UTM : dict
        Projected CRS passed to ``gn.simplify_junctions``.
    WGS : dict, optional
        Base CRS passed to ``gn.simplify_junctions``. Defaults to
        ``{'init': 'epsg:4326'}``.
    junctdist : number, optional
        Snapping tolerance passed to ``gn.simplify_junctions``.
    verbose : bool, optional
        If truthy, also save the intermediate graphs 'a' and 'b'.

    Returns
    -------
    graph
        The cleaned graph.
    """
    # Build the default per call rather than sharing one mutable dict
    # between every invocation of the function.
    if WGS is None:
        WGS = {'init': 'epsg:4326'}

    # Record the starting size up front so the summary below does not depend on
    # whether any of the cleaning steps modify G in place.
    edges_before = G.number_of_edges()

    # Squeezes clusters of nodes down to a single node if they are within the snapping tolerance
    a = gn.simplify_junctions(G, UTM, WGS, junctdist)

    # ensures all streets are two-way
    a = gn.add_missing_reflected_edges(a)

    # save progress
    if verbose:
        gn.save(a, 'a', wpath)

    # Finds and deletes interstitial nodes based on node degree
    b = gn.custom_simplify(a)

    # rectify geometry: edges whose 'Wkt' is a list of geometries get unbundled
    for u, v, data in b.edges(data = True):
        if isinstance(data['Wkt'], list):
            data['Wkt'] = gn.unbundle_geometry(data['Wkt'])

    # save progress
    if verbose:
        gn.save(b, 'b', wpath)

    # For some reason CustomSimplify doesn't return a MultiDiGraph. Fix that here
    c = gn.convert_to_MultiDiGraph(b)

    # This is the most controversial function - removes duplicated edges. This takes care of two-lane but separate highways, BUT
    # destroys internal loops within roads. Can be run with or without this line
    c = gn.remove_duplicate_edges(c)

    # Run this again after removing duplicated edges
    c = gn.custom_simplify(c)

    # Ensure all remaining edges are duplicated (two-way streets)
    c = gn.add_missing_reflected_edges(c)

    # save final
    gn.save(c, '%s_processed' % country, wpath)

    # Guard against an empty input graph, which would otherwise divide by zero.
    # A negative percentage means the edge count grew (reflected edges are added).
    edges_after = c.number_of_edges()
    if edges_before:
        pct = (edges_before - edges_after) / edges_before * 100
    else:
        pct = 0

    print('Edge reduction: %s to %s (%d percent)' % (edges_before, edges_after, pct))
    return c

The above two cells define our two main processes - loading the graph and cleaning the graph. 

We go ahead and run this for Yemen below. Note that you can run multiple countries at a time, if needed, using this loop structure.

In [36]:
# Driver: read in and clean the network for every country listed below.

# set UTM zone (EPSG code) for measuring distances (relevant for junction collapsing)
UTMZs = {'YEM':32638}

# set the base projection
WGS = {'init': 'epsg:4326'}

# add as many countries as you have networks to be cleaned; here, we have 1
countries = ['YEM']

# set the filepath
fpath = r'C:\Users\charl\Documents\GOST\Yemen\YEM\Round 3'

# set a write path for the outputs, and make it if it doesn't already exist.
# The path is the same for every country, so it is built once, before the loop.
# makedirs(exist_ok=True) avoids the check-then-create race and also creates
# any missing parent folders.
wpath = os.path.join(fpath, r'output' )
os.makedirs(wpath, exist_ok = True)

# work through the countries list
for country in countries:

    print('\n--- processing for: %s ---\n' % country)
    print('start: %s\n' % time.ctime())

    print('Outputs can be found at: %s\n' % (wpath))

    # make a dict in the format GeoPandas will accept for reprojection
    UTM = {'init': 'epsg:%d' % UTMZs[country]}

    # read that graph in...
    G = InitialReadIn(fpath, country)

    #...and clean that graph!! (0.5 is the junction snapping tolerance, junctdist)
    G = CleanNetwork(G, wpath, country, UTM, WGS, 0.5, verbose = False)
    print('\nend: %s' % time.ctime())
    print('\n--- processing complete for: %s ---' % country)


--- processing for: YEM ---

start: Tue Apr 30 14:29:47 2019

Outputs can be found at: C:\Users\charl\Documents\GOST\Yemen\YEM\Round 3\output

These two should equal: A) length of final df: 154550 | B) length of original df: 154550
154538
308119
293625
147581
293608
Edge reduction: 154550 to 293608 (-89 percent)

end: Tue Apr 30 14:57:31 2019

--- processing complete for: YEM ---
