In [1]:
import json;
import copy

In [63]:
def checkIfNameExistsInData(name, data):
    nameCounter = 0
    for feature in data['features']:
        if feature['properties']['name'] == name:
            nameCounter += 1
            # print('Found ', feature['properties']['name'])
    return True if nameCounter > 0 else False

'''
check to see if feature names match a given name,
if they do, add their index in features to a list
'''
def getIndicesOfDups(name, data):
    lst = []
    for i, feature in enumerate(data['features']):
        if feature['properties']['name'] == name:
            lst.append(i)
    return lst
'''
create a dict (hashmap) that stores the name as key
and the value as an array of indices for features with
the same name
'''
def createMultiplesDict(data):
    multiplesDict = {}
    indexList = []
    # enumerate lets you iterate through a list
    # and access item and index
    for i, feature in enumerate(data['features']):
        name = feature['properties']['name']
        # if feature name already exists in dict
        # add the feature index to the list
        if name in multiplesDict:
            multiplesDict[name].append(i)
        # otherwise create the key (name)
        # value (index list) pair
        else:
            multiplesDict[name] = [i]
    return multiplesDict

'''
create brute force unique properties dict for each name
'''
def createUniquePropertiesDict(multi_dict, data):
    propertiesDict = {}
    # iterate over keys in multi_dict for unique names
    for key in multi_dict:
        # do a brute force approach and choose props from first index
        # get first index out of multi_dict
        ind = multi_dict[key][0]
        # set properties dict to properties of first index at unique name
        propertiesDict[key] = data['features'][ind]['properties']

    return propertiesDict    

'''
helper function to make sure all geometry is
converted to MultiLineStrings so as to compare
them more easily
'''
def insertCoordMultiLineString(geom):
    if geom['type'] == 'LineString':
        coordList = [geom['coordinates']]
    elif geom['type'] == 'MultiLineString':
        coordList = geom['coordinates']
    return coordList

'''
from the dict (hashmap) of feature names and lists of
duplicate indices, create another dict of names and
merged MultiLineString geometry - doesn't need to be sorted
since draw order doesn't matter?
'''
def createMultiLineStrings(multiDict, data):
    unsorted_LineString_dct = {}
    for key in multiDict:
        # only get the ones with multiple features
#         if len(multiDict[key]) > 1:
        for index in multiDict[key]:
            geom = data['features'][index]['geometry']
            if key in unsorted_LineString_dct:
                # add new multiLineString at key
                unsorted_LineString_dct[key] += insertCoordMultiLineString(geom)
            else:
                unsorted_LineString_dct[key] = insertCoordMultiLineString(geom)
              
    return unsorted_LineString_dct

'''
attempt to sort MultiLineStrings geometry for the same feature name
compare first and last coordinates of LineStrings to decide where
to insert LineString
'''
def getSortedGeometry(unsorted_LineString_dct):
    # create a new dict to store the sorted geometry
    sorted_LineString_dct = {}
    # iterate through the unsorted dict of names / geometry
    for key in unsorted_LineString_dct:
        # create a counter to keep track of how long we've
        # been in the while loop - to break out of the sorting
        count = 0
        # only look at the names with multiple features
        while len(unsorted_LineString_dct[key]) > 0:
            if count > 9999:
#                 print("these keys are already sorted")
#                 print(sorted_LineString_dct.keys(), key)
#                 print("this is the thing we are going to compare:")
#                 print(unsorted_LineString_dct[key].pop(-1))
#                 print(key, " still unsorted")
#                 print("Within the key, these are already sorted")
#                 print(sorted_LineString_dct[key])
#                 print("length: ", len(sorted_LineString_dct[key]))
#                 print("Within the key these are not sorted yet")
#                 print(unsorted_LineString_dct[key])
#                 print("length: ", len(unsorted_LineString_dct[key]))
                break
                
            # get the last item in the multiLineStringDict structure (instead of iterating)
            geom = unsorted_LineString_dct[key].pop(-1)
            # easiest and first case - key hasn't been added to sorted list yet
            # add geometry at new key
            if key not in sorted_LineString_dct:
                sorted_LineString_dct[key] = [geom]
            # otherwise figure out where to put lineString compared to other
            # lineStrings already at key
            else:
                # boolean to keep track of whether lineStrings overlap
                found = False
                # iterate through the lineStrings that are already sorted
                for i, lineString in enumerate(sorted_LineString_dct[key]):
                    firstLat = lineString[0][0] # get latitude of first coord
                    firstLon = lineString[0][1] # get longitude of first coord
                    lastLat = lineString[-1][0] # get latitude of last coord
                    lastLon = lineString[-1][1] # get longitude of last coord
                    
                    # get the absolute value of the difference between the coordinate pairs
                    # to see if coords are near each other (but don't exactly overlap)
                    diffComparator = 0.01
                    if abs(firstLat - geom[-1][0]) < diffComparator and abs(firstLon - geom[-1][1]) < diffComparator:
                        # last coordinate is the same as the first coordinate in another lineString that's been sorted
                        # place geom before current lineString
                        sorted_LineString_dct[key].insert(i, geom)
                        found = True
                        break # make sure to break once a match is found
                    elif abs(lastLat - geom[0][0]) < diffComparator and abs(lastLon - geom[0][1]) < diffComparator:
                        # first coordinate is the same as the last coordinate in another lineString that's been sorted
                        # place geom after current lineString
                        sorted_LineString_dct[key].insert(i+1, geom)
                        found = True
                        break
                    elif abs(firstLat - geom[0][0]) < diffComparator and abs(firstLon - geom[0][1]) < diffComparator:
                        # first coordinate is the same as the first coordinate in another linestring
                        # reverse second linestring before merging
                        reversedLineString = list(reversed(geom))
                        sorted_LineString_dct[key].insert(i, reversedLineString)
                        found = True
                        break
                    elif abs(lastLat - geom[-1][0]) < diffComparator and abs(lastLon - geom[-1][1]) < diffComparator:
                        # last coordinate is the same as the last coordinate in another linestring
                        # reverse second linestring before merging
                        reversedLineString = list(reversed(geom))
                        sorted_LineString_dct[key].insert(i+1, reversedLineString)
                        found = True
                        break
                if found == False:
                    # if it's not a match with any other coordinates
                    # put it back into the unsorted dict to try again
                    unsorted_LineString_dct[key].insert(0, geom)
            count += 1
    return sorted_LineString_dct,unsorted_LineString_dct

'''
create a function to merge properties of original data into 
merged geometry for duplicate names
'''
def createGeoJson(merged_Geom_Dct, props_dct):
    # create final output object structure
    finalGeoJson = {};
    finalGeoJson['type'] = 'FeatureCollection'
    finalGeoJson['features'] = []
    
    # add merged geom with correct properties to final features
    # iterate through unique keys dict
    for key in merged_Geom_Dct:
        # create object to hold feature data
        feature = {}
        props = {}
        geometry = {}
        # get geometry first
        geometry['type'] = 'MultiLineString'
        geometry['coordinates'] = merged_Geom_Dct[key]
        
        # get properties from properties dict
#         props = props_dct[key]
        props['name'] = props_dct[key]['name']
        props['name_alt'] = props_dct[key]['name_alt']
        props['scalerank'] = props_dct[key]['scalerank']
        
        feature['type'] = 'Feature'
        feature['geometry'] = geometry
        feature['properties'] = props
        
        # add feature to features list
        finalGeoJson['features'].append(feature)
    
    return finalGeoJson
        

In [64]:
with open('data/ne_50m_rivers_lake_centerlines.json', 'rb') as f:
    string = f.read()
    data = json.loads(string)

print(len(data['features']), " features in original data")    

multiples = createMultiplesDict(data)
print(len(multiples.keys()), " features with multiples - indices")

properties = createUniquePropertiesDict(multiples, data)
print(properties['Colorado'])

mLineStrings = createMultiLineStrings(multiples, data)
print(len(mLineStrings.keys()), " features with multiple linestrings")

# deep copy is used to ensure that you're not altering the unsorted dict
c_mLineStrings = copy.deepcopy(mLineStrings)

merge_sort_dct,unsorted_dct = getSortedGeometry(c_mLineStrings)

print(len(merge_sort_dct.keys()), " features with sorted linestrings")
print(len(unsorted_dct.keys()), " features with unsorted linestrings - most likely empty")

# print(merge_sort_dct["SudColorado"])

finalOutput = createGeoJson(merge_sort_dct, properties)
print(len(finalOutput['features']), " final output features")

with open('data/merged_rivers.json', 'w') as f:
  json.dump(finalOutput, f, sort_keys = True, indent = 4, ensure_ascii=False)

# for k,v in merge_sort_dct.items():
#     print(k, len(v))

462  features in original data
344  features with multiples - indices
{'scalerank': 5, 'featurecla': 'Lake Centerline', 'name': 'Colorado', 'note': '', 'min_zoom': 4.7, 'name_alt': '', 'name_en': 'Colorado', 'min_label': 5.7}
344  features with multiple linestrings
344  features with sorted linestrings
344  features with unsorted linestrings - most likely empty
344  final output features


In [4]:
alabama_unsorted = mLineStrings["Alabama"]
print(len(alabama_unsorted))

print("Linestrings before sorting anything")
# for ls in alabama_unsorted:
#     print(ls)

print(" ")
    
alabama_sorted = []
count = 0
while len(alabama_unsorted) > 0:
    print("length unsorted: ", len(alabama_unsorted))
    if count > 9:
        print("Failed!")
        print("length should be 5: ", len(alabama_sorted) + len(alabama_unsorted))
        print("These are the sorted LineStrings")
        for lString in alabama_sorted:
            print(lString)
        print(alabama_sorted)
        print("These are the unsorted LineStrings")
        for lString in alabama_unsorted:
            print(lString)
        break

    # get the last item in the multiLineStringDict structure (instead of iterating)
    geom = alabama_unsorted.pop(-1)
    # easiest case - 
    if len(alabama_sorted) == 0:
        alabama_sorted += [geom]
    else:
        found = False
        for i, lineString in enumerate(alabama_sorted):
            #print(lineString)
            firstLat = lineString[0][0] # get latitude of first coord
            firstLon = lineString[0][1] # get longitude of first coord
            lastLat = lineString[-1][0] # get latitude of last coord
            lastLon = lineString[-1][1] # get longitude of last coord

            diffComparator = 0.01
            if abs(firstLat - geom[-1][0]) < diffComparator and abs(firstLon - geom[-1][1]) < diffComparator:
                # first coordinate is the same as the last coordinate in another linestring
                # place geom before current coordinate list
                alabama_sorted.insert(i, geom)
                found = True
                break
            elif abs(lastLat - geom[0][0]) < diffComparator and abs(lastLon - geom[0][1]) < diffComparator:
                # last coordinate is the same as the first coordinate in another linestring
                # place geom after current coordinate list
                alabama_sorted.insert(i+1, geom)
                found = True
                break
            elif abs(firstLat - geom[0][0]) < diffComparator and abs(firstLon - geom[0][1]) < diffComparator:
                # first coordinate is the same as the first coordinate in another linestring
                # reverse second linestring before merging
                reversedLineString = list(reversed(geom))
                alabama_sorted.insert(i, reversedLineString)
                found = True
                break
            elif abs(lastLat - geom[-1][0]) < diffComparator and abs(lastLon - geom[-1][1]) < diffComparator:
                # last coordinate is the same as the last coordinate in another linestring
                # reverse second linestring before merging
                reversedLineString = list(reversed(geom))
                alabama_sorted.insert(i+1, reversedLineString)
                found = True
                break
        if found == False:
            alabama_unsorted.insert(0, geom)
    count += 1


5
Linestrings before sorting anything
 
length unsorted:  5
length unsorted:  4
length unsorted:  4
length unsorted:  4
length unsorted:  3
length unsorted:  3
length unsorted:  2
length unsorted:  2
length unsorted:  1


In [5]:
print(alabama_sorted)

[[[-85.18262447803721, 34.25945954052543], [-85.50128719774597, 34.20917837208424], [-85.6570916413661, 34.20457916899352], [-85.70279944513823, 34.20274465634088], [-85.72667395703746, 34.192150987078435], [-85.73328853993411, 34.17357331026011], [-85.70698523617966, 34.15414297092144], [-85.70538326662444, 34.13352407497615], [-85.7339603344019, 34.11244009013808], [-85.77121904172537, 34.106988227224974], [-85.8128702456653, 34.112569281347305], [-85.83803666875724, 34.09988271701383], [-85.84547807485325, 34.07590485322646], [-85.86284135574391, 34.05771474823712], [-85.88103146073331, 34.05771474823712], [-85.89774878647728, 34.05024750461871], [-85.90087521032268, 34.03208323805103], [-85.92162329747714, 34.010456651753145], [-85.9496577619961, 34.00975901976301], [-85.96454057508743, 33.991568915672985], [-85.96671098542362, 33.966686713421154], [-85.98394507510511, 33.94415578865875], [-86.03277930362191, 33.927903550908184], [-86.05481930196919, 33.88506378856306], [-86.046241