In [1]:
import os

import geopandas
import numpy as np
import pandas as pd

# Load the data

In [2]:
# Both layers are read from GeoJSON here.  Shapefile versions live in the same directory, and any other
# OGR-readable vector format could be loaded the same way.

watersheds_url = 'https://raw.githubusercontent.com/thomaspingel/geodata/master/roanoke_watershed/watersheds.geojson'
points_url = 'https://raw.githubusercontent.com/thomaspingel/geodata/master/roanoke_watershed/storm_points.geojson'

watersheds = geopandas.read_file(watersheds_url)
gdf = geopandas.read_file(points_url)

In [3]:
# Preview the first three rows of the watershed polygon layer
watersheds.head(3)

Unnamed: 0,Watershed,TMDL_Water,Phase,Area_Ac,TIA_Ac,TIA_Pct,GlobalID,geometry
0,Roanoke River above Masons Creek,Roanoke River 2,,40530.632918,0.0,0.0,a6b2fd42-5f46-4486-9101-77d3554abd41,"POLYGON ((11024671.733 3652589.891, 11024699.5..."
1,Back Creek,Back Creek,,37561.660519,0.0,0.0,abbb57b8-7d38-4ed4-b70d-d41ed83ea617,"POLYGON ((11086384.494 3616511.612, 11086405.9..."
2,Barnhardt Creek,"Mud Lick Creek, Murray Run, and Ore Branch",,2630.613126,0.0,0.0,4e65165c-536f-4482-b9a1-ba88363f735a,"POLYGON ((11041922.519 3625650.977, 11041777.2..."


In [4]:
# Preview the first ten storm points; look for rows with a UNIQUEID of 0 and an empty WATERSHED
gdf.head(10)

Unnamed: 0,UNIQUEID,WATERSHED,geometry
0,1001,Back Creek,POINT (11088195.428 3611249.643)
1,1002,Back Creek,POINT (11025054.510 3597777.981)
2,1003,Carvin Creek,POINT (11058512.062 3680230.599)
3,1004,Roanoke River above Masons Creek,POINT (11012522.583 3639462.329)
4,1005,Roanoke River above Masons Creek,POINT (11018717.113 3637251.922)
5,0,,POINT (11064369.069 3643759.987)
6,0,,POINT (11031997.087 3641956.674)
7,0,,POINT (11039318.002 3643422.669)
8,0,,POINT (11054564.431 3675578.631)
9,0,,POINT (11051263.146 3586817.029)


# Add IDs

In [5]:
# Find the largest UNIQUEID currently in use.  Check it against the table above -- does it look right?

max_id = gdf['UNIQUEID'].max()
print(max_id)

1005


In [6]:
# Flag the rows that still need an ID: a UNIQUEID equal to zero marks a missing value.
# Inspect the mask below.  Does it seem correct?

idx = gdf['UNIQUEID'].eq(0)
print(idx)

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
9     True
Name: UNIQUEID, dtype: bool


In [7]:
# Count the flagged rows.  Summing a boolean mask counts its True values (for the computer, True is 1).
# Inspect the output.  Does it seem correct?  What would have gone wrong if we had used len(idx) instead?

n = idx.sum()
print(n)

5


In [8]:
# Generate one new ID per flagged row, starting one above the current highest ID.
# Why does the range stop at max_id + n + 1 rather than max_id + n?  (Hint: np.arange leaves out its stop value.)
# Inspect the output.  Does it make sense?

first_new_id = max_id + 1
new_ids = np.arange(first_new_id, first_new_id + n)
print(new_ids)

[1006 1007 1008 1009 1010]


In [9]:
# Set the UNIQUEID for rows in which idx is True to the new_ids we just generated.
# The boolean mask picks the rows and 'UNIQUEID' picks the column, so only the flagged cells are overwritten.
# Inspect the output.  Does it make sense?

gdf.loc[idx,'UNIQUEID'] = new_ids
gdf.head(10)

Unnamed: 0,UNIQUEID,WATERSHED,geometry
0,1001,Back Creek,POINT (11088195.428 3611249.643)
1,1002,Back Creek,POINT (11025054.510 3597777.981)
2,1003,Carvin Creek,POINT (11058512.062 3680230.599)
3,1004,Roanoke River above Masons Creek,POINT (11012522.583 3639462.329)
4,1005,Roanoke River above Masons Creek,POINT (11018717.113 3637251.922)
5,1006,,POINT (11064369.069 3643759.987)
6,1007,,POINT (11031997.087 3641956.674)
7,1008,,POINT (11039318.002 3643422.669)
8,1009,,POINT (11054564.431 3675578.631)
9,1010,,POINT (11051263.146 3586817.029)


# Add Watersheds

In [10]:
# Flag the rows where WATERSHED is null.  Why can't we just reuse the mask generated earlier?  Because nothing
# guarantees that the rows missing a watershed are exactly the rows that were missing a UNIQUEID.  The two
# gaps can occur independently of each other.

# Notice that idx is deliberately overwritten here.  Why do that?  The old mask is no longer needed, and
# keeping the number of live variables small makes the code easier to follow.
# https://techbeacon.com/app-dev-testing/why-unnecessary-variables-are-bad-your-code

# Inspect the output.  Does it make sense?

idx = gdf['WATERSHED'].isna()
print(idx)

0    False
1    False
2    False
3    False
4    False
5     True
6     True
7     True
8     True
9     True
Name: WATERSHED, dtype: bool


In [11]:
# From gdf, pull out only the points with a missing watershed
missing_df = gdf[idx]

# Spatially join those points to the watershed layer.  Notice that "WATERSHED" is the column from the point
# data, and "Watershed" comes from the polygonal boundaries layer.  how='left' keeps every point, including
# any that fall outside all of the polygons.
joined_df = geopandas.sjoin(missing_df,watersheds,how='left')

# A point that matches more than one polygon (overlapping or shared boundaries) comes back once per match,
# each time under the same index label.  Keep only the first match per point so the result still lines up
# one-to-one with the rows of gdf when the names are written back.
joined_df = joined_df[~joined_df.index.duplicated(keep='first')]

# Inspect the output!  Does this look correct?
joined_df

Unnamed: 0,UNIQUEID,WATERSHED,geometry,index_right,Watershed,TMDL_Water,Phase,Area_Ac,TIA_Ac,TIA_Pct,GlobalID
5,1006,,POINT (11064369.069 3643759.987),3,Carvin Creek,Carvin Creek,III,14830.576649,0.0,0.0,01525076-e9f5-45be-a686-5a5f664a3835
6,1007,,POINT (11031997.087 3641956.674),7,Mason Creek,Mason Creek,,18922.360268,0.0,0.0,2fffb4ea-8287-4b17-b6ef-6f4f751fea89
7,1008,,POINT (11039318.002 3643422.669),12,Peters Creek,Peters Creek,V,5784.105221,0.0,0.0,83d0497e-b346-4c88-aa0a-9efbd0f8055f
8,1009,,POINT (11054564.431 3675578.631),3,Carvin Creek,Carvin Creek,III,14830.576649,0.0,0.0,01525076-e9f5-45be-a686-5a5f664a3835
9,1010,,POINT (11051263.146 3586817.029),1,Back Creek,Back Creek,,37561.660519,0.0,0.0,abbb57b8-7d38-4ed4-b70d-d41ed83ea617


In [12]:
# Now that that's done on our "temporary" dataframe, join it back to the main dataframe.
# joined_df kept the row labels of the points it was built from (5-9 in the table above), and the
# assignment below matches values to rows by those labels.
# We could have done this without that temp dataframe.  How could we do it?

# Inspect the output.  Does it look correct?

gdf.loc[idx,'WATERSHED'] = joined_df['Watershed']
gdf

Unnamed: 0,UNIQUEID,WATERSHED,geometry
0,1001,Back Creek,POINT (11088195.428 3611249.643)
1,1002,Back Creek,POINT (11025054.510 3597777.981)
2,1003,Carvin Creek,POINT (11058512.062 3680230.599)
3,1004,Roanoke River above Masons Creek,POINT (11012522.583 3639462.329)
4,1005,Roanoke River above Masons Creek,POINT (11018717.113 3637251.922)
5,1006,Carvin Creek,POINT (11064369.069 3643759.987)
6,1007,Mason Creek,POINT (11031997.087 3641956.674)
7,1008,Peters Creek,POINT (11039318.002 3643422.669)
8,1009,Carvin Creek,POINT (11054564.431 3675578.631)
9,1010,Back Creek,POINT (11051263.146 3586817.029)


# Write out the data

In [13]:
# We might really just overwrite the data, but for clarity in pedagogy we'll write out a separate file

# Make sure the output folder exists first; writing into a directory that does not exist fails.
os.makedirs('out', exist_ok=True)

# Write the same data twice: once as a shapefile and once as GeoJSON
gdf.to_file('out/storm_points_updated.shp')
gdf.to_file('out/storm_points_updated.geojson',driver='GeoJSON')

# Can I see that all at once?

It's nice to break things up step-by-step and inspect the output as you go.  But too many code blocks can make it hard to see how all the pieces fit together.  So let's see it again, all at once.

In [14]:
# Load data
watersheds = geopandas.read_file('https://raw.githubusercontent.com/thomaspingel/geodata/master/roanoke_watershed/watersheds.geojson')
gdf = geopandas.read_file('https://raw.githubusercontent.com/thomaspingel/geodata/master/roanoke_watershed/storm_points.geojson')

# Fix missing UNIQUEIDs: rows with an ID of 0 get fresh IDs that continue on from the current maximum
max_id = np.max(gdf['UNIQUEID'])
idx = gdf['UNIQUEID'] == 0
n = np.sum(idx)
new_ids = np.arange(max_id+1,max_id+n+1)  # arange excludes the stop value, hence the extra +1
gdf.loc[idx,'UNIQUEID'] = new_ids

# Fix missing WATERSHED data: spatially join the unlabeled points to the watershed polygons
idx = gdf['WATERSHED'].isnull()
missing_df = gdf[idx]
joined_df = geopandas.sjoin(missing_df,watersheds,how='left')
# A point matching several polygons is returned once per match under the same index label; keep the first
# match so the write-back below sees exactly one value per point.
joined_df = joined_df[~joined_df.index.duplicated(keep='first')]
gdf.loc[idx,'WATERSHED'] = joined_df['Watershed']

# Write out the data (relative path, so the file lands in the current working directory)
gdf.to_file('storm_points_updated.shp')