### Y2018M02D16_RH_Number_Streams_Per_Basin_V01

* Purpose of script: determine the number of streams per GDBD basin
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20180216

Approach:  
1. load line data  
1. buffer lines with tiny value
1. group lines based on overlap
1. Add group numbers to basin polygons
1. basins with more than 1 (>1) line groups are delta regions

Old approach:
[Strategy](https://gis.stackexchange.com/questions/132723/unsplit-dissolve-multiple-touching-lines-in-stream-network-using-arcgis-desktop)

1. Explode multilines into single lines 
1. Tiny buffer around single lines
1. Take Union
1. Spatial join single line geodataframe and dissolved ID's 
1. Aggregate using polyline ID from previous step






In [1]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2018M02D19 UTC 11:20


'3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:26:58) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
SCRIPT_NAME = "Y2018M02D16_RH_Number_Streams_Per_Basin_V01"

EC2_INPUT_PATH  = ("/volumes/data/{}/input/").format(SCRIPT_NAME)
EC2_OUTPUT_PATH = ("/volumes/data/{}/output/").format(SCRIPT_NAME)

S3_INPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2018M02D15_RH_GDBD_Merge_V01/output/"
S3_OUTPUT_PATH = "s3://wri-projects/Aqueduct30/processData/{}/output".format(SCRIPT_NAME)


INPUT_VERSION = 6
OUTPUT_VERSION = 1

TESTING = 1


In [None]:
!rm -r {EC2_INPUT_PATH}
!rm -r {EC2_OUTPUT_PATH}

!mkdir -p {EC2_INPUT_PATH}
!mkdir -p {EC2_OUTPUT_PATH}

In [None]:
!aws s3 cp {S3_INPUT_PATH} {EC2_INPUT_PATH} --recursive

In [3]:
import geopandas as gpd
import pandas as pd
import eeconvert
import folium


%matplotlib inline

In [4]:
file_name_streams = "{}GDBD_streams_EPSG4326_V{:02.0f}.shp".format(EC2_INPUT_PATH,INPUT_VERSION)
file_name_basins = "{}GDBD_basins_EPSG4326_V{:02.0f}.shp".format(EC2_INPUT_PATH,INPUT_VERSION)

In [5]:
gdf_streams = gpd.GeoDataFrame.from_file(file_name_streams)
gdf_basins = gpd.GeoDataFrame.from_file(file_name_basins)

In [6]:
gdf_streams_backup = gdf_streams.copy()
tiny_value = 0.00001

if TESTING:
    gdf_streams = gdf_streams[0:200]
    tiny_value = 0.1


In [7]:
gdf_streams['GDBD_ID'] = gdf_streams['GDBD_ID'].astype('int64')

In [8]:
gdf_streams.dtypes

OBJECTID        int64
GDBD_ID         int64
Ave_Str_Sl    float64
Shape_Leng    float64
geometry       object
dtype: object

## Functions 

In [9]:
def explode(gdf):
    """
    Will explode the geodataframe's muti-part geometries into single
    geometries. Each row containing a multi-part geometry will be split into
    multiple rows with single geometries, thereby increasing the vertical size
    of the geodataframe. The index of the input geodataframe is no longer
    unique and is replaced with a multi-index.

    The output geodataframe has an index based on two columns (multi-index)
    i.e. 'level_0' (index of input geodataframe) and 'level_1' which is a new
    zero-based index for each single part geometry per multi-part geometry
        
    Args:
        gdf (gpd.GeoDataFrame) : input geodataframe with multi-geometries
        
    Returns:
        gdf (gpd.GeoDataFrame) : exploded geodataframe with each single
                                 geometry as a separate entry in the
                                 geodataframe. The GeoDataFrame has a multi-
                                 index set to columns level_0 and level_1
        
    """
    gs = gdf.explode()
    gdf2 = gs.reset_index().rename(columns={0: 'geometry'})
    gdf_out = gdf2.merge(right=gdf.drop('geometry', axis=1),
                         left_on='level_0',
                         right_index=True)
    gdf_out = (gdf_out.set_index(['level_0', 'level_1'])
                      .set_geometry('geometry'))
    gdf_out.crs = gdf.crs
    return gdf_out


def group_geometry(gdf, buffer_value=0.01, out_column_name="geometry_group"):
    """
    Adds a column to the dataframe with a geometry group number. The
    group number is determined by overlapping or touching geometries.
    A geodataframe will be exploded before assigning the group number.
    If the input geodataframe contains multi-geometries, the shape will
    increase. This function can also be used to dissolve a geodataframe
    on intersecting geometries instead of attributes. Use this function
    followed by the .dissolve(by='geometry_group') method. 

    Args:
        gdf (gpd.GeoDataFrame)  : input geodataframe with multi-geometries.
        buffer_value (float)    : buffer distance in crs units. Defaults
                                  to 0.0001.
        out_column_name (string): name of output column containing the group
                                  number. Defaults to 'geometry_group'

    Returns:
        gdf (gpd.GeoDataFrame) : geodataframe with new index column
                                 and grouped geometries.
    """
    gdf_polygon = gdf.copy()
    gdf_polygon['geometry'] = gdf_polygon.geometry.buffer(buffer_value,
                                                          resolution=1)
    gdf_polygon["group"] = 1
    gdf_polygon_dissolved = gdf_polygon.dissolve(by="group")
    gdf_out = explode(gdf_polygon_dissolved)
    gdf_out = gdf_out.reset_index()
    gdf_grouped = gpd.GeoDataFrame(gdf_out["level_1"],
                                   geometry=gdf_out.geometry)
    gdf_grouped.rename(columns={'level_1': out_column_name},
                       inplace=True)

    gdf_exploded = explode(gdf)
    gdf_exploded = gdf_exploded.reset_index()
    gdf_exploded_out = gpd.sjoin(gdf_exploded,
                                 gdf_grouped,
                                 how="left",
                                 op='within')
    return gdf_exploded_out


def stream_groups_per_basin():
    pass

    

In [14]:
%%time
gdf_stream_groups = group_geometry(gdf_streams)

CPU times: user 240 ms, sys: 4 ms, total: 244 ms
Wall time: 243 ms


In [15]:
gdf_streams_simple = gpd.GeoDataFrame(gdf_stream_groups["geometry_group"],
                                      geometry=gdf_stream_groups.geometry)

In [43]:
gdf_streams_simple["geometry_group_copy"] = gdf_streams_simple["geometry_group"]

In [45]:
gdf_streams_grouped_simple = gdf_streams_simple.dissolve(by="geometry_group_copy")

In [46]:
gdf_streams_grouped_simple

Unnamed: 0_level_0,geometry,geometry_group
geometry_group_copy,Unnamed: 1_level_1,Unnamed: 2_level_1
0,(LINESTRING (32.23592856349511 30.880802110121...,0
1,LINESTRING (27.36131703823457 30.7937046138648...,1
2,LINESTRING (28.99847059899493 30.6683881323825...,2
3,LINESTRING (28.16840064225422 30.6357947070989...,3
4,(LINESTRING (32.56729025076923 30.174823195219...,4
5,LINESTRING (30.40176689316669 30.2761326669720...,5
6,(LINESTRING (31.91423985992394 30.131198343100...,6
7,LINESTRING (25.5572082631426 31.30988128546547...,7
8,LINESTRING (26.28007659743832 31.0197519763545...,8
9,(LINESTRING (29.97812733339701 30.763012436616...,9


In [47]:
%%time
gdf_test = gpd.sjoin(gdf_basins, gdf_streams_grouped_simple, how="left", op='intersects')

CPU times: user 7.41 s, sys: 4 ms, total: 7.41 s
Wall time: 7.41 s


In [48]:
gdf_test.columns

Index(['Accum_Area', 'Ave_Elev', 'Ave_Slp', 'Basin_NO', 'Cntry_1',
       'Cntry_1_Rt', 'Cntry_2', 'Cntry_2_Rt', 'Cntry_3', 'Cntry_3_Rt',
       'Cntry_4', 'Cntry_4_Rt', 'Cntry_5', 'Cntry_5_Rt', 'Cntyr_2_Rt',
       'Dwn_Pfa_Co', 'GDBD_ID', 'LULC_1', 'LULC_10', 'LULC_11', 'LULC_12',
       'LULC_13', 'LULC_14', 'LULC_15', 'LULC_16', 'LULC_17', 'LULC_2',
       'LULC_3', 'LULC_4', 'LULC_5', 'LULC_6', 'LULC_7', 'LULC_8', 'LULC_9',
       'OBJECTID', 'Pfa_Code', 'Pop', 'Pop_Dnsty', 'Region_NO', 'Shape_Area',
       'Shape_Leng', 'SubRegion_', 'geometry', 'index_right',
       'geometry_group'],
      dtype='object')

In [49]:
gdf_test_simple = gpd.GeoDataFrame(gdf_test[["geometry_group","GDBD_ID"]],
                                   geometry=gdf_test.geometry)

In [50]:
gdf_match = gdf_test_simple.loc[gdf_test["geometry_group"]>=0]

In [51]:
gdf_match.groupby(['GDBD_ID']).agg(['mean', 'count'])

Unnamed: 0_level_0,geometry_group,geometry_group
Unnamed: 0_level_1,mean,count
GDBD_ID,Unnamed: 1_level_2,Unnamed: 2_level_2
1.0,8.000000,1
2.0,7.000000,1
3.0,11.000000,1
4.0,1.000000,1
5.0,3.000000,1
6.0,2.000000,1
7.0,5.000000,1
8.0,6.142857,7
9.0,12.000000,1
10.0,12.000000,1


In [52]:
gdf_deltas = gdf_match.loc[gdf_match["geometry_group"]>=0]

In [53]:
gdf_deltas

Unnamed: 0,geometry_group,GDBD_ID,geometry
6563,8.0,1.0,"POLYGON ((26.44199696356858 31.17308236804826,..."
6564,7.0,2.0,"POLYGON ((25.94694098147164 31.10916563448658,..."
8272,12.0,27.0,"POLYGON ((30.13305330667534 27.76536870684649,..."
8273,12.0,28.0,"POLYGON ((30.07181247813452 27.74895471038016,..."
8274,12.0,29.0,"POLYGON ((30.08385096938841 27.77600880219996,..."
8275,12.0,30.0,"POLYGON ((30.07181247813452 27.74895471038016,..."
8276,12.0,31.0,"POLYGON ((30.14164900221842 27.74682671914119,..."
8277,12.0,32.0,"POLYGON ((30.31974404631066 27.06550225587304,..."
8278,12.0,33.0,"POLYGON ((29.9815280476925 27.20354383020021, ..."
8279,12.0,34.0,"POLYGON ((29.92190882210782 27.20531443938485,..."


In [None]:
gdf_test["stream_group"].unique()

In [None]:
a = gdf_test.loc[gdf_test['stream_group'] > 0]

In [None]:
a

In [None]:
output_path_shp = "{}gdf_streams_group_V{:02.0f}.shp".format(EC2_OUTPUT_PATH,OUTPUT_VERSION)
print(output_path_shp)

In [None]:
gdf_streams_group.to_file(output_path_shp,driver='ESRI Shapefile')

In [None]:
!aws s3 cp --recursive {EC2_OUTPUT_PATH} {S3_OUTPUT_PATH}

In [None]:
test = gdf_basins.loc[7:9]

In [None]:
gdf_basins

In [None]:
gdf_test = gdf_out[1:100]

In [None]:
gdf_test.plot()

In [None]:
gdf_test.shape

In [None]:
tiny_value = 0.01
gdf_test_polygon = gdf_test.copy()
gdf_test_polygon['geometry'] = gdf_test_polygon.geometry.buffer(tiny_value)

In [None]:
gdf_test.head()

In [None]:
gdf_test["group"] = 1
gdf_test_dissolved = gdf_test.dissolve(by="group")
gdf_test_out = explode(gdf_test_dissolved)
gdf_test_out = gdf_test_out.reset_index()

In [None]:
gdf_test_out

In [None]:
gdf_test_out.plot(column="level_1")

In [None]:
gdf_test = gdf_test.reset_index()

In [None]:
gdf_test2 = gdf_test.copy()

In [None]:
gdf_test2.head()

## Tiny buffer

In [None]:
gdf_test2.head()

In [None]:
gdf_test2.plot()

## whatever

In [None]:
intersection = gpd.overlay(gdf_test2,gdf_test2, how='intersection')

In [None]:
gdf4 = gpd.sjoin(gdf_test, gdf_test2, how="inner", op='intersects')

In [None]:
gdf4.head()

In [None]:
shape = "streams"

In [None]:
print(output_path_shp)

In [None]:
end = datetime.datetime.now()
elapsed = end - start
print(elapsed)