# Create Line shapefile from CSV File 

* Purpose of script: Create a shapefile to visualize the flow network
* Author: Rutger Hofste
* Kernel used: python35
* Date created: 20171009

In [1]:
import time, datetime, sys
dateString = time.strftime("Y%YM%mD%d")
timeString = time.strftime("UTC %H:%M")
start = datetime.datetime.now()
print(dateString,timeString)
sys.version

Y2017M10D09 UTC 16:21


'3.5.4 |Continuum Analytics, Inc.| (default, Aug 14 2017, 13:26:58) \n[GCC 4.4.7 20120313 (Red Hat 4.4.7-1)]'

In [2]:
INPUT_VERSION = 1
OUTPUT_VERSION = 4

S3_INPUT_PATH =  "s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/"
S3_OUTPUT_PATH = "s3://wri-projects/Aqueduct30/processData/Y2017M10D09_RH_create_Line_Shape_File_V01/output/"

INPUT_FILE_NAME = "hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V%0.2d" %(INPUT_VERSION)
OUTPUT_FILE_NAME = "Y2017M10D09_RH_create_Line_Shape_File_V%s.shp" %(OUTPUT_VERSION)

EC2_INPUT_PATH = "/volumes/data/Y2017M10D09_RH_create_Line_Shape_File_V01/input"
EC2_OUTPUT_PATH = "/volumes/data/Y2017M10D09_RH_create_Line_Shape_File_V01/output"


In [3]:
!rm -r {EC2_INPUT_PATH} 
!rm -r {EC2_OUTPUT_PATH} 

In [4]:
!mkdir -p {EC2_INPUT_PATH} 
!mkdir -p {EC2_OUTPUT_PATH} 

In [5]:
!aws s3 cp {S3_INPUT_PATH} {EC2_INPUT_PATH} --recursive

download: s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.cpg to ../../../../data/Y2017M10D09_RH_create_Line_Shape_File_V01/input/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.cpg
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.shx to ../../../../data/Y2017M10D09_RH_create_Line_Shape_File_V01/input/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.shx
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.dbf to ../../../../data/Y2017M10D09_RH_create_Line_Shape_File_V01/input/hybas_lev06_v1c_merged_fiona_upstream_downstream_FAO_V01.dbf
download: s3://wri-projects/Aqueduct30/processData/Y2017M08D29_RH_Merge_FAONames_Upstream_V01/output/hybas_lev06_v1c_merged_fio

In [6]:
import os
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, LineString


In [7]:
df = pd.read_pickle(os.path.join(EC2_INPUT_PATH,INPUT_FILE_NAME+".pkl"))

In [8]:
gdf = gpd.read_file(os.path.join(EC2_INPUT_PATH,INPUT_FILE_NAME+".shp"))

In [9]:
gdf = gdf.set_index("PFAF_ID",drop=False)

In [10]:
print(df.shape,gdf.shape)

(16397, 25) (16397, 2)


In [11]:
df = df.drop_duplicates(subset="PFAF_ID",keep='first')

In [12]:
gdf = gdf.drop_duplicates(subset="PFAF_ID",keep='first')

In [13]:
print(df.shape,gdf.shape)

(16396, 25) (16396, 2)


In [14]:
gdfOut = gdf.copy()

In [15]:
gdfOut['geometry'] = gdf.geometry.centroid

In [16]:
df["centroid_x"] = gdfOut.geometry.x

In [17]:
df["centroid_y"] = gdfOut.geometry.y

In [18]:
df.head()

Unnamed: 0_level_0,HYBAS_ID2,Unnamed: 0,HYBAS_ID,NEXT_DOWN,NEXT_SINK,MAIN_BAS,DIST_SINK,DIST_MAIN,SUB_AREA,UP_AREA,...,Downstream_HYBAS_IDs,Downstream_PFAF_IDs,NEXT_SINK_PFAF,Basin_HYBAS_IDs,Basin_PFAF_IDs,SUB_NAME,MAJ_NAME,FAOid_copy,centroid_x,centroid_y
PFAF_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
611001,6060000010,0,6060000010,0,6060000010,6060000010,0.0,0.0,4317.4,4317.4,...,[],[],611001.0,[6060000010],[611001],"['Archipielago de San Blas Coast', 'Altrato 1']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001002', 'MAJ_BAS_300...",-78.013646,8.927
611002,6060000200,1,6060000200,0,6060000200,6060000200,0.0,0.0,35995.5,35996.7,...,[],[],611002.0,[6060000200],[611002],"['Altrato 1', 'Sucio', 'Altrato 2']","['Caribbean Coast', 'Caribbean Coast', 'Caribb...","['MAJ_BAS_3001_SUB_BASE_0001003', 'MAJ_BAS_300...",-76.765133,6.639787
611003,6060000210,2,6060000210,0,6060000210,6060000210,0.0,0.0,443.9,443.9,...,[],[],611003.0,[6060000210],[611003],"['Altrato 1', 'Golfo del Darien Coast']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001003', 'MAJ_BAS_300...",-76.909605,7.883699
611004,6060000240,3,6060000240,0,6060000240,6060000240,0.0,0.0,2186.3,2186.3,...,[],[],611004.0,[6060000240],[611004],['Golfo del Darien Coast'],['Caribbean Coast'],['MAJ_BAS_3001_SUB_BASE_0001006'],-76.645155,7.684698
611005,6060000250,4,6060000250,0,6060000250,6060000250,0.0,0.0,6533.8,6533.8,...,[],[],611005.0,[6060000250],[611005],"['Golfo del Darien Coast', 'Sinu']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001006', 'MAJ_BAS_300...",-76.438453,8.536851


In [19]:
df = df.set_index("HYBAS_ID",drop=False)   

In [20]:
for index, row in df.iterrows():    
    df.set_value(index,"next_centroid_x",df.loc[index]["centroid_x"])
    df.set_value(index,"next_centroid_y",df.loc[index]["centroid_y"])

In [21]:
df.head()

Unnamed: 0_level_0,HYBAS_ID2,Unnamed: 0,HYBAS_ID,NEXT_DOWN,NEXT_SINK,MAIN_BAS,DIST_SINK,DIST_MAIN,SUB_AREA,UP_AREA,...,NEXT_SINK_PFAF,Basin_HYBAS_IDs,Basin_PFAF_IDs,SUB_NAME,MAJ_NAME,FAOid_copy,centroid_x,centroid_y,next_centroid_x,next_centroid_y
HYBAS_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6060000010,6060000010,0,6060000010,0,6060000010,6060000010,0.0,0.0,4317.4,4317.4,...,611001.0,[6060000010],[611001],"['Archipielago de San Blas Coast', 'Altrato 1']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001002', 'MAJ_BAS_300...",-78.013646,8.927,-78.013646,8.927
6060000200,6060000200,1,6060000200,0,6060000200,6060000200,0.0,0.0,35995.5,35996.7,...,611002.0,[6060000200],[611002],"['Altrato 1', 'Sucio', 'Altrato 2']","['Caribbean Coast', 'Caribbean Coast', 'Caribb...","['MAJ_BAS_3001_SUB_BASE_0001003', 'MAJ_BAS_300...",-76.765133,6.639787,-76.765133,6.639787
6060000210,6060000210,2,6060000210,0,6060000210,6060000210,0.0,0.0,443.9,443.9,...,611003.0,[6060000210],[611003],"['Altrato 1', 'Golfo del Darien Coast']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001003', 'MAJ_BAS_300...",-76.909605,7.883699,-76.909605,7.883699
6060000240,6060000240,3,6060000240,0,6060000240,6060000240,0.0,0.0,2186.3,2186.3,...,611004.0,[6060000240],[611004],['Golfo del Darien Coast'],['Caribbean Coast'],['MAJ_BAS_3001_SUB_BASE_0001006'],-76.645155,7.684698,-76.645155,7.684698
6060000250,6060000250,4,6060000250,0,6060000250,6060000250,0.0,0.0,6533.8,6533.8,...,611005.0,[6060000250],[611005],"['Golfo del Darien Coast', 'Sinu']","['Caribbean Coast', 'Caribbean Coast']","['MAJ_BAS_3001_SUB_BASE_0001006', 'MAJ_BAS_300...",-76.438453,8.536851,-76.438453,8.536851


In [32]:
geometry = [Point(xy) for xy in zip(df['centroid_x'], df['centroid_y'])]

In [38]:
gs = gpd.GeoSeries(geometry, index=df['HYBAS_ID'])

In [33]:
gdfNew = gpd.GeoDataFrame(df,geometry="geometry")

TypeError: Input geometry column must contain valid geometry objects.

In [34]:
def createLine(row):
    line = LineString([Point({row.centroid_x,row.centroid_y}),Point({row.next_centroid_x,row.next_centroid_y})])
    return line

In [29]:
gdfNew.set_value(6060000200,"geometry",Point({1,2})) 

TypeError: float() argument must be a string or a number, not 'Point'

In [25]:
for index, row in gdfNew.iterrows():
    print(index)
    gdfNew.set_value(index,"geometry",createLine(row))    

6060000010


TypeError: float() argument must be a string or a number, not 'LineString'

In [None]:
d = {'Lat' : [1., 2., 3., 4.],
     'Lon' : [4., 3., 2., 1.],
     'Id': [1,1,2,2],
     'rutger':[42,43,44,45]}

In [None]:
df = pd.DataFrame(d)

In [None]:
df

In [None]:
geometry = [Point(xy) for xy in zip(df.Lon, df.Lat)]

In [None]:
point1 = Point({42,43})
point2 = Point({44,45})

lineString = LineString([point1,point2])


In [None]:
gdf = gpd.GeoDataFrame(df, geometry=geometry)

In [None]:
gdf

In [None]:
gdf = df.groupby(['Id'])['geometry'].apply(lambda x: LineString(x.tolist()))

In [None]:
# Aggregate these points with the GroupBy
geometry = df.groupby(['Id'])['geometry'].apply(lambda x: LineString(x.tolist()))

In [None]:
gdf = gpd.GeoDataFrame(gdf, geometry='geometry')

In [None]:
gdf

In [None]:
gdf.crs = {'init' :'epsg:4326'}

In [None]:
gdf.to_file(driver = 'ESRI Shapefile', filename = os.path.join(EC2_OUTPUT_PATH,OUTPUT_FILE_NAME))

In [None]:
gdf2 = gpd.read_file(os.path.join(EC2_INPUT_PATH,INPUT_FILE_NAME))

In [None]:
gdf2['geometry'] = gdf2.geometry.centroid

In [None]:
gdf2

In [None]:
gdf2.to_file(driver = 'ESRI Shapefile', filename = os.path.join(EC2_OUTPUT_PATH,OUTPUT_FILE_NAME2))

In [None]:
!aws s3 cp {EC2_OUTPUT_PATH} {S3_OUTPUT_PATH} --recursive