In [None]:
#选取站点匹配的COMID

In [2]:
import glob
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point

def find_nearest_river(dfpp, dfll, buffersize):
    # Ensure both dataframes use the same CRS (coordinate reference system)
    # Example: Convert dfpp to dfll's CRS if they are different
    if dfpp.crs != dfll.crs:
        print('   Reprojecting the data to a common CRS... please wait.')
        dfpp = dfpp.to_crs(dfll.crs)

    # Create buffer around the points (stations)
    print('Creating buffer... please wait.')
    poly = dfpp.buffer(buffersize)
    polygpd = gpd.GeoDataFrame(dfpp, geometry=poly)

    # Perform spatial join with flowlines to identify intersecting river segments
    print('Performing spatial join with flowlines... please wait.')
    join = gpd.sjoin(polygpd, dfll, how='inner', predicate='intersects')

    # Merge the river flowline data into the result
    print('Merging river data...')
    merge = join.merge(dfll[['COMID', 'geometry']], on='COMID', how='left')
    print(merge.columns)

    # Calculate distance to the river segment for each station
    print('Calculating distance to river segments... please wait.')
    merge['distance'] = merge.apply(lambda row: row['geometry_y'].distance(Point(row['lon'], row['lat'])), axis=1)
    print(merge.columns)


    # Find the minimum distance per station
    print('Finding minimum distance per station...')
    join_min_dist = merge.groupby('stationid')['distance'].min().reset_index()

    # Merge the minimum distances back to the original dataframe to get the final results
    final_merge = pd.merge(merge, join_min_dist, on=['stationid', 'distance'], how='inner')

    # Select the final relevant columns
    final = final_merge[['stationid', 'COMID', 'distance', 'lon', 'lat']]

    return final


if __name__ == '__main__':
    # Load station data
    df = pd.read_csv('GAGES_LATLON.csv')

    # Convert stations to GeoDataFrame (assume it's in WGS84 initially)
    points = [Point(lon, lat) for lon, lat in zip(df['lon'], df['lat'])]
    dfpp = gpd.GeoDataFrame(df, geometry=points, crs="EPSG:4326")  # Assume stations are in WGS84

    # 获取所有匹配的shp文件
    shp_files = glob.glob('/shared1/RESEARCH_DATA/MERIT_Basins/riv_pfaf_*.shp')
    
    # 读取所有shp文件并合并
    gdfs = [gpd.read_file(f) for f in shp_files]
    dfll = gpd.GeoDataFrame(pd.concat(gdfs, ignore_index=True))

    # Ensure both dataframes use the same CRS
    buffersize = 0.05  # ~5km buffer
    aa = find_nearest_river(dfpp, dfll, buffersize)

    # Save the result to a CSV file
    output_file = 'GAGE3000points_MERIT_Basin_join.csv'
    print(f'Writing to {output_file}...')
    aa.to_csv(output_file, index=False)

Creating buffer... please wait.
Performing spatial join with flowlines... please wait.



  poly = dfpp.buffer(buffersize)


Merging river data...
Index(['stationid', 'lat', 'lon', 'geometry_x', 'index_right', 'COMID',
       'lengthkm', 'lengthdir', 'sinuosity', 'slope', 'uparea', 'order',
       'strmDrop_t', 'slope_taud', 'NextDownID', 'maxup', 'up1', 'up2', 'up3',
       'up4', 'geometry_y'],
      dtype='object')
Calculating distance to river segments... please wait.
Index(['stationid', 'lat', 'lon', 'geometry_x', 'index_right', 'COMID',
       'lengthkm', 'lengthdir', 'sinuosity', 'slope', 'uparea', 'order',
       'strmDrop_t', 'slope_taud', 'NextDownID', 'maxup', 'up1', 'up2', 'up3',
       'up4', 'geometry_y', 'distance'],
      dtype='object')
Finding minimum distance per station...
Writing to GRADES_3000points_MERIT_Basin_join.csv...


In [6]:
#将河道两端2倍河宽裁掉并进行600m切割
from shapely.geometry import LineString
from shapely.ops import substring

# 读取数据
# dfll = dfll.to_crs(epsg=5070)

width_files = glob.glob('/shared1/RESEARCH_DATA/GLOW_reach_average_zimin/region_*.csv')

# 读取所有CSV文件并合并
width_list = [pd.read_csv(f) for f in width_files]
width = pd.concat(width_list, ignore_index=True)
meanwidth= width.groupby('COMID')['width'].mean()
output_file = 'meanwidth_all_regions.csv'
meanwidth.to_csv(output_file, index=False)

aa_width = aa.merge(meanwidth.reset_index()[['COMID', 'width']], on='COMID', how='left')
aa_width_gdf = gpd.GeoDataFrame(aa_width, geometry=gpd.points_from_xy(aa.lon, aa.lat), crs='EPSG:4326')#.to_crs(epsg=5070)

# 提取匹配COMID的线段
matching_lines = dfll[dfll['COMID'].isin(aa_width_gdf['COMID'])]
matching_lines.to_file('GRADES_3000points_MERIT_Basin_line.shp')  # 导出匹配的线段到shp文件

# 假设RiverWidth信息在aa表中
width_df = aa_width[['COMID', 'width']]

# 合并获取宽度信息
matching_lines = matching_lines.merge(width_df, on='COMID').to_crs(epsg=3395)

# 根据RiverWidth裁剪线段两端（保留折点信息）
def clip_line_keep_vertices(line, width):
    length = line.length
    if length > 4 * width:
        return substring(line, 2 * width, length - 2 * width)
    else:
        return None  # 过短线段舍弃

# 创建副本避免警告
matching_lines_cut = matching_lines.copy()

# 应用裁剪函数并保留属性
matching_lines_cut['geometry'] = matching_lines_cut.apply(
    lambda row: clip_line_keep_vertices(row.geometry, row.width), axis=1)

print(f"裁剪前线段数量: {len(matching_lines)}")
matching_lines_cut = matching_lines_cut.dropna(subset=['geometry']).reset_index(drop=True)
print(f"裁剪后线段数量: {len(matching_lines_cut)}")

# 分割长度
segment_length = 600  # 如果经纬度需转换为度数: 600/110000

# 线段切割函数（保留属性）
def split_line_keep_attrs(row, segment_length):
    line = row.geometry
    length = line.length
    segments = []

    if length <= segment_length:
        segments.append((row.drop('geometry').to_dict(), line))
    else:
        distances = list(range(0, int(length), segment_length)) + [length]
        for i in range(len(distances) - 1):
            segment = substring(line, distances[i], distances[i + 1])
            segments.append((row.drop('geometry').to_dict(), segment))

    return segments

# 执行分割（属性与几何单独保存）
segment_attrs = []
segment_geoms = []

for idx, row in matching_lines_cut.iterrows():
    segments = split_line_keep_attrs(row, segment_length)
    for attr_dict, geom in segments:
        segment_attrs.append(attr_dict)
        segment_geoms.append(geom)
    if idx % 100 == 0:
        print(f"已处理线段数: {idx+1}/{len(matching_lines_cut)}")

# 创建DataFrame和GeoSeries，组合成GeoDataFrame
attrs_df = pd.DataFrame(segment_attrs)
segments_gdf = gpd.GeoDataFrame(attrs_df, geometry=segment_geoms, crs=matching_lines_cut.crs)

# 保存为shp文件
output_file = 'GAGE_3000points_MERIT_Basin_600m_line.shp'
segments_gdf.to_file(output_file)

print(f"生成600m线段总数: {len(segments_gdf)}")
print(f"600m segments shapefile 已保存为 '{output_file}'.")


裁剪前线段数量: 3087
裁剪后线段数量: 2021
已处理线段数: 1/2021
已处理线段数: 101/2021
已处理线段数: 201/2021
已处理线段数: 301/2021
已处理线段数: 401/2021
已处理线段数: 501/2021
已处理线段数: 601/2021
已处理线段数: 701/2021
已处理线段数: 801/2021
已处理线段数: 901/2021
已处理线段数: 1001/2021
已处理线段数: 1101/2021
已处理线段数: 1201/2021
已处理线段数: 1301/2021
已处理线段数: 1401/2021
已处理线段数: 1501/2021
已处理线段数: 1601/2021
已处理线段数: 1701/2021
已处理线段数: 1801/2021
已处理线段数: 1901/2021
已处理线段数: 2001/2021
生成600m线段总数: 58454
600m segments shapefile 已保存为 'GRADES_3000points_MERIT_Basin_600m_line.shp'.


In [10]:
dfll = gpd.read_file('GAGE_3000points_MERIT_Basin_600m_line.shp').reset_index().to_crs(epsg=3395)
def find_nearest_river(dfpp, dfll, buffersize):
    # Ensure both dataframes use the same CRS (coordinate reference system)
    if dfpp.crs != dfll.crs:
        print('Reprojecting the data to a common CRS... please wait.')
        dfpp = dfpp.to_crs(dfll.crs)

    # Create buffer around the points (stations)
    print('Creating buffer... please wait.')
    poly = dfpp.buffer(buffersize)
    polygpd = gpd.GeoDataFrame(dfpp, geometry=poly)

    # Perform spatial join with flowlines to identify intersecting river segments
    print('Performing spatial join with flowlines... please wait.')
    join = gpd.sjoin(polygpd, dfll, how='inner', predicate='intersects')

    # Merge the river flowline data into the result
    print('Merging river data...')
    merge = join.merge(dfll[['index',  'geometry']], on='index', how='left')

    # Calculate distance to the river segment for each station
    print('Calculating distance to river segments... please wait.')
    merge['distance'] = merge.apply(lambda row: row['geometry_y'].distance(Point(row['lon'], row['lat'])), axis=1)

    # Find the minimum distance per station and the corresponding river segment
    print('Finding minimum distance per station...')
    join_min_dist = merge.loc[merge.groupby('stationid')['distance'].idxmin()]

    # Select the final relevant columns
    final = join_min_dist[['stationid', 'lat', 'lon', 'distance', 'COMID','width', 'index', 'geometry_y']]

    # Rename geometry_y to geometry for clarity
    final = final.rename(columns={'geometry_y': 'geometry'})

    # Convert to GeoDataFrame and set the correct CRS
    final_gdf = gpd.GeoDataFrame(final, geometry='geometry', crs=dfll.crs)

    return final_gdf

if __name__ == '__main__':
    # Load station data
    df = pd.read_csv('GAGE_LATLON.csv')

    # Convert stations to GeoDataFrame (assume it's in WGS84 initially)
    points = [Point(lon, lat) for lon, lat in zip(df['lon'], df['lat'])]
    dfpp = gpd.GeoDataFrame(df, geometry=points, crs="EPSG:4326")  # Assume stations are in WGS84

    # Read river segments data (assuming it's in NAD83)
    dfll = gpd.read_file('GAGE_3000points_MERIT_Basin_600m_line.shp').reset_index()

    # Ensure both dataframes use the same CRS
    buffersize = 600  # ~5km buffer
    GRADES_points_MERIT_600m_nearest_match = find_nearest_river(dfpp, dfll, buffersize)

    # Save the result to a Shapefile
    output_file = 'GAGE_3000points_MERIT_Basin_600m_nearest_match.shp'
    print(f'Writing to {output_file}...')
    print(len(GRADES_points_MERIT_600m_nearest_match))
    GRADES_points_MERIT_600m_nearest_match.to_file(output_file)



# 定义函数处理每条线段
def straight_line_buffer(row):
    line_geom = row.geometry
    width = row.width * 4
    # 提取线段两端点坐标
    start_point = line_geom.coords[0]
    end_point = line_geom.coords[-1]
    # 创建直线连接两端点
    straight_line = LineString([start_point, end_point])
    # 创建 buffer，buffer的两端取直角（cap_style=2）
    buffer_poly = straight_line.buffer(width, cap_style=2)
    return buffer_poly


# 对matching_lines_600m创建buffer的polygon
GRADES_points_MERIT_600m_nearest_match['buffer_polygon'] = GRADES_points_MERIT_600m_nearest_match.apply(straight_line_buffer, axis=1)

# 转换为 GeoDataFrame，并设置 geometry 为 buffer_polygon
buffered_gdf = gpd.GeoDataFrame(GRADES_points_MERIT_600m_nearest_match, geometry='buffer_polygon', crs=dfll.crs)
buffered_gdf = buffered_gdf.drop(columns='geometry')
# 输出为shapefile（可选）
buffered_gdf.to_file('GAGE_3000points_MERIT_Basin_buffer_polygon_4widths.shp')

Reprojecting the data to a common CRS... please wait.
Creating buffer... please wait.
Performing spatial join with flowlines... please wait.
Merging river data...
Calculating distance to river segments... please wait.
Finding minimum distance per station...
Writing to GRADES_3000points_MERIT_Basin_600m_nearest_match.shp...
1951


  buffered_gdf.to_file('GRADES_3000points_MERIT_Basin_buffer_polygon_4widths.shp')
