In [1]:
import pandas as pd
import geopandas as gpd
import glob
import os

pd.set_option('display.max_columns', None)      # 显示所有列
pd.set_option('display.max_rows', None)         # 显示所有行
pd.set_option('display.width', None)            # 不限制显示宽度
pd.set_option('display.max_colwidth', None)     # 不限制列宽

csv_path = '/shared5/RESEARCH_DATA/SWOT/VersionC/processed/node_20250210.csv'
df = pd.read_csv(csv_path)
print(df.columns)
print(df.head(2))

Index(['node_id', 'reach_id', 'time', 'lat', 'lon', 'wse', 'wse_u', 'width',
       'width_u', 'node_q_b', 'node_q', 'ice_clim_f', 'dark_frac', 'p_width',
       'p_n_ch_max', 'p_n_ch_mod', 'xovr_cal_q', 'xtrk_dist', 'wse_r_u',
       'p_length', 'file_name'],
      dtype='object')
          node_id     reach_id          time           lat           lon  \
0  22160300290761  22160300291  7.925395e+08 -1.000000e+12 -1.000000e+12   
1  22160300290771  22160300291  7.925395e+08 -1.000000e+12 -1.000000e+12   

            wse         wse_u      width   width_u  node_q_b  node_q  \
0 -1.000000e+12 -1.000000e+12   2.135025  0.004025  58723843       3   
1 -1.000000e+12 -1.000000e+12  78.151521  0.228697  58722307       3   

   ice_clim_f  dark_frac  p_width  p_n_ch_max  p_n_ch_mod  xovr_cal_q  \
0           1        1.0     63.0           1           1           2   
1           1        1.0     84.0           1           1           2   

     xtrk_dist       wse_r_u    p_length  \
0  5964

In [2]:
import pandas as pd
import numpy as np

data_v = 'VersionD'
model = 'strict'

# 读取数据
df_original = pd.read_csv(f'./1.all_matched_points_{data_v}.csv')

# 统计过滤前每个stationid的数据量
before_counts = df_original.groupby('stationid').size().reset_index(name='count_before')

# 开始过滤
df = df_original.copy()
print(f"初始数据量: {len(df)}")

# 替换缺失值并删除
df = df.replace(-999999999999, np.nan).dropna()
# print(f"删除缺失值后: {len(df)}")

# 应用各种过滤条件
'''bad node_q_b quality'''
df = df[df['node_q_b'] <= 2194304]   
# print(f"过滤 node_q_b <= 2194304 后: {len(df)}")

df = df[df['node_q'] <= 1]
# print(f"过滤 node_q <= 1 后: {len(df)}")

df = df[df['dark_frac'] <= 0.5]
# print(f"过滤 dark_frac <= 0.4 后: {len(df)}")

'''Quality of the cross-over calibration, Height correction from KaRIn crossover calibration.'''
df = df[df['xovr_cal_q'] <= 1] 
# print(f"过滤 xovr_cal_q <= 1 后: {len(df)}")

df = df[(df['xtrk_dist'].abs() > 15000) & (df['xtrk_dist'].abs() < 60000)]
# print(f"过滤 15 < |xtrk_dist| < 60 后: {len(df)}")

df = df[df['ice_clim_f'] <= 1]
# print(f"过滤 ice_clim_f <= 1 后: {len(df)}")
print(f"过滤后: {len(df)}")

if model == 'strict':
    ''' classification_qual_suspect '''
    df = df[df['node_q_b'] & (1 << 1) == 0]
    # print(f"过滤 node_q_b bit 1 后: {len(df)}")
    
    ''' geolocation_qual_suspect '''
    df = df[df['node_q_b'] & (1 << 2) == 0]
    # print(f"过滤 node_q_b bit 2 后: {len(df)}")
    
    ''' water_fraction_suspect '''
    df = df[df['node_q_b'] & (1 << 3) == 0]
    # print(f"过滤 node_q_b bit 3 后: {len(df)}")
    
    ''' few_wse_observations '''
    df = df[df['node_q_b'] & (1 << 11) == 0]
    # print(f"过滤 node_q_b bit 11 后: {len(df)}")
    
    ''' geolocation_qual_degraded '''
    df = df[df['node_q_b'] & (1 << 19) == 0]
    # print(f"过滤 node_q_b bit 19 后: {len(df)}")
    
    df = df[df['p_width'] >= 30.0]
    # print(f"过滤 p_width >= 30.0 后: {len(df)}")
    
    df = df[df['wse_r_u'] < 0.5]
    # print(f"过滤 wse_r_u < 0.5 后: {len(df)}")
    
    # df = df[abs(df['p_length']) > 7]
    # print(f"过滤 |p_length| > 7 后: {len(df)}")
    print(f"严格过滤后: {len(df)}")

df = df.drop(columns=['index_right'])
df['date'] = pd.to_datetime('2000-01-01') + pd.to_timedelta(df['time'], unit='s')
df['date'] = df['date'].dt.date
df = df.drop_duplicates()

# print(f"\n最终数据量: {len(df)}")
# print(f"总体保留率: {len(df)/len(df_original)*100:.2f}%")

# 统计过滤后每个stationid的数据量
after_counts = df.groupby('stationid').size().reset_index(name='count_after')

# 合并前后统计结果
comparison = before_counts.merge(after_counts, on='stationid', how='left')
comparison['count_after'] = comparison['count_after'].fillna(0).astype(int)

# 计算变化
comparison['retention_rate(%)'] = (comparison['count_after'] / comparison['count_before'] * 100).round(2)

# 排序（按照保留数据量降序）
comparison = comparison.sort_values('count_after', ascending=False)

# 添加汇总行
summary_row = pd.DataFrame({
    'stationid': ['TOTAL'],
    'count_before': [comparison['count_before'].sum()],
    'count_after': [comparison['count_after'].sum()],
    'retention_rate(%)': [(comparison['count_after'].sum() / comparison['count_before'].sum() * 100)]
})
comparison = pd.concat([comparison, summary_row], ignore_index=True)

# # 保存对比结果
# output_file = './2.swot_qa_comparison.csv'
# comparison.to_csv(output_file, index=False)
# print(f"\n对比结果已保存到: {output_file}")

# 显示统计摘要
print("\n=== 统计摘要 ===")
print(f"总站点数: {len(comparison)-1}")  # 减去汇总行
print(f"完全被过滤的站点数: {(comparison['count_after'] == 0).sum() - 1}")  # 减去汇总行
print(f"平均保留率: {comparison[comparison['stationid'] != 'TOTAL']['retention_rate(%)'].mean():.2f}%")
print(f"\n保留数据最多的前10个站点:")
print(comparison.head(11)[['stationid', 'count_before', 'count_after', 'retention_rate(%)']])

# 保存过滤后的数据
df.to_csv(f'./2.swot_qa_node_qa{model}_{data_v}.csv', index=False)
print(f"\n过滤后的数据已保存到: ./2.swot_qa.csv")

result = df.groupby(['date', 'stationid','node_id','reach_id','COMID'])[['wse', 'wse_u', 'width','width_u']].mean().reset_index()
result.to_csv(f'2.swot_qa_datemean_qa{model}_{data_v}.csv', index=False)

初始数据量: 127625
过滤后: 58947
严格过滤后: 27785

=== 统计摘要 ===
总站点数: 1938
完全被过滤的站点数: 453
平均保留率: 19.90%

保留数据最多的前10个站点:
          stationid  count_before  count_after  retention_rate(%)
0     USGS_11377100           177          139              78.53
1      GRDC_6935500           132          110              83.33
2      GRDC_4207320           115           93              80.87
3     USGS_01011000           124           86              69.35
4     USGS_06192500           118           85              72.03
5     USGS_12414500           125           85              68.00
6      GRDC_4206250           122           84              68.85
7     USGS_05443500           143           84              58.74
8      GRDC_4206920           120           83              69.17
9      GRDC_5654140           104           78              75.00
10  Brazil_20489100           115           77              66.96

过滤后的数据已保存到: ./2.swot_qa.csv
