In [1]:
import pandas as pd
import numpy as np

data_vs = ['VersionD']
models  = ['loose','strict']

for data_v in data_vs:
    for model in models:
    
        # 读取数据
        df_original = pd.read_csv(f'./1.all_matched_points_{data_v}.csv')
        
        # 统计过滤前每个stationid的数据量
        before_counts = df_original.groupby('stationid').size().reset_index(name='count_before')
        
        # 开始过滤
        df = df_original.copy()
        print(f"初始数据量: {len(df)}")
        
        # 替换缺失值并删除
        df = df.replace(-999999999999, np.nan).dropna()
        # print(f"删除缺失值后: {len(df)}")
        
        # 应用各种过滤条件
        '''bad node_q_b quality'''
        df = df[df['node_q_b'] <= 2194304]   
        # print(f"过滤 node_q_b <= 2194304 后: {len(df)}")
        
        df = df[df['node_q'] <= 1]
        # print(f"过滤 node_q <= 1 后: {len(df)}")
        
        df = df[df['dark_frac'] <= 0.5]
        # print(f"过滤 dark_frac <= 0.4 后: {len(df)}")
        
        '''Quality of the cross-over calibration, Height correction from KaRIn crossover calibration.'''
        df = df[df['xovr_cal_q'] <= 1] 
        # print(f"过滤 xovr_cal_q <= 1 后: {len(df)}")
        
        df = df[(df['xtrk_dist'].abs() > 15000) & (df['xtrk_dist'].abs() < 60000)]
        # print(f"过滤 15 < |xtrk_dist| < 60 后: {len(df)}")
        
        df = df[df['ice_clim_f'] <= 1]
        # print(f"过滤 ice_clim_f <= 1 后: {len(df)}")
        print(f"过滤后: {len(df)}")
        
        if model == 'strict':
            ''' classification_qual_suspect '''
            df = df[df['node_q_b'] & (1 << 1) == 0]
            # print(f"过滤 node_q_b bit 1 后: {len(df)}")
            
            ''' geolocation_qual_suspect '''
            df = df[df['node_q_b'] & (1 << 2) == 0]
            # print(f"过滤 node_q_b bit 2 后: {len(df)}")
            
            ''' water_fraction_suspect '''
            df = df[df['node_q_b'] & (1 << 3) == 0]
            # print(f"过滤 node_q_b bit 3 后: {len(df)}")
            
            ''' few_wse_observations '''
            df = df[df['node_q_b'] & (1 << 11) == 0]
            # print(f"过滤 node_q_b bit 11 后: {len(df)}")
            
            ''' geolocation_qual_degraded '''
            df = df[df['node_q_b'] & (1 << 19) == 0]
            # print(f"过滤 node_q_b bit 19 后: {len(df)}")
            
            df = df[df['p_width'] >= 30.0]
            # print(f"过滤 p_width >= 30.0 后: {len(df)}")
            
            df = df[df['wse_r_u'] < 0.5]
            # print(f"过滤 wse_r_u < 0.5 后: {len(df)}")
            
            # df = df[abs(df['p_length']) > 7]
            # print(f"过滤 |p_length| > 7 后: {len(df)}")
            print(f"严格过滤后: {len(df)}")
        
        df = df.drop(columns=['index_right'])
        df['date'] = pd.to_datetime('2000-01-01') + pd.to_timedelta(df['time'], unit='s')
        df['date'] = df['date'].dt.date
        df = df.drop_duplicates()
        
        # print(f"\n最终数据量: {len(df)}")
        # print(f"总体保留率: {len(df)/len(df_original)*100:.2f}%")
        
        # 统计过滤后每个stationid的数据量
        after_counts = df.groupby('stationid').size().reset_index(name='count_after')
        
        # 合并前后统计结果
        comparison = before_counts.merge(after_counts, on='stationid', how='left')
        comparison['count_after'] = comparison['count_after'].fillna(0).astype(int)
        
        # 计算变化
        comparison['retention_rate(%)'] = (comparison['count_after'] / comparison['count_before'] * 100).round(2)
        
        # 排序（按照保留数据量降序）
        comparison = comparison.sort_values('count_after', ascending=False)
        
        # 添加汇总行
        summary_row = pd.DataFrame({
            'stationid': ['TOTAL'],
            'count_before': [comparison['count_before'].sum()],
            'count_after': [comparison['count_after'].sum()],
            'retention_rate(%)': [(comparison['count_after'].sum() / comparison['count_before'].sum() * 100)]
        })
        comparison = pd.concat([comparison, summary_row], ignore_index=True)
        
        # # 保存对比结果
        # output_file = './2.swot_qa_comparison.csv'
        # comparison.to_csv(output_file, index=False)
        # print(f"\n对比结果已保存到: {output_file}")
        
        # 显示统计摘要
        print("\n=== 统计摘要 ===")
        print(f"总站点数: {len(comparison)-1}")  # 减去汇总行
        print(f"完全被过滤的站点数: {(comparison['count_after'] == 0).sum() - 1}")  # 减去汇总行
        print(f"平均保留率: {comparison[comparison['stationid'] != 'TOTAL']['retention_rate(%)'].mean():.2f}%")
        print(f"\n保留数据最多的前10个站点:")
        print(comparison.head(11)[['stationid', 'count_before', 'count_after', 'retention_rate(%)']])
        
        # 保存过滤后的数据
        df.to_csv(f'./2.swot_node_qa{model}_{data_v}.csv', index=False)
        print(f"\n过滤后的数据已保存到: ./2.swot_node_qa{model}_{data_v}.csv")
        
        result = df.groupby(['date', 'stationid','node_id','reach_id','COMID'])[['wse', 'wse_u', 'width','width_u']].mean().reset_index()
        result.to_csv(f'2.swot_datemean_qa{model}_{data_v}.csv', index=False)

初始数据量: 2195
过滤后: 780

=== 统计摘要 ===
总站点数: 21
完全被过滤的站点数: 0
平均保留率: 34.80%

保留数据最多的前10个站点:
      stationid  count_before  count_after  retention_rate(%)
0       tieling           164           83              50.61
1       lianhua           105           75              71.43
2        tonghe           116           53              45.69
3   longmenzhen           120           52              43.33
4     liujiatun           117           49              41.88
5     shihuiyao           141           48              34.04
6       jiamusi            81           47              58.02
7   shuanghetun           118           43              36.44
8       kuerbin           107           43              40.19
9    heishiguan            92           41              44.57
10      tonghua           112           38              33.93

过滤后的数据已保存到: ./2.swot_node_qaloose_VersionD.csv
初始数据量: 2195
过滤后: 780
严格过滤后: 418

=== 统计摘要 ===
总站点数: 21
完全被过滤的站点数: 0
平均保留率: 18.35%

保留数据最多的前10个站点:
        stationid  coun