In [1]:
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN
import os
# Switch the working directory to the data folder so that the relative
# paths used here and in later cells resolve against it.
os.chdir('E:\\热浪\\data\\WBGT\\dailyly-table-WBGT')

# Read the daily WBGT records, parsing the timestamp column as datetimes.
demo = pd.read_csv('./WBGT-Daily-1980-2019.csv', parse_dates=['system:time_start'])
# Drop the first column of the file (presumably a saved row index - TODO confirm).
demo = demo.iloc[:, 1:]
demo.head(3)

Unnamed: 0,system:time_start,WBGT,name
0,1980-05-01,16.645,Anhui
1,1980-05-02,17.496,Anhui
2,1980-05-03,20.237,Anhui


In [2]:
# Threshold table read from threshold.csv, re-indexed by the `name` column so
# a value can be looked up with threshold.at[<name>, 'WBGT'].
threshold = pd.read_csv('./threshold.csv')
threshold = threshold.set_index('name')
threshold.head()

Unnamed: 0_level_0,WBGT
name,Unnamed: 1_level_1
Anhui,28.9789
Beijing,24.4103
Chongqing,27.4242
Fujian,27.4795
Gansu,23.2062


In [3]:
def extract_info(group_name, group_df, thresholds=None, min_event_days=3):
    """Summarise threshold-exceeding runs of days, per year, for one group.

    A record is "hot" when its ``WBGT`` is strictly greater than the threshold
    looked up for ``group_name``. Hot records whose dates are consecutive
    (gap of at most one day) form one run; runs with fewer than
    ``min_event_days`` records are discarded.

    Parameters
    ----------
    group_name : hashable
        Key used to look up the threshold (``thresholds.at[group_name, 'WBGT']``)
        and written to the ``地区`` column of the result.
    group_df : pandas.DataFrame
        Records of that group. Must contain a datetime column
        ``system:time_start`` and a numeric column ``WBGT``. Rows may be in
        any order.
    thresholds : pandas.DataFrame, optional
        Table indexed by group name with a ``WBGT`` column. When omitted, the
        notebook-level ``threshold`` table is used.
    min_event_days : int, default 3
        Minimum number of records a run needs in order to be kept.

    Returns
    -------
    pandas.DataFrame
        One row per year that has at least one kept run, with columns
        ``year``, ``发生频次`` (number of distinct runs), ``总和持续天数``
        (number of records in kept runs), ``平均持续天数`` (the ratio of the
        two), ``地区`` (``group_name``) and ``高温均值`` (mean ``WBGT`` over
        *all* hot records of that year, including those in discarded runs).
        An empty frame with the same columns is returned when nothing
        qualifies.
    """
    if thresholds is None:
        # Fall back to the table defined at notebook level.
        thresholds = threshold
    limit = thresholds.at[group_name, 'WBGT']

    dates = group_df['system:time_start']
    years = dates.dt.year

    def _no_events():
        # Typed empty result so that concatenating it with non-empty
        # summaries does not change their column dtypes.
        return (
            pd.DataFrame(
                columns=['year', '发生频次', '总和持续天数', '平均持续天数', '地区', '高温均值']
            )
            .astype({
                'year': years.dtype,
                '发生频次': 'int64',
                '总和持续天数': 'int64',
                '平均持续天数': 'float64',
                '高温均值': 'float64',
            })
        )

    # Keep only the records above the threshold, with an integer day offset
    # (used to detect consecutive dates) and the calendar year attached.
    hot = (
        group_df
        .assign(day_num=(dates - dates.min()).dt.days, year=years)
        .loc[lambda df: df['WBGT'] > limit]
    )
    if hot.empty:
        return _no_events()

    # Label runs of consecutive days. Sorting the offsets and starting a new
    # run wherever the gap exceeds one day gives the same grouping as density
    # clustering with eps=1.1 / min_samples=1 on integer offsets, but it also
    # works for any row order and needs no clustering model.
    day_num = hot['day_num'].to_numpy()
    order = np.argsort(day_num, kind='stable')
    sorted_labels = np.concatenate(([0], np.cumsum(np.diff(day_num[order]) > 1)))
    cluster_label = np.empty(len(day_num), dtype=np.int64)
    cluster_label[order] = sorted_labels

    # Discard runs that are too short.
    in_event = np.bincount(cluster_label)[cluster_label] >= min_event_days
    events = hot.loc[in_event].assign(cluster_label=cluster_label[in_event])
    if events.empty:
        return _no_events()

    # Yearly statistics over the kept runs.
    yearly = events.groupby('year').agg(
        发生频次=pd.NamedAgg(column='cluster_label', aggfunc='nunique'),
        总和持续天数=pd.NamedAgg(column='cluster_label', aggfunc='size'),
    )
    yearly['平均持续天数'] = yearly['总和持续天数'] / yearly['发生频次']
    yearly['地区'] = group_name

    # Mean WBGT is deliberately taken over every hot record of the year,
    # not only over the records that belong to kept runs.
    hot_mean = hot.groupby('year').agg(
        高温均值=pd.NamedAgg(column='WBGT', aggfunc='mean')
    )

    return yearly.join(hot_mean).reset_index(drop=False)

In [4]:
# Compute the yearly summary for each `name` group of `demo`, then stack the
# per-group tables into a single frame with a fresh 0..n-1 index.
per_region = [
    extract_info(region, records)
    for region, records in demo.groupby('name')
]
result = pd.concat(per_region, ignore_index=True)
# result.to_csv(r'分省分年热浪事件.csv')
result


Unnamed: 0,year,发生频次,总和持续天数,平均持续天数,地区,高温均值
0,1980,1,6,6.000000,Anhui,30.477125
1,1981,4,17,4.250000,Anhui,29.910963
2,1983,1,8,8.000000,Anhui,30.157700
3,1984,4,16,4.000000,Anhui,29.646211
4,1985,1,4,4.000000,Anhui,29.582222
...,...,...,...,...,...,...
1242,2015,3,14,4.666667,Zhejiang,29.592188
1243,2016,4,35,8.750000,Zhejiang,29.868162
1244,2017,3,43,14.333333,Zhejiang,29.804340
1245,2018,4,22,5.500000,Zhejiang,29.289115
