# In [2]:
import pandas as pd
import numpy as np

# --- Load the spreadsheet ---------------------------------------------------
df_original = pd.read_excel(
    '../data/附件1 监测点A空气质量预报基础数据.xlsx',
    sheet_name='监测点A逐小时污染物浓度与气象实测数据',
)

# Replace the sheet's headers with short names. The assignment is positional,
# so it assumes the sheet has exactly these 13 columns in this order.
columns = [
    'time', 'place',
    'SO2', 'NO2', 'PM10', 'PM2.5', 'O3', 'CO',
    'temperature', 'humidity', 'air_pressure', 'wind_speed', 'wind_direction',
]
df_original.columns = columns

df = df_original  # same object for now; the filters below produce new frames

# --- Pre-processing ---------------------------------------------------------
pollutant_cols = ['SO2', 'NO2', 'PM10', 'PM2.5', 'O3', 'CO']

# Drop every row in which at least one pollutant cell holds the '—' marker.
df = df[~df[pollutant_cols].isin(['—']).any(axis=1)]

# Keep only rows whose pollutant readings are all >= 0 once cast to float.
df = df[(df[pollutant_cols].astype(np.float64) >= 0).all(axis=1)]

# Pollutant-only frame (same rows as df) used for the AQI computation.
df_pollutant = df[pollutant_cols]
# Breakpoint tables used to compute each pollutant's individual index (IAQI).
# 'IAQI' holds the index levels; every other entry holds the concentration
# breakpoints of one pollutant, aligned position-by-position with 'IAQI'.
# The 'O3' table is shorter, so it only maps onto the first six IAQI levels.
IAQI_DICT = {
    'IAQI': [0, 50, 100, 150, 200, 300, 400, 500],
    'SO2': [0, 50, 150, 475, 800, 1600, 2100, 2620],
    'NO2': [0, 40, 80, 180, 280, 565, 750, 940],
    'PM10': [0, 50, 150, 250, 350, 420, 500, 600],
    'PM2.5': [0, 35, 75, 115, 150, 250, 350, 500],
    'O3': [0, 100, 160, 215, 265, 800],
    'CO': [0, 2, 4, 14, 24, 36, 48, 60],
}
# Pollutant names in table order (every key except the leading 'IAQI' row).
POLLUTANT_KEYS = list(IAQI_DICT.keys())[1:]


def get_aqi_index(key, val):
    """
    Locate the breakpoint interval that contains a concentration.

    Args:
        key: Pollutant name; must be a key of ``IAQI_DICT``.
        val: Concentration to look up.

    Returns:
        A ``(left, right)`` tuple of adjacent indices into ``IAQI_DICT[key]``
        such that ``table[left] <= val < table[right]``. A value equal to the
        highest breakpoint maps onto the last interval.

    Raises:
        KeyError: If ``key`` is not in ``IAQI_DICT``.
        ValueError: If ``val`` lies outside the table (this includes NaN).
    """
    breakpoints = IAQI_DICT[key]
    last = len(breakpoints) - 1

    # Written as a chained comparison so that NaN is rejected as well.
    if not breakpoints[0] <= val <= breakpoints[last]:
        raise ValueError(
            f"concentration {val!r} is outside the {key} breakpoint table "
            f"[{breakpoints[0]}, {breakpoints[last]}]"
        )

    # Stop one short of the end so that `left + 1` is always a valid index.
    for left in range(last):
        if breakpoints[left] <= val < breakpoints[left + 1]:
            return left, left + 1

    # Only val == highest breakpoint reaches this point: close the last
    # interval on the right instead of reading past the end of the table.
    return last - 1, last


# --- Compute the AQI and the dominant pollutant for every row ----------------
# Each pollutant's IAQI is the piecewise-linear interpolation of its
# concentration between adjacent breakpoints:
#     IAQI = (I_hi - I_lo) / (C_hi - C_lo) * (C - C_lo) + I_lo
# np.interp evaluates exactly this formula for a whole column at once, and it
# also handles a concentration equal to the highest breakpoint.
iaqi_levels = np.asarray(IAQI_DICT['IAQI'], dtype='float64')
concentrations = df_pollutant[POLLUTANT_KEYS].to_numpy(dtype='float64')

# One row per observation, one column per pollutant; cells that are not
# scored keep an IAQI of 0.
iaqi_matrix = np.zeros(concentrations.shape, dtype='float64')
for col, key in enumerate(POLLUTANT_KEYS):
    breakpoints = np.asarray(IAQI_DICT[key], dtype='float64')
    values = concentrations[:, col]
    # Only score concentrations that are > 0 and do not exceed the table;
    # np.interp would otherwise clamp out-of-range values to the end levels.
    scored = (values > 0) & (values <= breakpoints[-1])
    iaqi_matrix[scored, col] = np.interp(
        values[scored],
        breakpoints,
        # Shorter tables (e.g. 'O3') only map onto the leading IAQI levels.
        iaqi_levels[:len(breakpoints)],
    )

# The AQI of a row is its largest IAQI, rounded; the pollutant that produced
# it is the row's top pollutant (the first one wins on ties).
top_pollutant_results = np.array(POLLUTANT_KEYS, dtype='U10')[
    iaqi_matrix.argmax(axis=1)]
aqi_results = np.around(iaqi_matrix.max(axis=1))

# --- Save the result ---------------------------------------------------------
# `df` is a filtered selection of the original frame; assign() builds a new
# frame instead of writing into that selection (avoids SettingWithCopyWarning).
df = df.assign(AQI=aqi_results, top_pollutant=top_pollutant_results)
columns = ['time', 'place', 'SO2', 'NO2', 'PM10',
           'PM2.5', 'O3', 'CO', 'AQI', 'top_pollutant',
           'temperature', 'humidity', 'air_pressure', 'wind_speed', 'wind_direction']
# The context manager saves and closes the workbook on exit.
# ExcelWriter.save() and to_excel(encoding=...) no longer exist in pandas 2.x.
with pd.ExcelWriter('./data/data.xlsx') as writer:
    df.to_excel(writer, columns=columns, index=False, sheet_name='Sheet1')
