In [3]:
import numpy as np
import pandas as pd
# Render every column when a DataFrame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole session.
pd.options.mode.chained_assignment = None

In [22]:
def preprocessX(df, mode="train"):
    """Build the hourly feature table from the raw meteorological samples.

    The input frame is never modified; all work happens on a copy.

    Parameters
    ----------
    df : pandas.DataFrame
        Must carry a DatetimeIndex (required by ``resample``) and the columns
        ``u``, ``v``, ``u925``/``v925``, ``u850``/``v850``, ``u750``/``v750``,
        ``u500``/``v500``, ``tmp``, ``t995``, ``t925``, ``t850``, ``t750``,
        ``t500`` and ``p``. Any other column is carried through resampling and
        interpolation unchanged in meaning.
    mode : str, default "train"
        ``"train"`` drops every row before 2016-05-31 23:00:00; any other
        value keeps all rows.

    Returns
    -------
    pandas.DataFrame
        A new frame on a 1-hour grid with the original columns (``p`` divided
        by 10000) plus:

        * ``ws``, ``ws925``, ``ws850``, ``ws750``, ``ws500`` - wind-vector length,
        * ``dt<level>`` - ``tmp`` minus the temperature at that level,
        * ``dt<a>_<b>`` - temperature difference between adjacent levels,
        * ``t_year`` - ``cos(month * pi / 6)``,
        * ``wd_0`` ... ``wd_7`` - uint8 indicator columns for the 45-degree wind
          sector; a row whose ``u`` or ``v`` is NaN gets all zeros.
    """
    if mode == "train":
        # The AQI series starts on 2016-06-01, so the meteorological data is
        # trimmed to start at the same point (cut-off: 2016-05-31 23:00:00).
        df = df.loc["20160531230000":]

    # Work on a private copy so the caller's frame never gains columns.
    df = df.copy()

    # Wind-vector length per level, computed on the raw samples (before
    # upsampling), so the hourly values are interpolated speeds.
    for level in ["", "925", "850", "750", "500"]:
        df["ws" + level] = np.sqrt(df["u" + level] ** 2 + df["v" + level] ** 2)

    # Upsample to a 1-hour grid and fill the new rows by linear interpolation.
    # An offset object is used instead of the "H" alias, which newer pandas
    # releases deprecate.
    df = df.resample(pd.offsets.Hour()).asfreq().interpolate()

    # Temperature differences between levels as direct features.
    levels = ["995", "925", "850", "750", "500"]
    for level, next_level in zip(levels, levels[1:] + [None]):
        df["dt" + level] = df["tmp"] - df["t" + level]
        if next_level is not None:
            df["dt" + level + "_" + next_level] = df["t" + level] - df["t" + next_level]

    # Time feature: the month mapped onto a 12-month cosine
    # (+1 in December, -1 in June).
    df["t_year"] = np.cos(df.index.month.to_numpy() * np.pi / 6)

    # Disabled experiment, kept for reference:
    # df["workday"]=(df.index.weekday<=3)*1
    # df["weekend"]=(df.index.weekday>=4)*1

    # Wind-direction feature: angle of (u, v) in degrees within [-180, 180],
    # shifted to [0, 360] and rounded to the nearest of eight 45-degree
    # sectors. Bin 8 (360 degrees) wraps onto bin 0. The sector stays float so
    # that a NaN angle simply matches no sector instead of failing an integer
    # cast.
    angle = np.arctan2(df["u"], df["v"]) / np.pi * 180
    sector = ((angle + 180) / 45).round() % 8
    dummies = pd.DataFrame(
        {"wd_" + str(k): (sector == k).astype(np.uint8) for k in range(8)},
        index=df.index,
    )
    df = pd.concat([df, dummies], axis=1)

    # Pressure scaled down by 10000.
    df['p'] = df['p'] / 10000
    return df


def preprocessY(df, mode="train"):
    """Prepare the target table: fill gaps on an hourly grid, then log-transform.

    Zeros are replaced by NaN (presumably they mark missing readings - confirm
    against the data source), the frame is put on a 1-hour grid, gaps are
    filled by linear interpolation and ``log1p`` is applied to every column.
    The input frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric frame with a DatetimeIndex (required by ``resample``).
    mode : str, default "train"
        Accepted for symmetry with ``preprocessX``; not used.

    Returns
    -------
    pandas.DataFrame
        New hourly frame holding ``log1p`` of the gap-filled values.
    """
    # Non-in-place replace: the caller's frame keeps its original zeros.
    df = df.replace(0, np.nan)
    # Offset object instead of the "H" alias, which newer pandas deprecates.
    df = df.resample(pd.offsets.Hour()).asfreq().interpolate()
    # The AQI values are strongly skewed; taking the log brings the
    # distribution closer to normal.
    df = np.log1p(df)
    return df


In [25]:
if __name__ == '__main__':
    # Read the source table; its first CSV column becomes the index.
    df = pd.read_csv('../aqi_pre/aqi/aqi_all.csv', index_col=0)

    # Keep only the columns used downstream, in this fixed order.
    df = df.loc[:, [
        'aqi', 'tmp', 'v850', 't925', 'p', 't750', 'v750', 'v500', 'u850',
        'u925', 'u750', 'u', 't500', 'v', 'v925', 't995', 'rh', 'u500', 't850',
    ]]

    # The index is read as plain labels; convert it to timestamps so that
    # preprocessX can slice and resample on it.
    df.index = pd.to_datetime(df.index)

    df = preprocessX(df)
    # Target transform currently disabled:
    # df = preprocessY(df)

    # Persist the hourly feature table.
    df.to_csv('data/aqi_upsample_1.csv')