定义了生成训练样本和验证集的两个函数，该 .ipynb 文件为测试样例，两个函数已经放在了 `seq2seq/seq2seq_data_util.py` 中

疑问：
- 每一次生成数据的特征之间的顺序如何保持一致？
- y_train 的特征排序如何保持和 y_eva 一致？


- 在生成数据时候需要考虑的因素
    - 训练数据和验证数据是分开产生的
    - 验证数据中不存在 NaN，但是训练数据中存在 NaN。如何保证生成的训练数据没有 NaN？
    - 数据的生成是不是严格按照“天”的概念
        - 验证集
            - 验证集一定严格按照“天”去生成
        - 训练集
            - 可以是严格按照“天”去生成，这样训练样本合理，但是数量少
            - 也可以是宽松的按照每一个小时去生成，这样训练样本的数量会更多
            - 也可以在训练的过程中综合使用上述两种训练集生成策略
    - 特征的灵活选择
        - 应设计生成训练集和验证集的两个函数，使之可以灵活地生成包含各种特征的数据
            - 站点
                - 所有站点数据
                - 个别站点数据
            - 天气/空气质量
                - 可以只包含空气质量
                - 可以包含空气质量和天气
            - 详细特征
                - 如天气中的几个特征和空气质量的几个特征

In [29]:
import pandas as pd
import numpy as np

In [30]:
def generate_dev_set(station_list, X_aq_list, y_aq_list, X_meo_list=None, pre_days=5,
                     aq_path="data/aq_dev_data.csv", meo_path="data/meo_dev_data.csv"):
    '''
    Build the validation set as windows that each start on a multiple of 24 rows.

    The air-quality and weather CSV files are read and joined column-wise.
    Window i uses rows [24 * i, 24 * (i + pre_days)) as input and the next
    48 rows as target.
    NOTE(review): this presumes one row per hour with row 0 at the start of
    a day -- confirm against the CSV files.

    A column is selected when its name contains one of the station names and
    the text after its last underscore is one of the requested features.

    Args:
        station_list : a list of used stations,
            e.g. ['dongsi_aq', 'tiantan_aq', 'guanyuan_aq'].
        X_aq_list : a list of used aq features as input,
            e.g. ["PM2.5", "PM10", "O3", "CO", "SO2", "NO2"].
        y_aq_list : a list of used aq features as output,
            e.g. ["PM2.5", "PM10", "O3"].
        X_meo_list : a list of used meo features, or None to use aq only,
            e.g. ["temperature", "pressure", "humidity", "direction", "speed/kph"].
        pre_days : number of 24-row blocks used as input for each window.
        aq_path : path of the air-quality CSV file.
        meo_path : path of the weather CSV file.

    Returns:
        (X_dev_batch, y_dev_batch) : arrays of shape
        (windows, 24 * pre_days, X columns) and (windows, 48, y columns).

    Raises:
        ValueError : if the data has fewer than 24 * (pre_days + 2) rows,
            i.e. not even one complete window can be built.
    '''
    dev_df = pd.concat([pd.read_csv(aq_path), pd.read_csv(meo_path)], axis=1)

    # Step 1: keep every column that belongs to one of the requested stations.
    station_columns = [
        column
        for station in station_list
        for column in dev_df.columns
        if station in column
    ]

    # Steps 2 and 3: pick the input and output feature columns. They are
    # sorted so the feature order matches between training and validation.
    X_features = list(X_aq_list) + list(X_meo_list or [])
    X_columns = sorted(c for c in station_columns if c.split("_")[-1] in X_features)
    y_columns = sorted(c for c in station_columns if c.split("_")[-1] in y_aq_list)

    # Convert once and slice by position, so the windows do not depend on
    # the labels of the DataFrame index.
    X_values = dev_df[X_columns].values
    y_values = dev_df[y_columns].values

    # Step 4: cut the windows, one per 24-row block.
    input_len = 24 * pre_days
    target_len = 48
    n_rows = X_values.shape[0]
    n_windows = n_rows // 24 - pre_days - 1
    if n_windows <= 0:
        raise ValueError(
            "need at least {} rows to build one window with pre_days={}, got {}".format(
                24 * (pre_days + 2), pre_days, n_rows))

    starts = range(0, 24 * n_windows, 24)
    X_dev_batch = np.stack([X_values[s:s + input_len] for s in starts])
    y_dev_batch = np.stack(
        [y_values[s + input_len:s + input_len + target_len] for s in starts])

    return X_dev_batch, y_dev_batch

In [31]:
def generate_training_set(station_list, X_aq_list, y_aq_list, X_meo_list=None,
                          use_day=True, pre_days=5, batch_size=32,
                          aq_path="data/aq_train_data.csv",
                          meo_path="data/meo_train_data.csv"):
    '''
    Sample one random training batch of NaN-free (input, target) windows.

    The air-quality and weather CSV files are read and joined column-wise.
    A window starting at row s uses rows [s, s + 24 * pre_days) as input and
    the next 48 rows as target. Start rows are drawn uniformly, with
    replacement, from the starts whose input AND target contain no NaN.
    NOTE(review): this presumes one row per hour with row 0 at the start of
    a day -- confirm against the CSV files.

    A column is selected when its name contains one of the station names and
    the text after its last underscore is one of the requested features.

    Args:
        station_list : a list of used stations,
            e.g. ['dongsi_aq', 'tiantan_aq', 'guanyuan_aq'].
        X_aq_list : a list of used aq features as input,
            e.g. ["PM2.5", "PM10", "O3", "CO", "SO2", "NO2"].
        y_aq_list : a list of used aq features as output,
            e.g. ["PM2.5", "PM10", "O3"].
        X_meo_list : a list of used meo features, or None to use aq only,
            e.g. ["temperature", "pressure", "humidity", "direction", "speed/kph"].
        use_day : bool, True to start windows only on multiples of 24 rows,
            False to allow a start on every row.
        pre_days : number of 24-row blocks used as input for each window.
        batch_size : number of windows in the returned batch.
        aq_path : path of the air-quality CSV file.
        meo_path : path of the weather CSV file.

    Returns:
        (X_train_batch, y_train_batch) : arrays of shape
        (batch_size, 24 * pre_days, X columns) and (batch_size, 48, y columns).

    Raises:
        ValueError : if no candidate window is completely free of NaN
            (including the case where the data is shorter than one window).
    '''
    train_df = pd.concat([pd.read_csv(aq_path), pd.read_csv(meo_path)], axis=1)

    # Step 1: keep every column that belongs to one of the requested stations.
    station_columns = [
        column
        for station in station_list
        for column in train_df.columns
        if station in column
    ]

    # Steps 2 and 3: pick the input and output feature columns. They are
    # sorted so the feature order matches between training and validation.
    X_features = list(X_aq_list) + list(X_meo_list or [])
    X_columns = sorted(c for c in station_columns if c.split("_")[-1] in X_features)
    y_columns = sorted(c for c in station_columns if c.split("_")[-1] in y_aq_list)

    X_df = train_df[X_columns]
    y_df = train_df[y_columns]
    X_values = X_df.values
    y_values = y_df.values

    # Step 4: list every start row whose full window fits into the data.
    # The last fitting start is n_rows - window_len, hence the "+ 1".
    input_len = 24 * pre_days
    window_len = input_len + 48
    n_rows = X_values.shape[0]
    step = 24 if use_day else 1
    candidates = np.arange(0, n_rows - window_len + 1, step)

    # Prefix sums of "row contains NaN": bad[k] counts bad rows in [0, k),
    # so a row range is clean when the counts at both ends are equal.
    X_bad = np.concatenate(([0], np.cumsum(X_df.isnull().any(axis=1).values)))
    y_bad = np.concatenate(([0], np.cumsum(y_df.isnull().any(axis=1).values)))
    X_clean = X_bad[candidates + input_len] == X_bad[candidates]
    y_clean = y_bad[candidates + window_len] == y_bad[candidates + input_len]
    valid_starts = candidates[X_clean & y_clean]

    # Fail loudly instead of retrying forever when nothing can be sampled.
    if valid_starts.size == 0:
        raise ValueError(
            "no NaN-free window of {} rows found in {} rows of training data".format(
                window_len, n_rows))

    # Independent uniform draws, so the same window may appear twice.
    starts = np.random.choice(valid_starts, size=batch_size)

    X_train_batch = np.stack([X_values[s:s + input_len] for s in starts])
    y_train_batch = np.stack([y_values[s + input_len:s + window_len] for s in starts])

    return X_train_batch, y_train_batch

In [32]:
# Stations whose columns are selected from the CSV files.
station_list = [
    'dongsi_aq', 'tiantan_aq', 'guanyuan_aq', 'wanshouxigong_aq',
    'aotizhongxin_aq', 'nongzhanguan_aq', 'wanliu_aq', 'beibuxinqu_aq',
    'zhiwuyuan_aq', 'fengtaihuayuan_aq', 'yungang_aq', 'gucheng_aq',
    'fangshan_aq', 'daxing_aq', 'yizhuang_aq', 'tongzhou_aq',
    'shunyi_aq', 'pingchang_aq', 'mentougou_aq', 'pinggu_aq',
    'huairou_aq', 'miyun_aq', 'yanqin_aq', 'dingling_aq',
    'badaling_aq', 'miyunshuiku_aq', 'donggaocun_aq', 'yongledian_aq',
    'yufa_aq', 'liulihe_aq', 'qianmen_aq', 'yongdingmennei_aq',
    'xizhimenbei_aq', 'nansanhuan_aq', 'dongsihuan_aq',
]

# Air-quality features used as model input and as prediction target.
X_aq_list = ["PM2.5", "PM10", "O3", "CO", "SO2", "NO2"]
y_aq_list = ["PM2.5", "PM10", "O3"]

# Weather features added to the model input.
X_meo_list = ["temperature", "pressure", "humidity", "direction", "speed/kph"]

# Sampling settings for the training batch.
use_day = True
pre_days = 5
batch_size = 32

In [33]:
# Build the validation windows from the settings defined in the cell above.
X_batch, y_batch = generate_dev_set(
    station_list,
    X_aq_list,
    y_aq_list,
    X_meo_list,
    pre_days=5,
)

In [34]:
# Show the shapes of the validation arrays; the recorded run printed
# (17, 120, 385) (17, 48, 105).
print(X_batch.shape, y_batch.shape)

(17, 120, 385) (17, 48, 105)


In [35]:
# Sample one training batch; use_day and batch_size are not passed here,
# so the defaults of generate_training_set apply.
X_batch, y_batch = generate_training_set(
    station_list,
    X_aq_list,
    y_aq_list,
    X_meo_list,
    pre_days=5,
)

In [36]:
# Show the shapes of the training batch; the recorded run printed
# (32, 120, 385) (32, 48, 105).
print(X_batch.shape, y_batch.shape)

(32, 120, 385) (32, 48, 105)


### Changelog
- 20180425 v0
    - 完成了两个函数