In [1]:
# Using pandas to process data
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
%matplotlib inline

from utils.data_util import parse_bj_aq_data, generate_model_data
from utils.plot_util import plot_station, plot_stations

# 数据解析

In [2]:
bj_aq_data, stations, bj_aq_stations, bj_aq_stations_merged = parse_bj_aq_data()

NaN in PM2.5 is 23459, 6.508615 %
NaN in PM10 is 96175, 26.683406 %
NaN in NO2 is 21720, 6.026135 %
NaN in CO is 46144, 12.802486 %
NaN in O3 is 23732, 6.584358 %
NaN in SO2 is 21664, 6.010598 %
There are 35 air quality stations in Beijing

The stations in Beijing are:
 {'miyun_aq', 'pinggu_aq', 'xizhimenbei_aq', 'donggaocun_aq', 'wanliu_aq', 'guanyuan_aq', 'dongsi_aq', 'nansanhuan_aq', 'nongzhanguan_aq', 'fengtaihuayuan_aq', 'beibuxinqu_aq', 'daxing_aq', 'aotizhongxin_aq', 'miyunshuiku_aq', 'yanqin_aq', 'qianmen_aq', 'dingling_aq', 'dongsihuan_aq', 'yongdingmennei_aq', 'liulihe_aq', 'yungang_aq', 'pingchang_aq', 'yongledian_aq', 'badaling_aq', 'huairou_aq', 'tiantan_aq', 'mentougou_aq', 'wanshouxigong_aq', 'tongzhou_aq', 'yizhuang_aq', 'zhiwuyuan_aq', 'shunyi_aq', 'fangshan_aq', 'yufa_aq', 'gucheng_aq'}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  bj_aq_station.drop("utc_time", axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  bj_aq_station.drop("stationId", axis=1, inplace=True)


In [3]:
# 还有一些特征仍然存在缺失值
bj_aq_stations_merged.isnull().any()

miyunshuiku_aq_PM2.5       False
miyunshuiku_aq_PM10         True
miyunshuiku_aq_NO2         False
miyunshuiku_aq_CO          False
miyunshuiku_aq_O3          False
miyunshuiku_aq_SO2         False
wanshouxigong_aq_PM2.5     False
wanshouxigong_aq_PM10      False
wanshouxigong_aq_NO2       False
wanshouxigong_aq_CO        False
wanshouxigong_aq_O3        False
wanshouxigong_aq_SO2       False
pinggu_aq_PM2.5            False
pinggu_aq_PM10              True
pinggu_aq_NO2              False
pinggu_aq_CO               False
pinggu_aq_O3               False
pinggu_aq_SO2              False
guanyuan_aq_PM2.5          False
guanyuan_aq_PM10           False
guanyuan_aq_NO2            False
guanyuan_aq_CO             False
guanyuan_aq_O3             False
guanyuan_aq_SO2            False
yanqin_aq_PM2.5            False
yanqin_aq_PM10             False
yanqin_aq_NO2              False
yanqin_aq_CO               False
yanqin_aq_O3               False
yanqin_aq_SO2              False
          

In [4]:
# generate all training data at a time.
X_batches, Y_batches = generate_model_data(merged_data=bj_aq_stations_merged, m=10, X_hours=32, Y_hours=48, step=1)

In [5]:
print(len(X_batches), X_batches[0].shape, Y_batches[0].shape)

1012 (10, 32, 210) (10, 48, 210)


# 可视化

In [6]:
bj_aq_stations.keys()

dict_keys(['miyunshuiku_aq', 'wanshouxigong_aq', 'pinggu_aq', 'guanyuan_aq', 'yanqin_aq', 'xizhimenbei_aq', 'donggaocun_aq', 'wanliu_aq', 'qianmen_aq', 'dingling_aq', 'dongsihuan_aq', 'dongsi_aq', 'liulihe_aq', 'yungang_aq', 'pingchang_aq', 'nansanhuan_aq', 'nongzhanguan_aq', 'badaling_aq', 'zhiwuyuan_aq', 'tiantan_aq', 'huairou_aq', 'yongledian_aq', 'fengtaihuayuan_aq', 'beibuxinqu_aq', 'tongzhou_aq', 'miyun_aq', 'daxing_aq', 'aotizhongxin_aq', 'shunyi_aq', 'fangshan_aq', 'mentougou_aq', 'yufa_aq', 'yizhuang_aq', 'gucheng_aq', 'yongdingmennei_aq'])

In [7]:
bj_aq_stations['badaling_aq'].shape

(10298, 6)

In [8]:
x = bj_aq_stations['badaling_aq'].index[0]
type(x)

str

In [9]:
data = bj_aq_stations['badaling_aq'].loc[datetime.datetime(2017,4,1):datetime.datetime(2018,3,30)]

TypeError: unorderable types: str() < datetime.datetime()

In [10]:
bj_aq_stations

{'aotizhongxin_aq':                      aotizhongxin_aq_PM2.5  aotizhongxin_aq_PM10  \
 format_time                                                        
 2017-01-01 14:00:00                  453.0                 467.0   
 2017-01-01 15:00:00                  417.0                 443.0   
 2017-01-01 16:00:00                  395.0                 467.0   
 2017-01-01 17:00:00                  420.0                 484.0   
 2017-01-01 18:00:00                  453.0                 520.0   
 2017-01-01 19:00:00                  429.0                 520.0   
 2017-01-01 20:00:00                  211.0                 520.0   
 2017-01-01 21:00:00                  116.0                 520.0   
 2017-01-01 22:00:00                   51.0                 520.0   
 2017-01-01 23:00:00                   38.0                 520.0   
 2017-01-02 00:00:00                   21.0                 520.0   
 2017-01-02 01:00:00                   16.0                 520.0   
 2017-01-02 02:

In [11]:
plot_stations(bj_aq_stations, "PM2.5", datetime.datetime(2017,1,1), datetime.datetime(2017,2,1))

TypeError: unorderable types: str() < datetime.datetime()

<Figure size 1440x720 with 0 Axes>

# 数据预处理
- 对缺失值进行了补全，使用 ffill