In [1]:
# Using pandas to process data
import numpy as np
import pandas as pd
import datetime
from matplotlib import pyplot as plt
%matplotlib inline

from utils.data_util import parse_bj_aq_data, generate_model_data
from utils.plot_util import plot_station, plot_stations

%load_ext autoreload
%autoreload 2

# 数据解析

In [2]:
# Parse the Beijing air-quality data once and keep all four results:
# the raw data, the station names, a per-station mapping, and the merged
# wide table (presumably one column per station/pollutant pair -- see the
# null check further down).
(
    bj_aq_data,
    stations,
    bj_aq_stations,
    bj_aq_stations_merged,
) = parse_bj_aq_data()

NaN in PM2.5 is 23459, 6.508615 %
NaN in PM10 is 96175, 26.683406 %
NaN in NO2 is 21720, 6.026135 %
NaN in CO is 46144, 12.802486 %
NaN in O3 is 23732, 6.584358 %
NaN in SO2 is 21664, 6.010598 %
There are 35 air quality stations in Beijing

The stations in Beijing are:
 {'fengtaihuayuan_aq', 'zhiwuyuan_aq', 'aotizhongxin_aq', 'tongzhou_aq', 'gucheng_aq', 'qianmen_aq', 'mentougou_aq', 'daxing_aq', 'miyunshuiku_aq', 'huairou_aq', 'tiantan_aq', 'yanqin_aq', 'dingling_aq', 'fangshan_aq', 'beibuxinqu_aq', 'nongzhanguan_aq', 'pingchang_aq', 'yungang_aq', 'dongsi_aq', 'wanshouxigong_aq', 'xizhimenbei_aq', 'nansanhuan_aq', 'wanliu_aq', 'yufa_aq', 'yizhuang_aq', 'yongledian_aq', 'yongdingmennei_aq', 'liulihe_aq', 'shunyi_aq', 'badaling_aq', 'donggaocun_aq', 'dongsihuan_aq', 'miyun_aq', 'guanyuan_aq', 'pinggu_aq'}


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  bj_aq_station.drop("utc_time", axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  bj_aq_station.drop("stationId", axis=1, inplace=True)


In [3]:
# Some features still contain missing values: flag, per column, whether
# at least one NaN remains in the merged table.
pd.isnull(bj_aq_stations_merged).any(axis=0)

miyunshuiku_aq_PM2.5       False
miyunshuiku_aq_PM10         True
miyunshuiku_aq_NO2         False
miyunshuiku_aq_CO          False
miyunshuiku_aq_O3          False
miyunshuiku_aq_SO2         False
wanshouxigong_aq_PM2.5     False
wanshouxigong_aq_PM10      False
wanshouxigong_aq_NO2       False
wanshouxigong_aq_CO        False
wanshouxigong_aq_O3        False
wanshouxigong_aq_SO2       False
pinggu_aq_PM2.5            False
pinggu_aq_PM10              True
pinggu_aq_NO2              False
pinggu_aq_CO               False
pinggu_aq_O3               False
pinggu_aq_SO2              False
guanyuan_aq_PM2.5          False
guanyuan_aq_PM10           False
guanyuan_aq_NO2            False
guanyuan_aq_CO             False
guanyuan_aq_O3             False
guanyuan_aq_SO2            False
yanqin_aq_PM2.5            False
yanqin_aq_PM10             False
yanqin_aq_NO2              False
yanqin_aq_CO               False
yanqin_aq_O3               False
yanqin_aq_SO2              False
          

In [4]:
# Build every training batch up front from the merged station table.
# NOTE(review): judging by the shapes printed in the next cell, m looks like
# the batch size and X_hours / Y_hours the input / target window lengths --
# confirm against utils.data_util.generate_model_data.
X_batches, Y_batches = generate_model_data(
    merged_data=bj_aq_stations_merged,
    m=10,
    X_hours=32,
    Y_hours=48,
    step=1,
)

In [5]:
# Sanity check: number of batches, then the shapes of the first input
# batch and the first target batch.
print(
    len(X_batches),
    X_batches[0].shape,
    Y_batches[0].shape,
)

1012 (10, 32, 210) (10, 48, 210)


# 可视化

In [3]:
# List the station names that key the per-station mapping returned by
# parse_bj_aq_data().
bj_aq_stations.keys()

dict_keys(['dingling_aq', 'nansanhuan_aq', 'wanliu_aq', 'fengtaihuayuan_aq', 'zhiwuyuan_aq', 'aotizhongxin_aq', 'nongzhanguan_aq', 'yizhuang_aq', 'tongzhou_aq', 'dongsihuan_aq', 'yongledian_aq', 'yufa_aq', 'gucheng_aq', 'qianmen_aq', 'daxing_aq', 'miyunshuiku_aq', 'tiantan_aq', 'miyun_aq', 'liulihe_aq', 'yanqin_aq', 'yongdingmennei_aq', 'badaling_aq', 'fangshan_aq', 'beibuxinqu_aq', 'pingchang_aq', 'donggaocun_aq', 'mentougou_aq', 'yungang_aq', 'shunyi_aq', 'guanyuan_aq', 'huairou_aq', 'dongsi_aq', 'pinggu_aq', 'wanshouxigong_aq', 'xizhimenbei_aq'])

In [12]:
# Inspect the columns stored for a single station; the names follow the
# "<station>_<pollutant>" pattern.
bj_aq_stations['dingling_aq'].columns

Index(['dingling_aq_PM2.5', 'dingling_aq_PM10', 'dingling_aq_NO2',
       'dingling_aq_CO', 'dingling_aq_O3', 'dingling_aq_SO2'],
      dtype='object')

# 数据预处理
- 对缺失值进行了补全，使用 ffill