In [1]:
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import pandas as pd

from matplotlib import pyplot as plt
%matplotlib inline
from model.xgboost.xgboost_data_util import generate_training_set
from model.seq2seq.seq2seq_data_util import get_training_statistics, generate_dev_set

from utils.kmedoids import cluster as kmedoids_cluster

from utils.information import bj_station_list, bj_X_aq_list, bj_y_aq_list
from utils.information import ld_station_list, ld_X_aq_list, ld_y_aq_list, ld_X_meo_list
bj_X_meo_list = ["temperature","pressure","humidity","direction","speed"]

In [2]:
def plot_feature(y, index, norm=True):
    feature_of_stations = []
    for i in range(index, len(output_features),3):
        feature_name = output_features[i]
        f = y[i]
        if norm :
            f = f * statistics.loc['std'][feature_name] + statistics.loc['mean'][feature_name]
        feature_of_stations.append(f)
    plt.scatter(range(len(feature_of_stations)), feature_of_stations)

In [3]:
def cluster(y_all, index):
    feature_dis_list = []
    for k in range(y_all.shape[0]):
        y = y_all[k,:]
        index = 0
        feature = y[index::3]
        feature_dis = np.zeros((len(feature), len(feature)))
        for i in range(len(feature)):
            for j in range(len(feature)):
                feature_dis[i, j] = np.abs(feature[i] - feature[j])
        feature_dis_list.append(feature_dis)

    feature_dis_mean = np.zeros(feature_dis_list[0].shape)
    for feature_dis in feature_dis_list:
        feature_dis_mean += feature_dis
    feature_dis_mean = feature_dis_mean / len(feature_dis_list)
    return feature_dis_mean

In [4]:
city = "bj"

station_list = bj_station_list
X_aq_list = bj_X_aq_list
y_aq_list = bj_y_aq_list
X_meo_list = bj_X_meo_list

output_features = []
for station in station_list : 
    for aq_feature in y_aq_list :
        output_features.append(station + "_" + aq_feature)
output_features.sort()

In [5]:
statistics = get_training_statistics(city)

FileNotFoundError: File b'preprocessed_data/before_split/bj_aq_describe.csv' does not exist

In [14]:
stations = output_features[0::3]

In [15]:
pair = {}
for i in range(len(stations)):
    pair[i] = stations[i]

In [16]:
pair

{0: 'aotizhongxin_aq_O3',
 1: 'badaling_aq_O3',
 2: 'beibuxinqu_aq_O3',
 3: 'daxing_aq_O3',
 4: 'dingling_aq_O3',
 5: 'donggaocun_aq_O3',
 6: 'dongsi_aq_O3',
 7: 'dongsihuan_aq_O3',
 8: 'fangshan_aq_O3',
 9: 'fengtaihuayuan_aq_O3',
 10: 'guanyuan_aq_O3',
 11: 'gucheng_aq_O3',
 12: 'huairou_aq_O3',
 13: 'liulihe_aq_O3',
 14: 'mentougou_aq_O3',
 15: 'miyun_aq_O3',
 16: 'miyunshuiku_aq_O3',
 17: 'nansanhuan_aq_O3',
 18: 'nongzhanguan_aq_O3',
 19: 'pingchang_aq_O3',
 20: 'pinggu_aq_O3',
 21: 'qianmen_aq_O3',
 22: 'shunyi_aq_O3',
 23: 'tiantan_aq_O3',
 24: 'tongzhou_aq_O3',
 25: 'wanliu_aq_O3',
 26: 'wanshouxigong_aq_O3',
 27: 'xizhimenbei_aq_O3',
 28: 'yanqin_aq_O3',
 29: 'yizhuang_aq_O3',
 30: 'yongdingmennei_aq_O3',
 31: 'yongledian_aq_O3',
 32: 'yufa_aq_O3',
 33: 'yungang_aq_O3',
 34: 'zhiwuyuan_aq_O3'}

根据均值进行的聚类

In [17]:
X_train_mean, y_train_mean = generate_training_set(city="bj", 
                          station_list=station_list, 
                          X_aq_list=X_aq_list, 
                          y_aq_list=y_aq_list, 
                          X_meo_list=X_meo_list, 
                          use_day=True, 
                          pre_days=5, 
                          gap=1,
                          use_day_model=True,
                          generate_mean=True,
                          generate_range=False)

In [18]:
y_all = y_train_mean.reshape(-1, 105)

In [19]:
O3_dis = cluster(y_all, 0)
PM10_dis = cluster(y_all, 1)
PM25_dis = cluster(y_all, 2)

In [20]:
all_stations, all_target_stations = [],[]

In [21]:
stations, target_stations = kmedoids_cluster(O3_dis, k=4)
all_stations.append(stations)
all_target_stations.append(target_stations)

In [22]:
stations, target_stations = kmedoids_cluster(PM10_dis, k=4)
all_stations.append(stations)
all_target_stations.append(target_stations)

In [23]:
stations, target_stations = kmedoids_cluster(PM25_dis, k=4)
all_stations.append(stations)
all_target_stations.append(target_stations)

对range进行聚类

In [24]:
X_train_range, y_train_range = generate_training_set(city="bj", 
                                                      station_list=station_list, 
                                                      X_aq_list=X_aq_list, 
                                                      y_aq_list=y_aq_list, 
                                                      X_meo_list=X_meo_list, 
                                                      use_day=True, 
                                                      pre_days=5, 
                                                      gap=1,
                                                      use_day_model=True,
                                                      generate_mean=False,
                                                      generate_range=True)

In [25]:
y_all = y_train_range.reshape(-1, 105)

In [26]:
O3_dis = cluster(y_all, 0)
PM10_dis = cluster(y_all, 1)
PM25_dis = cluster(y_all, 2)

In [27]:
stations, target_stations = kmedoids_cluster(O3_dis, k=4)
all_stations.append(stations)
all_target_stations.append(target_stations)

In [28]:
stations, target_stations = kmedoids_cluster(PM10_dis, k=4)
all_stations.append(stations)
all_target_stations.append(target_stations)

In [29]:
stations, target_stations = kmedoids_cluster(PM25_dis, k=4)
all_stations.append(stations)
all_target_stations.append(target_stations)

代表性站点
26,6 'wanshouxigong_aq_O3'
12,6 'huairou_aq_O3'
7,4  'dongsihuan_aq_O3'
22,2 'shunyi_aq_O3'
18,1 'nongzhanguan_aq_O3'
30,1 'yongdingmennei_aq_O3'
3,1  'daxing_aq_O3'
17,1 'nansanhuan_aq_O3'
6,1  'dongsi_aq_O3'
24,1 'tongzhou_aq_O3'

In [30]:
all_stations

[array([26, 26, 26,  8, 12, 20, 26, 26,  8, 26, 26, 26, 12,  8, 12, 12, 20,
        26, 26, 26, 20, 26, 12, 26, 26, 26, 26, 26, 12,  8, 26, 26, 26,  8,
         8]),
 array([26, 26, 26, 26, 12, 12, 26,  7, 24, 26, 26, 26, 12, 26, 12, 12, 12,
         7, 24, 26, 12, 26, 24, 26, 24, 26, 26, 26, 12, 24,  7, 24, 26, 12,
        24]),
 array([26,  6, 26,  6, 12, 12,  6,  7,  6, 26, 26, 26, 12, 26, 12, 12, 12,
         7,  6, 26, 12, 26, 12, 26,  6, 26, 26, 26, 12,  6,  7,  6, 26, 12,
         6]),
 array([ 0,  1,  0, 21,  1, 21, 21, 21, 11, 21, 21, 11, 21, 11, 11, 21, 21,
        21, 21,  0, 21, 21,  0, 21, 21, 21, 21, 21,  1, 21, 21, 11, 21, 11,
        11]),
 array([21, 25, 25, 21, 25, 17, 21, 21,  8, 21, 25, 25, 25,  8, 25, 25, 21,
        17, 21, 25, 21, 21, 25, 21, 21, 25, 21, 21, 25, 21, 17,  8,  8, 25,
        25]),
 array([21, 19, 21, 21, 19, 16, 21, 21, 21, 21, 21, 21, 16, 21, 21, 21, 16,
        17, 21, 19, 21, 21, 21, 21, 21, 21, 21, 21, 19, 21, 17, 21, 21, 21,
        21])]

In [31]:
all_target_stations

[array([ 8, 20, 12, 26]),
 array([24, 12,  7, 26]),
 array([26, 12,  7,  6]),
 array([21,  1,  0, 11]),
 array([21,  8, 25, 17]),
 array([16, 19, 21, 17])]

In [32]:
from collections import Counter

In [33]:
dic = {}
for i in range(len(all_stations[0])):
    target_station = []
    for station in all_stations:
        target_station.append(station[i])
    c = Counter(target_station)
    dic[i] = c

In [34]:
dic

{0: Counter({0: 1, 21: 2, 26: 3}),
 1: Counter({1: 1, 6: 1, 19: 1, 25: 1, 26: 2}),
 2: Counter({0: 1, 21: 1, 25: 1, 26: 3}),
 3: Counter({6: 1, 8: 1, 21: 3, 26: 1}),
 4: Counter({1: 1, 12: 3, 19: 1, 25: 1}),
 5: Counter({12: 2, 16: 1, 17: 1, 20: 1, 21: 1}),
 6: Counter({6: 1, 21: 3, 26: 2}),
 7: Counter({7: 2, 21: 3, 26: 1}),
 8: Counter({6: 1, 8: 2, 11: 1, 21: 1, 24: 1}),
 9: Counter({21: 3, 26: 3}),
 10: Counter({21: 2, 25: 1, 26: 3}),
 11: Counter({11: 1, 21: 1, 25: 1, 26: 3}),
 12: Counter({12: 3, 16: 1, 21: 1, 25: 1}),
 13: Counter({8: 2, 11: 1, 21: 1, 26: 2}),
 14: Counter({11: 1, 12: 3, 21: 1, 25: 1}),
 15: Counter({12: 3, 21: 2, 25: 1}),
 16: Counter({12: 2, 16: 1, 20: 1, 21: 2}),
 17: Counter({7: 2, 17: 2, 21: 1, 26: 1}),
 18: Counter({6: 1, 21: 3, 24: 1, 26: 1}),
 19: Counter({0: 1, 19: 1, 25: 1, 26: 3}),
 20: Counter({12: 2, 20: 1, 21: 3}),
 21: Counter({21: 3, 26: 3}),
 22: Counter({0: 1, 12: 2, 21: 1, 24: 1, 25: 1}),
 23: Counter({21: 3, 26: 3}),
 24: Counter({6: 1, 21: 3,

In [35]:
pair

{0: 'aotizhongxin_aq_O3',
 1: 'badaling_aq_O3',
 2: 'beibuxinqu_aq_O3',
 3: 'daxing_aq_O3',
 4: 'dingling_aq_O3',
 5: 'donggaocun_aq_O3',
 6: 'dongsi_aq_O3',
 7: 'dongsihuan_aq_O3',
 8: 'fangshan_aq_O3',
 9: 'fengtaihuayuan_aq_O3',
 10: 'guanyuan_aq_O3',
 11: 'gucheng_aq_O3',
 12: 'huairou_aq_O3',
 13: 'liulihe_aq_O3',
 14: 'mentougou_aq_O3',
 15: 'miyun_aq_O3',
 16: 'miyunshuiku_aq_O3',
 17: 'nansanhuan_aq_O3',
 18: 'nongzhanguan_aq_O3',
 19: 'pingchang_aq_O3',
 20: 'pinggu_aq_O3',
 21: 'qianmen_aq_O3',
 22: 'shunyi_aq_O3',
 23: 'tiantan_aq_O3',
 24: 'tongzhou_aq_O3',
 25: 'wanliu_aq_O3',
 26: 'wanshouxigong_aq_O3',
 27: 'xizhimenbei_aq_O3',
 28: 'yanqin_aq_O3',
 29: 'yizhuang_aq_O3',
 30: 'yongdingmennei_aq_O3',
 31: 'yongledian_aq_O3',
 32: 'yufa_aq_O3',
 33: 'yungang_aq_O3',
 34: 'zhiwuyuan_aq_O3'}

In [36]:
# 站点的具体位置
station_locations = pd.read_excel("./data/Beijing/location/Beijing_AirQuality_Stations_locations.xlsx")
station_locations.head()

Unnamed: 0,stationName,longitude,latitude
0,dongsi_aq,116.417,39.929
1,tiantan_aq,116.407,39.886
2,guanyuan_aq,116.339,39.929
3,wanshouxigong_aq,116.352,39.878
4,aotizhongxin_aq,116.397,39.982


让所有特征的 mean 和 range 一起投个票

聚类的结果表明，城区和交通有相当一致的特征（range and mean）,而郊区和对照点的特征并不集中．