In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from utils_v2 import *

%matplotlib inline

In [2]:
# Load the 201804 inputs in one pass: air quality, grid weather, observed weather.
aq_df4, grid_df4, ob_df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        './data/aiqQuality_201804.csv',
        './data/gridWeather_201804.csv',
        './data/observedWeather_201804.csv',
    )
)

In [3]:
# Preview the first rows of the observed-weather frame loaded above.
ob_df4.head()

Unnamed: 0,id,station_id,time,weather,temperature,pressure,humidity,wind_speed,wind_direction
0,2028533,shunyi_meo,2018-04-01 01:00:00,Hail,15.5,1009.2,51.0,0.5,82.0
1,2028534,hadian_meo,2018-04-01 01:00:00,Hail,15.7,1006.8,51.0,1.8,50.0
2,2028535,yanqing_meo,2018-04-01 01:00:00,Hail,12.6,955.6,33.0,0.8,34.0
3,2028536,miyun_meo,2018-04-01 01:00:00,Hail,14.8,1004.0,40.0,0.2,999017.0
4,2028537,huairou_meo,2018-04-01 01:00:00,Hail,15.3,1003.4,44.0,1.5,146.0


In [4]:
# Prepare weather data: stack the observed and grid weather rows, renumber the
# index, then run the project's clean_weather helper over the combined frame.
# NOTE(review): reset_index() without drop=True keeps the old index as an extra
# 'index' column -- confirm clean_weather expects (or tolerates) that column.
weather_all = clean_weather(
    pd.concat([ob_df4, grid_df4]).reset_index()
)


In [5]:
# Run the project's fill_weather_gap helper on the air-quality rows.
#   levels      -- number of nearest weather stations to search
#   time_column -- name of the time column
aq_all = fill_weather_gap(aq_df=aq_df4, weather_df=weather_all, levels=5, time_column='time')

# Add time columns to the air-quality data: parse the timestamp once, then
# derive one calendar column per datetime component.
aq_all['timestamp'] = pd.to_datetime(aq_all['time'])
for part in ('year', 'month', 'day', 'hour'):
    aq_all[part] = getattr(aq_all['timestamp'].dt, part)

Level 1 #null before 353
After:  104
Level 2 #null before 104
After:  66
Level 3 #null before 66
After:  48
Level 4 #null before 48
After:  30


In [6]:
# TODO: predict the missing air-quality values instead of forward-filling them.
# DataFrame.ffill() replaces fillna(method='ffill'), which pandas has deprecated.
# NOTE(review): the fill runs down the whole frame, and consecutive rows belong to
# different stations (see the aq_all.head() output), so a gap may be filled from
# another station's reading -- confirm this is acceptable or fill per station_id.
aq_all = aq_all.ffill()

# Add geo info of the air-quality stations (used for the nearest-weather-station
# search); a left join keeps every air-quality row even without a geo match.
aq_geo = pd.read_csv('./data/aq_geo.csv')
aq_all = aq_all.merge(aq_geo, how='left', on='station_id')

In [7]:
# Preview the air-quality frame after the forward-fill and the geo merge.
aq_all.head()

Unnamed: 0,id,time,PM25_Concentration,PM10_Concentration,NO2_Concentration,CO_Concentration,O3_Concentration,SO2_Concentration,dist,w_end,...,temperature,station_id,timestamp,year,month,day,hour,longitude,latitude,station_type_id
0,2942426,2018-04-01 02:00:00,259.0,,98.0,3.4,56.0,57.0,3.532817,beijing_grid_303,...,5.0,dongsi_aq,2018-04-01 02:00:00,2018,4,1,2,116.417,39.929,0
1,2942427,2018-04-01 02:00:00,250.0,,95.0,3.1,64.0,28.0,1.665762,beijing_grid_303,...,5.0,tiantan_aq,2018-04-01 02:00:00,2018,4,1,2,116.407,39.886,0
2,2942428,2018-04-01 02:00:00,240.0,246.0,107.0,2.2,49.0,30.0,4.635416,beijing_grid_282,...,5.0,guanyuan_aq,2018-04-01 02:00:00,2018,4,1,2,116.339,39.929,0
3,2942429,2018-04-01 02:00:00,255.0,260.0,87.0,2.5,65.0,35.0,4.777266,beijing_grid_303,...,5.0,wanshouxigong_aq,2018-04-01 02:00:00,2018,4,1,2,116.352,39.878,0
4,2942430,2018-04-01 02:00:00,266.0,260.0,111.0,3.5,45.0,57.0,2.014976,beijing_grid_304,...,5.0,aotizhongxin_aq,2018-04-01 02:00:00,2018,4,1,2,116.397,39.982,0


### Prepare data for feature extraction
先把原始的三周数据取出来，以便之后计算 statistical features

In [8]:
# Column groups handed to the raw-sample builder.
timestamp_column_name = 'time'
time_columns = ['year', 'month', 'day', 'hour']
weather_attrs = ['humidity', 'pressure', 'temperature']
attrs_to_predict = ['PM25_Concentration', 'PM10_Concentration', 'O3_Concentration']

# The result is a numpy array whose rows are laid out as
# (station_id, station_type_id, time_columns, historical_data -> 24*7*3 * 3, to_be_predicted_values);
# 24*7*3 is three weeks of hourly history.
data_201804 = np.array(
    get_raw_train_test_data(
        aq_all,
        attrs_to_predict,
        time_columns,
        timestamp_column_name,
        weather_attrs,
        24*7*3,
    )
)

In [9]:
# Peek at the six leading columns (station_id, station_type_id, year, month, day, hour).
# NOTE(review): `[:-5]` selects every row except the last five; if the intent was to
# show only the first five rows this should be `[:5]` -- confirm.
data_201804[:-5,:6]

array([['aotizhongxin_aq', '0', '2018', '4', '24', '21'],
       ['aotizhongxin_aq', '0', '2018', '4', '24', '22'],
       ['aotizhongxin_aq', '0', '2018', '4', '24', '23'],
       ..., 
       ['zhiwuyuan_aq', '0', '2018', '4', '29', '12'],
       ['zhiwuyuan_aq', '0', '2018', '4', '29', '13'],
       ['zhiwuyuan_aq', '0', '2018', '4', '29', '14']],
      dtype='<U18')

### Extract features from the raw historical data
根据前面准备的前三周数据提取 statistical features，并提取 holiday feature

In [10]:
# 1. Split the April data into training and validation parts.
# Column 4 holds the day of month (as a string): rows from the 29th become the
# validation set, every other row is used for training.
is_val_day = data_201804[:, 4] == '29'
data_201804_train = data_201804[~is_val_day]
data_201804_val = data_201804[is_val_day]

In [11]:
# (rows, columns) of the full April sample array, before the train/validation split.
data_201804.shape

(3990, 1950)

In [12]:
# 2. Get the features and labels for the training and validation sets.
# Both calls share the same target attribute and three-week history length.
split_kwargs = dict(attr='PM25_Concentration', length=24*7*3)
X_raw_train, y_raw_train = split_features_labels(data=data_201804_train, **split_kwargs)
X_raw_val, y_raw_val = split_features_labels(data=data_201804_val, **split_kwargs)

  r = func(a, **kwargs)


PM25_Concentration
PM2.5
PM25_Concentration
PM2.5


In [13]:
# Show the raw training feature / label shapes, one per line.
print(X_raw_train.shape, y_raw_train.shape, sep='\n')

(3325, 690)
(3325, 48)


注意到 y_raw 有 48 列（shape 为 (3325, 48)），对应我们要 predict 的 48 个小时的数据，不能直接放到 model 去 train，需要用 one-hot 编码表示要预测的是之后第几个小时
X_raw已经含有前三周的一些特征，包括最后一周的原始数据，三周内每一天的statistical 值[min max mean median var],每一周statistical的值,holiday feature 的值->[是否放假第一天，是否放假前一天，是否放假最后一天，是否上班第一天]

In [14]:
# Cast the raw feature / label arrays to 64-bit floats before building samples
# (presumably they are still string-typed, like data_201804 -- TODO confirm).
# np.float64 is used because the `np.float` alias (a plain alias of the builtin
# float) was deprecated in NumPy 1.20 and removed in 1.24, where it raises
# AttributeError; the resulting dtype is the same float64 as before.
X_raw_train = X_raw_train.astype(np.float64)
y_raw_train = y_raw_train.astype(np.float64)
X_raw_val = X_raw_val.astype(np.float64)
y_raw_val = y_raw_val.astype(np.float64)

In [15]:
# Build the final samples with get_train_data_final and stack them into one 2-D
# array per split; the last column is the label, everything before it is features.
train_201804 = np.vstack(get_train_data_final(X_raw_train, y_raw_train))
train_x, train_y = train_201804[:, :-1], train_201804[:, -1]

val_201804 = np.vstack(get_train_data_final(X_raw_val, y_raw_val))
val_x, val_y = val_201804[:, :-1], val_201804[:, -1]

In [16]:
# Save the arrays for later use.
# NOTE(review): the x files are named 201804_* but the y files are named 201805_*,
# although all four arrays come from the same 201804 data -- this looks like a typo;
# confirm against whatever code loads these files before renaming.
# NOTE(review): the output directory is a hard-coded absolute path.
np.save('/mnt/disks/bdt/201804_PM25_train_x.npy',train_x)
np.save('/mnt/disks/bdt/201805_PM25_train_y.npy',train_y)
np.save('/mnt/disks/bdt/201804_PM25_val_x.npy',val_x)
np.save('/mnt/disks/bdt/201805_PM25_val_y.npy',val_y)