# Create PM2.5 Dataset

This notebook reads the output of `epa_reorganize.ipynb` and creates a *time-by-station* table of PM2.5 observations.

In [1]:
# Loading libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re, os

# Parameters: the monitoring stations that become the columns of the output
# table, listed in the order the columns will appear.
# (A previously commented-out alternative took these names from the columns of
# 'pm25_2015.csv', skipping the first two.)
stnlist = [
    '嘉義', '龍潭', '淡水', '湖口', '苗栗', '美濃', '大園', '前鎮',
    '基隆', '板橋', '古亭', '南投', '潮州', '小港', '仁武', '花蓮',
    '馬祖', '崙背', '萬華', '冬山', '竹東', '松山', '沙鹿', '忠明',
    '大寮', '頭份', '屏東', '中壢', '恆春', '新竹', '士林', '觀音',
    '線西', '左營', '埔里', '新店', '新莊', '永和', '菜寮', '朴子',
    '彰化', '汐止', '復興', '前金', '平鎮', '西屯', '林口', '桃園',
    '豐原', '三義', '宜蘭', '鳳山', '大里', '二林', '楠梓', '竹山',
    '橋頭', '土城', '陽明', '善化', '萬里', '三重', '新營', '斗六',
    '中山', '新港', '安南', '林園',
]

# Report how many stations are configured
n_stations = len(stnlist)
print(n_stations)

68


In [2]:
import pickle, os, re
import numpy as np
import pandas as pd
# Read the previously processed table and order its rows by station,
# then date, then hour.
# NOTE: unpickling can execute arbitrary code -- only load pickle files
# that come from a trusted source.
source_pickle = 'epa_2000_2014.pkl'
row_order = ['station', 'date', 'hour']
alldata = pd.read_pickle(source_pickle).sort_values(row_order)
alldata.head()

Unnamed: 0,PM2.5,WIND_DIREC,CO,PM10,NOx,SO2,O3,NO,RAINFALL,AMB_TEMP,NO2,WIND_SPEED,date,hour,RH,station
298176,19.0,50.970001,0.29,51.0,10.97,6.6,25.5,0.35,0.0,8.21,10.62,6.66,2008-01-01 00:00:00,h00,65.07,三義
298177,19.0,40.48,0.27,53.0,9.3,6.7,26.299999,0.41,0.0,8.49,8.89,6.43,2008-01-01 00:00:00,h01,64.080002,三義
298178,19.0,46.060001,0.26,49.0,8.58,6.4,27.299999,0.39,0.0,8.59,8.19,7.21,2008-01-01 00:00:00,h02,60.369999,三義
298179,18.0,41.810001,0.26,46.0,7.18,6.1,28.6,0.35,0.0,8.53,6.83,7.58,2008-01-01 00:00:00,h03,61.529999,三義
298180,20.0,38.5,0.26,47.0,7.02,6.2,29.200001,0.45,0.0,8.44,6.57,7.8,2008-01-01 00:00:00,h04,60.57,三義


In [3]:
# Replace the 'date' column with a plain Python list of its own values,
# then preview the first rows of the table.
date_values = list(alldata['date'])
alldata['date'] = date_values
alldata.head()

Unnamed: 0,PM2.5,WIND_DIREC,CO,PM10,NOx,SO2,O3,NO,RAINFALL,AMB_TEMP,NO2,WIND_SPEED,date,hour,RH,station
298176,19.0,50.970001,0.29,51.0,10.97,6.6,25.5,0.35,0.0,8.21,10.62,6.66,2008-01-01,h00,65.07,三義
298177,19.0,40.48,0.27,53.0,9.3,6.7,26.299999,0.41,0.0,8.49,8.89,6.43,2008-01-01,h01,64.080002,三義
298178,19.0,46.060001,0.26,49.0,8.58,6.4,27.299999,0.39,0.0,8.59,8.19,7.21,2008-01-01,h02,60.369999,三義
298179,18.0,41.810001,0.26,46.0,7.18,6.1,28.6,0.35,0.0,8.53,6.83,7.58,2008-01-01,h03,61.529999,三義
298180,20.0,38.5,0.26,47.0,7.02,6.2,29.200001,0.45,0.0,8.44,6.57,7.8,2008-01-01,h04,60.57,三義


In [4]:
# Create pseudo variables with full time-stamps: one row for every
# (date, hour) combination in the period, whether or not it was observed.
dates = pd.date_range(start='2000-01-01', end='2014-12-31', freq='D')
# Hour labels are taken from the data itself, in sorted order
hours = pd.Series(sorted(set(alldata['hour'])))
# Cartesian product with dates varying slowest -- the same row order as nesting
# "for d in dates: for h in hours", without building one dict per row.
thead = pd.MultiIndex.from_product([dates, hours], names=['date', 'hour']).to_frame(index=False)
# Verify data dimension: 15 years of 365 days plus 4 leap days
# (2000, 2004, 2008, 2012), times 24 hours per day.
expected_rows = (15*365+4)*24
print(thead.shape)
print(expected_rows)
assert thead.shape[0] == expected_rows, \
    'unexpected number of time-stamps; check the hour labels found in alldata'
# Take a peek at the data
print(thead.head())

(131496, 2)
131496
        date hour
0 2000-01-01  h00
1 2000-01-01  h01
2 2000-01-01  h02
3 2000-01-01  h03
4 2000-01-01  h04


In [5]:
# Retrieve PM2.5 data from the full dataset
pm25 = alldata.loc[:, ['date', 'hour', 'station', 'PM2.5']]
# Start with the empty dataset of full time-stamps
pm25_bs = thead
# Merge by station.
# Each station's value column is renamed to the station name *before* merging.
# Merging many frames that all carry a column called 'PM2.5' makes pandas add
# '_x'/'_y' suffixes, which collide after a few iterations (duplicate
# 'PM2.5_x'); recent pandas versions refuse that with a MergeError.
# A station with no rows in `pm25` still gets a column (all missing), because
# the merge is a left join on the full time-stamp table.
for s in stnlist:
    tmp = pm25.loc[pm25['station'] == s, ['date', 'hour', 'PM2.5']]
    print(s)
    print(tmp.shape)
    pm25_bs = pm25_bs.merge(tmp.rename(columns={'PM2.5': s}),
                            on=['date', 'hour'], how='left')
# Sort and clean up
pm25_bs = pm25_bs.sort_values(['date', 'hour'])
# Expected layout: the two time-stamp columns followed by one column per station
cnames = list(pm25.columns[:2]) + stnlist
assert list(pm25_bs.columns) == cnames, 'unexpected column layout after merging'
# A left join only keeps one row per time-stamp if each station has at most
# one record per (date, hour); duplicates would multiply rows.
assert len(pm25_bs) == len(thead), 'duplicate (date, hour) records for some station'
print(pm25_bs.shape)
pm25_bs.head()

嘉義
(90528, 3)
龍潭
(84552, 3)
淡水
(83640, 3)
湖口
(83400, 3)
苗栗
(83112, 3)
美濃
(82992, 3)
大園
(83904, 3)
前鎮
(88416, 3)
基隆
(89784, 3)
板橋
(84456, 3)
古亭
(0, 3)
南投
(90864, 3)
潮州
(89520, 3)
小港
(89832, 3)
仁武
(90864, 3)
花蓮
(90000, 3)
馬祖
(88776, 3)
崙背
(88344, 3)
萬華
(94872, 3)
冬山
(83064, 3)
竹東
(83400, 3)
松山
(83976, 3)
沙鹿
(91224, 3)
忠明
(0, 3)
大寮
(91176, 3)
頭份
(83544, 3)
屏東
(90984, 3)
中壢
(83040, 3)
恆春
(82392, 3)
新竹
(83352, 3)
士林
(84360, 3)
觀音
(84168, 3)
線西
(83184, 3)
左營
(83664, 3)
埔里
(90048, 3)
新店
(83808, 3)
新莊
(90288, 3)
永和
(86736, 3)
菜寮
(83856, 3)
朴子
(87144, 3)
彰化
(84024, 3)
汐止
(83976, 3)
復興
(86904, 3)
前金
(88176, 3)
平鎮
(84048, 3)
西屯
(83928, 3)
林口
(83688, 3)
桃園
(84696, 3)
豐原
(83784, 3)
三義
(83112, 3)
宜蘭
(89496, 3)
鳳山
(0, 3)
大里
(90744, 3)
二林
(84792, 3)
楠梓
(90624, 3)
竹山
(84624, 3)
橋頭
(97800, 3)
土城
(84024, 3)
陽明
(83832, 3)
善化
(83880, 3)
萬里
(94080, 3)
三重
(0, 3)
新營
(89712, 3)
斗六
(87432, 3)
中山
(90072, 3)
新港
(86904, 3)
安南
(88104, 3)
林園
(0, 3)
(131496, 70)


Unnamed: 0,date,hour,嘉義,龍潭,淡水,湖口,苗栗,美濃,大園,前鎮,...,陽明,善化,萬里,三重,新營,斗六,中山,新港,安南,林園
0,2000-01-01,h00,,,,,,,,,...,,,,,,,,,,
1,2000-01-01,h01,,,,,,,,,...,,,,,,,,,,
2,2000-01-01,h02,,,,,,,,,...,,,,,,,,,,
3,2000-01-01,h03,,,,,,,,,...,,,,,,,,,,
4,2000-01-01,h04,,,,,,,,,...,,,,,,,,,,


In [6]:
# Write the time-by-station table to CSV, leaving out the row index
output_csv = 'pm25_2000_2014.csv'
pm25_bs.to_csv(output_csv, index=False)