# Create PM2.5 Dataset

This notebook reads in the output of `epa_reorganize.ipynb` and creates a *time-by-station* table of PM2.5 observations.

In [1]:
# Loading libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re, os

# Station names that become the columns of the output table, in output column order.
# The list is hard-coded; it could alternatively be taken from the header of an
# existing product such as pm25_2015.csv.
stnlist = [
    '嘉義', '龍潭', '淡水', '湖口', '苗栗', '美濃', '大園', '前鎮',
    '基隆', '板橋', '古亭', '南投', '潮州', '小港', '仁武', '花蓮',
    '馬祖', '崙背', '萬華', '冬山', '竹東', '松山', '沙鹿', '忠明',
    '大寮', '頭份', '屏東', '中壢', '恆春', '新竹', '士林', '觀音',
    '線西', '左營', '埔里', '新店', '新莊', '永和', '菜寮', '朴子',
    '彰化', '汐止', '復興', '前金', '平鎮', '西屯', '林口', '桃園',
    '豐原', '三義', '宜蘭', '鳳山', '大里', '二林', '楠梓', '竹山',
    '橋頭', '土城', '陽明', '善化', '萬里', '三重', '新營', '斗六',
    '中山', '新港', '安南', '林園',
]

# Report how many stations will be processed.
print(len(stnlist))

68


In [2]:
import pickle, os, re
import numpy as np
import pandas as pd
# Load the previously processed hourly records from the pickle file.
# NOTE: unpickling executes arbitrary code, so only load pickle files you trust.
alldata = pd.read_pickle('epa_2000_2014.pkl')
# Order the rows by station, then date, then hour label.
alldata = alldata.sort_values(by=['station', 'date', 'hour'])
alldata.head()

Unnamed: 0,date,hour,PM2.5,SO2,O3,WIND_SPEED,AMB_TEMP,RH,CO,WIND_DIREC,NO2,PM10,NO,RAINFALL,NOx,station
324624,2008-01-01 00:00:00,h00,19.0,6.6,25.5,6.66,8.21,65.07,0.29,50.970001,10.62,51.0,0.35,0.0,10.97,三義
324625,2008-01-01 00:00:00,h01,19.0,6.7,26.299999,6.43,8.49,64.080002,0.27,40.48,8.89,53.0,0.41,0.0,9.3,三義
324626,2008-01-01 00:00:00,h02,19.0,6.4,27.299999,7.21,8.59,60.369999,0.26,46.060001,8.19,49.0,0.39,0.0,8.58,三義
324627,2008-01-01 00:00:00,h03,18.0,6.1,28.6,7.58,8.53,61.529999,0.26,41.810001,6.83,46.0,0.35,0.0,7.18,三義
324628,2008-01-01 00:00:00,h04,20.0,6.2,29.200001,7.8,8.44,60.57,0.26,38.5,6.57,47.0,0.45,0.0,7.02,三義


In [3]:
# Convert 'date' to a proper datetime64 column.
# The recorded outputs show the dates rendering with a time component before this
# cell and without one after it, which suggests the pickled column holds datetime
# objects under a generic dtype (assumption - verify with alldata.dtypes).
# An explicit pd.to_datetime states the intent directly instead of relying on
# pandas re-inferring the dtype from a Python list, and avoids materialising
# every value as a Python object first. A real datetime64 column is what the
# later merge against the pd.date_range-based time grid needs for its keys to match.
alldata['date'] = pd.to_datetime(alldata['date'])
alldata.head()

Unnamed: 0,date,hour,PM2.5,SO2,O3,WIND_SPEED,AMB_TEMP,RH,CO,WIND_DIREC,NO2,PM10,NO,RAINFALL,NOx,station
324624,2008-01-01,h00,19.0,6.6,25.5,6.66,8.21,65.07,0.29,50.970001,10.62,51.0,0.35,0.0,10.97,三義
324625,2008-01-01,h01,19.0,6.7,26.299999,6.43,8.49,64.080002,0.27,40.48,8.89,53.0,0.41,0.0,9.3,三義
324626,2008-01-01,h02,19.0,6.4,27.299999,7.21,8.59,60.369999,0.26,46.060001,8.19,49.0,0.39,0.0,8.58,三義
324627,2008-01-01,h03,18.0,6.1,28.6,7.58,8.53,61.529999,0.26,41.810001,6.83,46.0,0.35,0.0,7.18,三義
324628,2008-01-01,h04,20.0,6.2,29.200001,7.8,8.44,60.57,0.26,38.5,6.57,47.0,0.45,0.0,7.02,三義


In [4]:
# Create a pseudo table with full time-stamps: one row for every (date, hour)
# combination in 2000-2014, whether or not any station reported at that time.
dates = pd.date_range(start='2000-01-01', end='2014-12-31', freq='D')
# Hour labels are taken from the data itself and sorted.
hours = pd.Series(sorted(list(set(alldata['hour']))))
# Cartesian product dates x hours, with date varying slowest. This yields the same
# rows, order and columns as a nested Python loop, without building one dict per row.
thead = pd.MultiIndex.from_product([dates, hours], names=['date', 'hour']).to_frame(index=False)
# Verify data dimension: 15 years * 365 days + 4 leap days (2000, 2004, 2008, 2012),
# times 24 hours per day. The two printed numbers should agree.
print(thead.shape)
print((15*365+4)*24)
# Take a peek at the data
print(thead.head())

(131496, 2)
131496
        date hour
0 2000-01-01  h00
1 2000-01-01  h01
2 2000-01-01  h02
3 2000-01-01  h03
4 2000-01-01  h04


In [5]:
# Retrieve PM2.5 data from the full dataset
pm25 = alldata.loc[:, ['date', 'hour', 'station', 'PM2.5']]
# Start from the complete (date, hour) grid so that time-stamps without an
# observation are kept as rows with missing values.
pm25_bs = thead
# Left-merge one station at a time onto the grid.
for s in stnlist:
    # Name the value column after the station *before* merging. Merging many frames
    # that all carry a 'PM2.5' column makes pandas add '_x'/'_y' suffixes, which
    # collide after a few iterations (duplicate labels; a MergeError on pandas >= 2).
    tmp = (
        pm25.loc[pm25['station'] == s, ['date', 'hour', 'PM2.5']]
        .rename(columns={'PM2.5': s})
    )
    # Progress log: station name and the number of records found for it.
    print(s)
    print(tmp.shape)
    pm25_bs = pm25_bs.merge(tmp, on=['date', 'hour'], how='left')
# Sort and clean up
pm25_bs = pm25_bs.sort_values(['date', 'hour'])
# Select the columns by label (time keys first, then the stations in stnlist order)
# rather than overwriting the labels by position.
cnames = list(pm25.columns[:2]) + stnlist
pm25_bs = pm25_bs[cnames]
print(pm25_bs.shape)
pm25_bs.head()

嘉義
(130824, 3)
龍潭
(130944, 3)
淡水
(131496, 3)
湖口
(130992, 3)
苗栗
(131184, 3)
美濃
(131328, 3)
大園
(126768, 3)
前鎮
(131160, 3)
基隆
(130704, 3)
板橋
(130080, 3)
古亭
(131304, 3)
南投
(131472, 3)
潮州
(130752, 3)
小港
(130992, 3)
仁武
(131256, 3)
花蓮
(131472, 3)
馬祖
(131424, 3)
崙背
(131088, 3)
萬華
(131328, 3)
冬山
(131112, 3)
竹東
(131352, 3)
松山
(130056, 3)
沙鹿
(131424, 3)
忠明
(130344, 3)
大寮
(131376, 3)
頭份
(131208, 3)
屏東
(131208, 3)
中壢
(131016, 3)
恆春
(131184, 3)
新竹
(130536, 3)
士林
(130896, 3)
觀音
(131232, 3)
線西
(131280, 3)
左營
(130824, 3)
埔里
(125136, 3)
新店
(131064, 3)
新莊
(129912, 3)
永和
(130800, 3)
菜寮
(129144, 3)
朴子
(130992, 3)
彰化
(131088, 3)
汐止
(131064, 3)
復興
(131256, 3)
前金
(131280, 3)
平鎮
(131232, 3)
西屯
(131472, 3)
林口
(131136, 3)
桃園
(131424, 3)
豐原
(130728, 3)
三義
(131040, 3)
宜蘭
(130896, 3)
鳳山
(131352, 3)
大里
(131424, 3)
二林
(131376, 3)
楠梓
(130968, 3)
竹山
(131064, 3)
橋頭
(131088, 3)
土城
(131496, 3)
陽明
(131472, 3)
善化
(131160, 3)
萬里
(131304, 3)
三重
(130536, 3)
新營
(131304, 3)
斗六
(131280, 3)
中山
(131136, 3)
新港
(131208, 3)
安南
(131160

Unnamed: 0,date,hour,嘉義,龍潭,淡水,湖口,苗栗,美濃,大園,前鎮,...,陽明,善化,萬里,三重,新營,斗六,中山,新港,安南,林園
0,2000-01-01,h00,,,,,,,,,...,,,,26.030001,,,,,,68.830002
1,2000-01-01,h01,,,,,,,,,...,,,,24.790001,,,,,,96.830002
2,2000-01-01,h02,,,,,,,,,...,,,,23.139999,,,,,,125.970001
3,2000-01-01,h03,,,,,,,,,...,,,,23.940001,,,,,,125.599998
4,2000-01-01,h04,,,,,,,,,...,,,,17.139999,,,,,,118.230003


In [6]:
# Write the time-by-station PM2.5 table to disk as CSV.
# index=False leaves out the DataFrame's row index.
output_csv = 'pm25_2000_2014.csv'
pm25_bs.to_csv(output_csv, index=False)