In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error
import requests

For Beijing:
* Air Quality data: https://biendata.com/competition/airquality/bj/2018-04-01-0/2018-04-01-23/2k0d1d8
* Observed Meteorology: https://biendata.com/competition/meteorology/bj/2018-04-01-0/2018-04-01-23/2k0d1d8
* Meteorology Grid Data: https://biendata.com/competition/meteorology/bj_grid/2018-04-01-0/2018-04-01-23/2k0d1d8

For London:
* Air Quality data: https://biendata.com/competition/airquality/ld/2018-04-01-0/2018-04-01-23/2k0d1d8
* Meteorology Grid Data: https://biendata.com/competition/meteorology/ld_grid/2018-04-01-0/2018-04-01-23/2k0d1d8
* 48-future http://kdd.caiyunapp.com/competition/forecast/bj/2018-04-29-0/2k0d1d8

* `wget https://biendata.com/competition/airquality/ld/2018-03-30-0/2018-04-28-23/2k0d1d8 -O ld_aq_0330_0428.csv`
* `wget https://biendata.com/competition/airquality/bj/2018-03-30-0/2018-04-28-23/2k0d1d8 -O bj_aq_0330_0428.csv`
* `wget https://biendata.com/competition/meteorology/bj_grid/2018-03-30-0/2018-04-28-23/2k0d1d8 -O bj_meo_0330_0428.csv`
* `wget https://biendata.com/competition/meteorology/ld_grid/2018-03-30-0/2018-04-28-23/2k0d1d8 -O ld_meo_0330_0428.csv`

In [2]:
df_aq = pd.read_csv("../input/beijing_1701_1803.csv")
df_meo = pd.read_csv("../input/Beijing_historical_meo_grid.csv")
df_station = pd.read_csv("../input/Beijing_AirQuality_Stations.csv")
target_cols = ['PM2.5', 'PM10', 'O3']

df_aq.utc_time = pd.to_datetime(df_aq.utc_time)
df_meo.utc_time = pd.to_datetime(df_meo.utc_time)

dfs = []
for stationId in df_aq.stationId.unique():
    df = df_aq[df_aq['stationId']==stationId]
    df = df.resample('1H', on='utc_time').sum()
    df = df.reset_index()
    df['stationId'] = stationId
    dfs.append(df)
df_ = pd.concat(dfs)

## 合併aq和station

In [None]:
# df_aq_station = pd.merge(df_aq, df_station, how='left',  left_on='stationId', right_on='Station_ID',)
# del df_aq_station['Station_ID']
# df_aq_station.to_csv("../input/beijing_1701_1803_station.csv", index=False)

# Workspace

In [23]:
df_ = pd.merge(df_, df_station, 'left', left_on='stationId', right_on='Station_ID')
df_ = pd.merge(df_, df_meo, 'left', 
              left_on=['utc_time', 'longitude', 'latitude'], 
              right_on=['utc_time', 'longitude', 'latitude']
             )
df_.longitude = df_.longitude.round(1)
df_.latitude = df_.latitude.round(1)

_, bins = pd.cut(df_['wind_direction'], bins=16, retbins=True)
df_['wind_direction'] = pd.cut(df_['wind_direction'], bins=16, labels=False)

le_wind_direction = LabelEncoder()
le_wind_direction.fit(df_.station_type)
df_.station_type = le_wind_direction.transform(df_.station_type)


le_stationId = LabelEncoder()
le_stationId.fit(df_.stationId)
df_.stationId = le_stationId.transform(df_.stationId)

df_ = pd.get_dummies(df_, columns=['stationId'])

df_['year'] = df_.utc_time.dt.year
df_['month'] = df_.utc_time.dt.month
df_['day'] = df_.utc_time.dt.day
df_['hour'] = df_.utc_time.dt.hour
df_['weekday'] = df_.utc_time.dt.weekday

AttributeError: 'DataFrame' object has no attribute 'longitude'

In [None]:
df = df_[pd.notnull(df_['PM2.5']) & 
          pd.notnull(df_['PM10']) &
          pd.notnull(df_['O3'])]
del df['SO2']
del df['CO']
del df['NO2']
del df['Station_ID']
del df['stationName']
del df['longitude']
del df['latitude']

In [None]:
y_col = 'PM2.5'
X_cols = [col for col in df.columns if col not in [y_col, 'stationId', 'utc_time']]

df_train = df[df.utc_time < "2018-03-01 00:00:00"]
df_test = df[df.utc_time >= "2018-03-01 00:00:00"]

X_train = df_train[X_cols]
y_train = df_train[y_col]
X_test = df_test[X_cols]
y_test = df_test[y_col]

In [74]:
def fetch_aq(city):
    end_time = pd.datetime.now().strftime('%Y-%m-%d-%H')
    r = requests.get("https://biendata.com/competition/airquality/" + city + "/2018-03-30-0/" + end_time + "/2k0d1d8")
    df = pd.DataFrame([i.decode('utf8').split(',') for i in r.content.splitlines()])
    df.columns = df.iloc[0]
    df = df.reindex(df.index.drop(0))
    del df['id']
    df.columns = ['stationId', 'utc_time', 'PM2.5', 'PM10', 'NO2', 'CO', 'O3', 'SO2']
    return df

df = fetch_aq("bj")

In [69]:
def fetch_meo_forecast(city, time):
    r = requests.get("http://kdd.caiyunapp.com/competition/forecast/" + city + "/" + time + "/2k0d1d8")
    df = pd.DataFrame([i.decode('utf8').split(',') for i in r.content.splitlines()])
    df.columns = df.iloc[0]
    df = df.reindex(df.index.drop(0))
    return df

df = fetch_meo_forecast("ld", "2018-04-28-23")
df.forecast_time = pd.to_datetime(df.forecast_time)
df.forecast_time.min()