In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.metrics import mean_absolute_error

For Beijing:
* Air Quality data: https://biendata.com/competition/airquality/bj/2018-04-01-0/2018-04-01-23/2k0d1d8
* Observed Meteorology: https://biendata.com/competition/meteorology/bj/2018-04-01-0/2018-04-01-23/2k0d1d8
* Meteorology Grid Data: https://biendata.com/competition/meteorology/bj_grid/2018-04-01-0/2018-04-01-23/2k0d1d8

For London:
* Air Quality data: https://biendata.com/competition/airquality/ld/2018-04-01-0/2018-04-01-23/2k0d1d8
* Meteorology Grid Data: https://biendata.com/competition/meteorology/ld_grid/2018-04-01-0/2018-04-01-23/2k0d1d8

In [2]:
df_aq = pd.read_csv("../input/beijing_1701_1803.csv")
df_meo = pd.read_csv("../input/Beijing_historical_meo_grid.csv")
df_station = pd.read_csv("../input/Beijing_AirQuality_Stations.csv")
target_cols = ['PM2.5', 'PM10', 'O3']

df_aq.utc_time = pd.to_datetime(df_aq.utc_time)
df_meo.utc_time = pd.to_datetime(df_meo.utc_time)

dfs = []
for stationId in df_aq.stationId.unique():
    df = df_aq[df_aq['stationId']==stationId]
    df = df.resample('1H', on='utc_time').sum()
    df = df.reset_index()
    df['stationId'] = stationId
    dfs.append(df)
df_ = pd.concat(dfs)

## Description

In [None]:
df_aq.head()

In [None]:
df_aq.describe()

In [None]:
df_meo.head()

In [None]:
df_meo.describe()

In [None]:
df_station.head()

In [None]:
df_station.describe()

## 合併aq和station

In [None]:
# df_aq_station = pd.merge(df_aq, df_station, how='left',  left_on='stationId', right_on='Station_ID',)
# del df_aq_station['Station_ID']
# df_aq_station.to_csv("../input/beijing_1701_1803_station.csv", index=False)

# Workspace

In [None]:
df_.longitude = df_.longitude.round(1)
df_.latitude = df_.latitude.round(1)

df_ = pd.merge(df_, df_station, 'left', left_on='stationId', right_on='Station_ID')
df_ = pd.merge(df_, df_meo, 'left', 
              left_on=['utc_time', 'longitude', 'latitude'], 
              right_on=['utc_time', 'longitude', 'latitude']
             )
_, bins = pd.cut(df_['wind_direction'], bins=16, retbins=True)
df_['wind_direction'] = pd.cut(df_['wind_direction'], bins=16, labels=False)

le_wind_direction = LabelEncoder()
le_wind_direction.fit(df_.station_type)
df_.station_type = le_wind_direction.transform(df_.station_type)


# le_stationId = LabelEncoder()
# le_stationId.fit(df_.stationId)
# df_.stationId = le_stationId.transform(df_.stationId)

# df_ = pd.get_dummies(df_, columns=['stationId'])

# df_['year'] = df_.utc_time.dt.year
# df_['month'] = df_.utc_time.dt.month
# df_['day'] = df_.utc_time.dt.day
# df_['hour'] = df_.utc_time.dt.hour
# df_['weekday'] = df_.utc_time.dt.weekday

In [None]:
df = df_[pd.notnull(df_['PM2.5']) & 
          pd.notnull(df_['PM10']) &
          pd.notnull(df_['O3'])]
del df['SO2']
del df['CO']
del df['NO2']
del df['Station_ID']
del df['stationName']
del df['longitude']
del df['latitude']

In [None]:
y_col = 'PM2.5'
X_cols = [col for col in df.columns if col not in [y_col, 'stationId', 'utc_time']]

df_train = df[df.utc_time < "2018-03-01 00:00:00"]
df_test = df[df.utc_time >= "2018-03-01 00:00:00"]

X_train = df_train[X_cols]
y_train = df_train[y_col]
X_test = df_test[X_cols]
y_test = df_test[y_col]

# Modeling

In [41]:
# %%time

# from sklearn.model_selection import KFold, train_test_split, GridSearchCV
# import xgboost as xgb
# from lightgbm.sklearn import LGBMRegressor

# def SMAPE(y_pred, dtrain): 
#     y_true = dtrain.get_label()
#     return 'SMAPE', np.mean(np.abs((y_pred-y_true) / (y_pred+y_true))) * 2

# def MAE(y_pred, dtrain):
#     y_true = dtrain.get_label()
#     return 'MAE', mean_absolute_error(y_pred, y_true)


# dtrain = xgb.DMatrix(X_train, y_train)
# dtest = xgb.DMatrix(X_test, y_test, nthread=-1)
# param = {
#     'max_depth': 3, 
#     'learning_rate': 0.1,
#     'n_estimators': 200,
#     'objective':'gpu:reg:linear',
#     'tree_method': 'gpu_hist',
#     'eval_metric': 'mae'
# }

# watchlist = [(dtrain, 'train'), (dtest, 'eval')]
# # xgb.train(param, dtrain, num_boost_round=500, evals=watchlist , feval=SMAPE, early_stopping_rounds=10)
# gpu_res = {}
# xgb.train(param, dtrain, num_boost_round=500, evals=watchlist , early_stopping_rounds=10, evals_result=gpu_res)
# # y_pred = xgb_model.predict(X_test)
# # print(mean_squared_error(y_test, y_pred))

In [None]:
xgb.XGBRegressor()

In [None]:
df

# 補值

##  計算離每個aq最近的aq station

In [3]:
idx_to_station = {k:v for k,v in df_station.Station_ID.iteritems()}
near_stations = []
for k, row in df_station.iterrows():
    result =  np.sqrt((111*(df_station.longitude-116.417))**2 + (74*(df_station.latitude-39.929))**2).nsmallest(4)
    near_idx = np.sqrt((111*(df_station.longitude-row[1]))**2 + (74*(df_station.latitude-row[2]))**2).nsmallest(4).index[1:4].tolist()
    near_station = [idx_to_station[idx] for idx in near_idx]
    near_stations.append(near_station)
    print(k, row[0])
    print(result, "\n")
    
df_station_near = pd.concat([df_station, pd.DataFrame(near_stations, columns=['near_1', 'near_2', 'near_3'])], axis=1 )

0 dongsi_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

1 tiantan_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

2 guanyuan_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

3 wanshouxigong_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

4 aotizhongxin_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

5 nongzhanguan_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

6 wanliu_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

7 beibuxinqu_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

8 zhiwuyuan_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

9 fengtaihuayuan_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715
dtype: float64 

10 yungang_aq
0     0.000000
30    3.300267
1     3.370048
4     4.506715

In [4]:
try:
    del df_impute
except:
    pass
df_impute = pd.merge(df_, df_station_near, 'left', left_on='stationId', right_on='Station_ID')
nears = ['near_1', 'near_2', 'near_3']
for near in nears:
    near_col = df_impute.iloc[0][near]
    df_right = df_impute[target_cols + ['stationId','utc_time']][df_impute.stationId==near_col]
    df_impute = pd.merge(df_impute, df_right, 
                         how='left', suffixes=('', '_'+ near),
                         left_on=['utc_time', near], 
                         right_on=['utc_time', 'stationId'])

In [6]:
def stKNN(df_impute, station):
    MSEs = []
    df_stKNNs_ = []
    df_impute_station = df_impute[df_impute.stationId==station]
    df_impute_station = df_impute_station.sort_values('utc_time')
    
    for target in target_cols:
        print(station, target)
        
        # Temperial 
        for i in [-2, -1, 1, 2]:
            df_impute_station[target + "_t{0:+}".format(i)] = df_impute_station[target].shift(i*-1)
            
        cols = [target + "_t{0:+}".format(i) for i in [-2, -1, 1, 2]]
        cols += [target+'_near_1', target+'_near_2', target+'_near_3', target, 'utc_time']
        df_stKNN_ = df_impute_station[cols]
        notnull_idxs = df_stKNN_[target].notnull().index # for evaluation

        # Iterate it until no null value
        while df_stKNN_[df_stKNN_[target].isnull()].shape[0]!=0:
            df_stKNN_[target+"_pred"] = df_stKNN_[cols].mean(axis=1)
            df_stKNN_.loc[df_stKNN_[target].isnull(), target] = df_stKNN_[target+'_pred']
            for i in [-2, -1, 1, 2]:
                df_stKNN_[target + "_t{0:+}".format(i)] = df_stKNN_[target].shift(i*-1)
            print(df_stKNN_[df_stKNN_[target].isnull()].shape[0], end=" ")

        df_stKNNs_.append(df_stKNN_[['utc_time', target]])
        mse = mean_absolute_error(df_stKNN_[target][notnull_idxs], df_stKNN_[target+'_pred'][notnull_idxs])
        MSEs.append([station + "_" + target, mse])

    df_stKNN = pd.concat(df_stKNNs_, axis=1)
    df_stKNN = df_stKNN.T.drop_duplicates().T
    df_stKNN['stationId'] = station
    return df_stKNN, MSEs
    
# df_stKNN, MSEs = stKNN(df_impute)

In [7]:
# import concurrent.futures
# import urllib.request

# # We can use a with statement to ensure threads are cleaned up promptly
# df_stKNNs = []
# MSEs = []
# with concurrent.futures.ProcessPoolExecutor() as executor:
#     # Start the load operations and mark each future with its URL
#     result = {executor.submit(stKNN, df_impute, station): station for station in df_impute.stationId.unique()}
#     for future in concurrent.futures.as_completed(result):
#         station = result[future]
#         try:
#             df_stKNN, MSEs_ = future.result()
#         except Exception as exc:
#             print('%r generated an exception: %s' % (station, exc))
#         else:
#             df_stKNNs.append(df_stKNN)
#             MSEs += MSEs_
#             print(MSEs_)

aotizhongxin_aq PM2.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


badaling_aq PM2.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


beibuxinqu_aq PM2.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


daxing_aq PM2.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s


dingling_aq PM2.5


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-

aotizhongxin_aq PM10
badaling_aq PM10
beibuxinqu_aq PM10
daxing_aq PM10
dingling_aq PM10
aotizhongxin_aq O3
beibuxinqu_aq O3
daxing_aq O3
badaling_aq O3
dingling_aq O3
[['aotizhongxin_aq_PM2.5', 4.9806487111575519], ['aotizhongxin_aq_PM10', 8.3050988624136206], ['aotizhongxin_aq_O3', 5.8494724840293975]]
donggaocun_aq PM2.5
[['beibuxinqu_aq_PM2.5', 5.5138112951726601], ['beibuxinqu_aq_PM10', 11.382999880580629], ['beibuxinqu_aq_O3', 4.9097672021242422]]
dongsi_aq PM2.5
dongsihuan_aq PM2.5
[['daxing_aq_PM2.5', 5.1499640726318274], ['daxing_aq_PM10', 10.86026938476861], ['daxing_aq_O3', 5.9373149627033994]]
[['badaling_aq_PM2.5', 4.4590309838005435], ['badaling_aq_PM10', 10.584676967015499], ['badaling_aq_O3', 6.1399353426851997]]
fangshan_aq PM2.5
[['dingling_aq_PM2.5', 4.5606059813153781], ['dingling_aq_PM10', 7.1559744649788968], ['dingling_aq_O3', 5.4981647411413412]]
fengtaihuayuan_aq PM2.5
donggaocun_aq PM10
dongsi_aq PM10
fengtaihuayuan_aq PM10
fangshan_aq PM10
donggaocun_aq O3
do

Process Process-2:
Traceback (most recent call last):
Process Process-1:
Process Process-3:
Traceback (most recent call last):
Process Process-5:
Traceback (most recent call last):
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/usr/lib/python3.4/multiprocessing/process.py", line 254, in _bootstrap
    self.run()
  File "/usr/lib/python3.4/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.4/concurrent/futures/process.py", line 125, in _process_worker
    call_item = call_queue.get(block=True)
  File "/usr/lib/python3.4/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/l

KeyboardInterrupt: 

In [20]:
a

Unnamed: 0,utc_time,PM2.5,PM10,O3,stationId
370532,2017-01-01 22:00:00,458,490,8,zhiwuyuan_aq
370533,2017-01-01 23:00:00,442,497,8,zhiwuyuan_aq
370534,2017-01-02 00:00:00,441,493,9,zhiwuyuan_aq
370535,2017-01-02 01:00:00,241,495,17,zhiwuyuan_aq
370536,2017-01-02 02:00:00,13,493,33,zhiwuyuan_aq
370537,2017-01-02 03:00:00,7,494,50,zhiwuyuan_aq
370538,2017-01-02 04:00:00,9,493,57,zhiwuyuan_aq
370539,2017-01-02 05:00:00,27,493.5,37,zhiwuyuan_aq
370540,2017-01-02 06:00:00,24,493,60,zhiwuyuan_aq
370541,2017-01-02 07:00:00,20,399.969,42,zhiwuyuan_aq
