In [1]:
#  Load the "autoreload" extension so that edits to imported modules are picked up without restarting the kernel
%load_ext autoreload
%reload_ext autoreload
from pathlib import Path

#  always reload modules so that as you change code in src, it gets loaded
%autoreload 2
%matplotlib inline

import sys
sys.path.append('../')
from src.imports import *
from src.data.download_data import *
from src.data.fire_data import *
from src.data.read_data import *
from src.gen_functions import *
from src.features.dataset import Dataset
from src.features.build_features import *
from src.models.train_model import *
import seaborn as sns
output_notebook()
# set font size 
from src.visualization.visualize import *
from src.models.train_model import *


from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import ExtraTreeRegressor
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [6]:
def get_lag(x_cols, n, df=None):
    """Return the selected columns shifted by ``n`` rows, as a new DataFrame.

    Args:
        x_cols: a column label or a list/Index of column labels to lag.
        n: number of rows to shift by (passed straight to ``DataFrame.shift``).
            With a positive ``n`` row ``i`` receives the value of row ``i - n``
            and the first ``n`` rows become NaN.
        df: DataFrame to take the columns from. When omitted, the
            notebook-level ``data_df`` is used, so existing two-argument calls
            keep working.

    Returns:
        A DataFrame holding only the lagged columns, each renamed to
        ``<original name>_lag_<n>``. The source frame is not modified.
    """
    if df is None:
        # Backward-compatible fallback to the notebook global.
        df = data_df
    if isinstance(x_cols, str):
        # A bare label would select a Series, which has no ``columns`` to rename.
        x_cols = [x_cols]
    lag_df = df[x_cols].shift(n)
    # f-string formatting also copes with non-string column labels.
    lag_df.columns = [f'{s}_lag_{n}' for s in lag_df.columns]
    return lag_df

In [2]:
# prepare a list of ml models
def get_models(models=None):
    """Build the dictionary of candidate regressors to compare.

    Args:
        models: optional dict to add the estimators to. When given, it is
            updated in place and returned. When omitted, a fresh dict is
            created on every call (a ``dict()`` default would be created once
            and shared by all calls).

    Returns:
        dict mapping a short model name to an unfitted scikit-learn regressor,
        with the keys 'knn', 'gbm', 'nusvmr', 'lisvmar' and 'svmr'.
    """
    if models is None:
        models = {}

    # non-linear models
    models['knn'] = KNeighborsRegressor(n_neighbors=7)

    n_trees = 100
    models['gbm'] = GradientBoostingRegressor(n_estimators=n_trees)
    models['nusvmr'] = NuSVR()
    models['lisvmar'] = LinearSVR()
    models['svmr'] = SVR()

    return models

In [2]:
# Build the modelling dataset for one city. Dataset is the project class
# imported from src.features.dataset.
data = Dataset('Chiang Mai')
data.load_()
# Both attributes are set to the same label, 'PM2.5'.
data.monitor  = data.pollutant = 'PM2.5'
# Build the first dataset (presumably the features that do not involve fire
# data; the run prints "data no fire has shape ..." at this point).
data.feature_no_fire()
data.fire_dict = {'w_speed': 20, 'shift': -18, 'roll': 120}
# Merge the fire features into data.data.
# NOTE(review): the recorded output prints "use default fire feature" even
# though fire_dict is assigned just above -- confirm that merge_fire actually
# honours data.fire_dict.
data.merge_fire()
# Candidate feature columns: every column except the PM2.5 column and its
# 1-step lag. data.pollutant and data.monitor hold the same label here, so the
# drop list names that column twice.
x_cols = data.data.columns.drop([data.pollutant, data.monitor, 'PM2.5_lag_1' ] )
print(x_cols)
# Keep a copy of the merged frame with the PM2.5 lag column removed.
data.data_backup = data.data.copy()
data.data_backup = data.data_backup.drop('PM2.5_lag_1', axis=1)
# Last expression of the cell: displays the shape of the backup frame.
data.data_backup.shape

data no fire has shape (77746, 15)
use default fire feature
Index(['Temperature(C)', 'Humidity(%)', 'Wind Speed(kmph)', 'wind_CALM',
       'wind_E', 'wind_N', 'wind_S', 'wind_W', 'is_rain', 'is_holiday',
       'is_weekend', 'day_of_week', 'time_of_day', 'fire_0_100',
       'fire_100_400', 'fire_400_700', 'fire_700_1000'],
      dtype='object')


(77746, 18)

In [3]:
# Split the data using four ratios (0.3 / 0.25 / 0.25 / 0.2); the resulting
# index sets are read back from data.split_list below.
data.split_data(split_ratio=[0.3, 0.25, 0.25, 0.2])
# Split 0 is used as the training set and split 1 as the validation set.
# get_data_matrix also returns a column list, which replaces x_cols here.
xtrn, ytrn, x_cols = data.get_data_matrix(use_index=data.split_list[0], x_cols=x_cols)
xval, yval, _ = data.get_data_matrix(use_index=data.split_list[1], x_cols=x_cols)
print(xtrn.shape)

(23323, 17)


In [9]:
# Fit every candidate model on the training split and score it on the
# validation split.
models = get_models()

# One score record (plain dict) per model; the DataFrame is built once after
# the loop, so model_search_df is only ever a DataFrame.
score_records = []
for model_name, model in models.items():
    model.fit(xtrn, ytrn)
    score_dict = cal_scores(yval, model.predict(xval), header_str ='val_')
    score_dict['model'] = model_name
    print(score_dict)
    # Store a copy so later changes to score_dict cannot alter earlier records.
    score_records.append(dict(score_dict))

model_search_df = pd.DataFrame(score_records)
# Lowest validation mean absolute error first.
model_search_df.sort_values('val_mean_absolute_error')

{'val_r2_score': 0.47969005287707467, 'val_mean_squared_error': 655.0582244288948, 'val_mean_absolute_error': 16.001526622173873, 'model': 'knn'}
{'val_r2_score': 0.5286881952355349, 'val_mean_squared_error': 593.3706931581074, 'val_mean_absolute_error': 14.85077940362799, 'model': 'gbm'}
{'val_r2_score': 0.34326573687757767, 'val_mean_squared_error': 826.8132921567159, 'val_mean_absolute_error': 17.018672763387844, 'model': 'nusvmr'}
{'val_r2_score': 0.3944699049700553, 'val_mean_squared_error': 762.3484253605162, 'val_mean_absolute_error': 16.492937087091335, 'model': 'lisvmar'}
{'val_r2_score': 0.33525413348709976, 'val_mean_squared_error': 836.8997160677219, 'val_mean_absolute_error': 16.83848224728322, 'model': 'svmr'}


Unnamed: 0,val_r2_score,val_mean_squared_error,val_mean_absolute_error,model
1,0.528688,593.370693,14.850779,gbm
0,0.47969,655.058224,16.001527,knn
3,0.39447,762.348425,16.492937,lisvmar
4,0.335254,836.899716,16.838482,svmr
2,0.343266,826.813292,17.018673,nusvmr


In [None]:
%%time
# do_rf_search comes from the project code; the object it returns is used
# directly as a fitted model (predict is called on it below).
# NOTE(review): x_tree=True presumably selects an extra-trees estimator rather
# than a random forest -- confirm in do_rf_search.
model = do_rf_search(xtrn,ytrn, x_tree=True)
# Score the returned model on the validation split, with 'val_' key prefixes.
score_dict = cal_scores(yval, model.predict(xval), header_str ='val_')
print('optimize 1 score', score_dict) 

In [None]:
# Rank the features by the importance reported by the fitted model and plot them.
importances = model.feature_importances_
feat_imp = (
    pd.DataFrame(importances, index=x_cols, columns=['importance'])
    .sort_values('importance', ascending=False)
    # reset_index moves the feature names out of the index into a regular
    # column; the next cell reads that column as feat_imp['index'].
    .reset_index()
)
show_fea_imp(feat_imp, title='rf feature of importance(raw)')


In [None]:
# Optimisation step 1: drop unused columns.
# Drop candidates are all non-fire features, in feat_imp order.
to_drop = [col for col in feat_imp['index'] if 'fire' not in col]

# These weather columns are never offered for removal.
for s in ('Humidity(%)', 'Temperature(C)', 'Wind Speed(kmph)'):
    to_drop.remove(s)

# feat_imp is sorted by descending importance, so reversing puts the least
# important candidate first.
to_drop = to_drop[::-1]

model, new_x_cols = reduce_cols(
    dataset=data,
    x_cols=x_cols,
    to_drop=to_drop,
    model=model,
    trn_i=0,
    val_i=1,
)
data.x_cols = new_x_cols

In [None]:
# Replace data.fire_dict with whatever sk_op_fire returns for the current model
# and the train/validation index sets (splits 0 and 1).
# NOTE(review): presumably sk_op_fire searches over the fire-feature settings
# (w_speed / shift / roll) -- confirm in its definition.
data.fire_dict = sk_op_fire(data, model, trn_index=data.split_list[0], val_index=data.split_list[1])