In [6]:
# import module
import pandas as pd
import numpy as np
import os
from datetime import datetime
from math import *
import glob
import time
import random
import warnings

## sklearn
from sklearn.ensemble import (GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor, RandomForestClassifier, VotingClassifier, VotingRegressor)
from sklearn.linear_model import *
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import *
from sklearn.model_selection import *
from sklearn.preprocessing import StandardScaler, RobustScaler, LabelEncoder, MinMaxScaler

#scipy
import scipy.stats as st

#LightGBM
from lightgbm import LGBMClassifier, LGBMRegressor


# xgboost
from xgboost import XGBClassifier
from xgboost import XGBRegressor
from xgboost import plot_importance
import xgboost as xgb


warnings.filterwarnings(action='ignore')
%matplotlib inline

font_name = mpl.font_manager.FontProperties(fname='C:/Windows/Fonts/malgunbd.ttf').get_name()
mpl.rc('font', family=font_name)
mpl.rcParams['axes.unicode_minus'] = False


In [7]:
# load data
df = pd.read_csv('D:/data/row_final.csv')
speed = pd.read_csv('D:/data/speed.csv')
df.head()

Unnamed: 0,co,date,no2,o3,pm10,pm25,site,so2,humid,rain_yn,temp,wind_direction,wind_speed
0,0.3,2017-12-01 00:00:00,0.017,0.034,10.0,2.0,광교동,0.003,,0.0,-4.1,81.1,1.5
1,0.4,2017-12-01 00:00:00,0.011,0.018,30.0,4.0,중2동,0.003,,0.0,-3.6,12.8,0.7
2,0.4,2017-12-01 00:00:00,0.016,0.027,20.0,5.0,대야동,0.004,,0.0,-2.5,3.6,0.5
3,0.3,2017-12-01 00:00:00,0.012,0.026,13.0,12.0,산본동,0.004,,0.0,-3.6,291.1,1.7
4,0.5,2017-12-01 00:00:00,0.025,0.021,18.0,10.0,철산동,0.002,,0.0,-3.7,340.0,1.7


In [8]:
speed.head()

Unnamed: 0,date,speed
0,2017-12-01 00:00:00,40.0
1,2017-12-01 10:00:00,40.5
2,2017-12-01 11:00:00,40.0
3,2017-12-01 12:00:00,40.0
4,2017-12-01 13:00:00,40.0


# Preprocessing

In [9]:
def date_split(df) : 
    df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d %H:%M')
    df['ymd'] = [d.date() for d in df['date']]
    df['md'] = df['ymd'].apply(lambda x: x.strftime('%m-%d'))
def season(x):
    if '05-31'>= x>='03-01':
        return('봄')
    elif '11-30' >= x >= '09-01':
        return('가을')
    elif '08-31' >=x >= '06-01':
        return('여름')
    else: return('겨울')
    
def speed_categori(x):
    if 15>=x:
        return(0)
    elif 25>=x>=15:
        return(1)
    else: return(2)

In [10]:
df = df[['site', 'date', 'so2', 'co', 'o3', 'no2', 'pm10', 'pm25', 'temp',
       'wind_speed', 'wind_direction', 'rain_yn', 'humid']]
df.loc[df['temp'] <= -30, 'temp' ] = np.float("nan")
df['rain_yn'] = df['rain_yn'].apply(lambda x: x if x==0 else 1)
df = df[df['site']=='시화산단']
date_split(df)
df['season'] = df['md'].apply(season)
df = df[df['date']>='2017-11-01']
df.drop(['site','md','ymd'],axis=1,inplace=True)
df['date'] = df['date'].astype(str)
df.head()

Unnamed: 0,date,so2,co,o3,no2,pm10,pm25,temp,wind_speed,wind_direction,rain_yn,humid,season
7,2017-12-01 00:00:00,0.005,0.4,0.03,0.017,21.0,13.0,-4.6,1.9,4.5,0.0,43.6,겨울
20,2017-12-01 01:00:00,0.004,0.5,0.032,0.013,25.0,14.0,-6.3,,,0.0,55.5,겨울
28,2017-12-01 02:00:00,0.005,0.5,0.027,0.017,25.0,13.0,-6.7,0.6,58.6,0.0,60.9,겨울
36,2017-12-01 03:00:00,0.005,0.5,0.028,0.015,27.0,13.0,-7.2,0.7,11.3,0.0,66.5,겨울
48,2017-12-01 04:00:00,0.004,0.5,0.028,0.015,20.0,12.0,-7.3,0.7,325.4,0.0,57.1,겨울


In [11]:
df.isnull().sum()

date                0
so2               452
co                477
o3                460
no2               567
pm10              723
pm25              756
temp              406
wind_speed        347
wind_direction    347
rain_yn             0
humid             338
season              0
dtype: int64

In [12]:
df.fillna(method='ffill',inplace=True) # Null 값 이전값 대체
df_speed = pd.merge(df,speed,on='date') # 통행속도와 지역 데이터 merge
df_speed['traffic'] = df_speed['speed'].apply(speed_categori) # 통행속도 범주화
df_speed.isnull().sum()

date              0
so2               0
co                0
o3                0
no2               0
pm10              0
pm25              0
temp              0
wind_speed        0
wind_direction    0
rain_yn           0
humid             0
season            0
speed             0
traffic           0
dtype: int64

## Data Normalization

In [14]:
df_speed.head()

Unnamed: 0,date,so2,co,o3,no2,pm10,pm25,temp,wind_speed,wind_direction,rain_yn,humid,season,speed,traffic
0,2017-12-01 00:00:00,0.005,0.4,0.03,0.017,21.0,13.0,-4.6,1.9,4.5,0.0,43.6,겨울,40.0,2
1,2017-12-01 01:00:00,0.004,0.5,0.032,0.013,25.0,14.0,-6.3,1.9,4.5,0.0,55.5,겨울,40.0,2
2,2017-12-01 02:00:00,0.005,0.5,0.027,0.017,25.0,13.0,-6.7,0.6,58.6,0.0,60.9,겨울,39.5,2
3,2017-12-01 03:00:00,0.005,0.5,0.028,0.015,27.0,13.0,-7.2,0.7,11.3,0.0,66.5,겨울,40.5,2
4,2017-12-01 04:00:00,0.004,0.5,0.028,0.015,20.0,12.0,-7.3,0.7,325.4,0.0,57.1,겨울,40.0,2


In [15]:
data_cols = ['so2','co','o3', 'no2','pm10', 'temp', 'wind_direction','wind_speed', 'humid'] # 연속형 변수 Normalization
df_minmax = df_speed[data_cols]
df_other = df_speed[['date','pm25','season','rain_yn','traffic']] # 범주형 변수 따로 추출
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(df_minmax)
speed_scale = min_max_scaler.transform(df_minmax)
speed_scale = pd.DataFrame(speed_scale, columns=data_cols, index=list(df_minmax.index.values))

    
speed_scale = pd.concat([speed_scale,df_other],axis=1)
scale_column = df_other.columns.tolist()
scale_column.remove('pm25')
scale_column.remove('date')


for i in scale_column :
    speed_scale[i] = speed_scale[i].shift(1)

for i in data_cols:
    speed_scale[i] = speed_scale[i].rolling(window=4).mean()

speed_scale['pm25_window'] = speed_scale['pm25'].rolling(window=4).mean()


speed_scale.dropna(inplace=True) #Rolling 후 생긴 Null행 제거
speed_scale.head(5)

Unnamed: 0,so2,co,o3,no2,pm10,temp,wind_direction,wind_speed,humid,date,pm25,season,rain_yn,traffic,pm25_window
3,0.012633,0.178571,0.18109,0.113861,0.057107,0.290424,0.054792,0.097328,0.52393,2017-12-01 03:00:00,13.0,겨울,0.0,2.0,13.25
4,0.011968,0.190476,0.177885,0.108911,0.056472,0.279827,0.277639,0.074427,0.561937,2017-12-01 04:00:00,12.0,겨울,0.0,2.0,13.0
5,0.011968,0.190476,0.168269,0.116337,0.057107,0.274333,0.288472,0.055344,0.576858,2017-12-01 05:00:00,16.0,겨울,0.0,2.0,13.5
6,0.011968,0.190476,0.165064,0.118812,0.057107,0.269231,0.295625,0.051527,0.584459,2017-12-01 06:00:00,13.0,겨울,0.0,2.0,13.5
7,0.011968,0.190476,0.139423,0.163366,0.055838,0.265699,0.317153,0.05916,0.552646,2017-12-01 07:00:00,13.0,겨울,0.0,2.0,13.5


In [16]:
data = speed_scale.copy()
data.dropna(inplace=True)
X = data.drop(['date',"pm25"], axis=1)
Y = data["pm25"]
mse = []

# one-hot encoding
# object 변수만 해당
for i in ["season",'rain_yn','traffic']:
    X[i] = pd.get_dummies(X[i])
# trian / test split 0.8/ 0.2
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [17]:
def base_model(model):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    print("MSE : {:.3f}".format((mean_squared_error(y_test, y_pred))**0.5))
    mse.append((mean_squared_error(y_test, y_pred))**0.5)
    print("RMSE : {:.3f}".format((sqrt(mean_squared_error(y_test, y_pred))**0.5)))

In [18]:
model = RandomForestRegressor(random_state =5)
base_model(model)

model = GradientBoostingRegressor(random_state =5)
base_model(model)

model = XGBRegressor(objective ='reg:squarederror',random_state =5)
base_model(model)

model = LGBMRegressor(random_state =5)
base_model(model)
np.mean(mse)

MSE : 6.468
RMSE : 2.543
MSE : 6.529
RMSE : 2.555
MSE : 6.549
RMSE : 2.559
MSE : 6.396
RMSE : 2.529


6.485580740738487

## Hyperparameter Tuning

In [19]:
speed_scale.head()

Unnamed: 0,so2,co,o3,no2,pm10,temp,wind_direction,wind_speed,humid,date,pm25,season,rain_yn,traffic,pm25_window
3,0.012633,0.178571,0.18109,0.113861,0.057107,0.290424,0.054792,0.097328,0.52393,2017-12-01 03:00:00,13.0,겨울,0.0,2.0,13.25
4,0.011968,0.190476,0.177885,0.108911,0.056472,0.279827,0.277639,0.074427,0.561937,2017-12-01 04:00:00,12.0,겨울,0.0,2.0,13.0
5,0.011968,0.190476,0.168269,0.116337,0.057107,0.274333,0.288472,0.055344,0.576858,2017-12-01 05:00:00,16.0,겨울,0.0,2.0,13.5
6,0.011968,0.190476,0.165064,0.118812,0.057107,0.269231,0.295625,0.051527,0.584459,2017-12-01 06:00:00,13.0,겨울,0.0,2.0,13.5
7,0.011968,0.190476,0.139423,0.163366,0.055838,0.265699,0.317153,0.05916,0.552646,2017-12-01 07:00:00,13.0,겨울,0.0,2.0,13.5


In [20]:
# modeling Definition
def voting_model(models) : #Voting Model
    for i,m in enumerate(models):
        print(i, m.__class__)
        m.fit(x_train, y_train)

    models = sorted(models, key=lambda m: mean_squared_error(y_test, m.predict(x_test)))


    y_preds = np.array([m.predict(x_test) for m in models]).T
    y_preds_mean = y_preds.mean(axis=1)
    df_predict = pd.DataFrame({'Actual': y_test, 'Predicted': y_preds_mean})
    print("MSE : {:.4f}".format((mean_squared_error(y_test, y_preds_mean))**0.5))

    y_pred_log = np.array([m.predict(x_test) for m in models]).T.dot(
    np.linspace(1.0, 0.0, len(models))/sum(np.linspace(1.0, 0.0, len(models))))
    df_predict = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred_log})
    print("weight MSE : {:.4f}".format((mean_squared_error(y_test, y_pred_log))**0.5))

def Randomsearch_rmse(models): #random search 
    model = RandomizedSearchCV(estimator= models, param_distributions = param, 
                               n_iter=500, cv = 3, scoring= 'neg_mean_squared_error')
    model.fit(x_train, y_train)

    print("RandomizedSearchCV 최적의 파라미터 : ", model.best_params_)
    best_model = model.best_estimator_

    y_pred = best_model.predict(x_test)
    print("MSE : {:.4f}".format((mean_squared_error(y_test, y_pred))**0.5))

In [40]:
# Random Search (xgboost)
param = {'max_depth': list(range(4, 25)),
         'n_estimator' : list(range(10, 3000, 20)),
         'min_child_weight' : list(np.linspace(1, 12, 20)),
         'subsample' : list(np.linspace(0, 1, 15)),
         'reg_alpha' : [x for x in np.linspace(0.01,1,25)],
         'reg_lambda' : [x for x in np.linspace(0.01,1,25)]
}
models = XGBRegressor(objective='reg:squarederror',random_state =5)

Randomsearch_rmse(models)

RandomizedSearchCV 최적의 파라미터 :  {'subsample': 0.7857142857142857, 'reg_lambda': 0.2575, 'reg_alpha': 0.42250000000000004, 'n_estimator': 410, 'min_child_weight': 3.3157894736842106, 'max_depth': 6}
MSE : 6.3001


In [41]:
# Random Search (RandomForest)
param = {'max_depth' : [int(x) for x in range(5,25)],
         'n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 3000, num = 20)],
         'min_samples_leaf': list(range(1,6))
}
models = RandomForestRegressor(random_state =5)

Randomsearch_rmse(models)

RandomizedSearchCV 최적의 파라미터 :  {'n_estimators': 347, 'min_samples_leaf': 3, 'max_depth': 10}
MSE : 6.4931


In [23]:
# Random Search (RandomForest)
param = {'max_depth' : [int(x) for x in range(5,25)],
         'n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 3000, num = 20)],
         'min_samples_leaf': np.arange(1,10,1),
         'min_samples_split' : np.arange(1,10,1),
         'max_features' : list(range(1,x_train.shape[1]))
}
models = RandomForestRegressor(random_state =5)

Randomsearch_rmse(models)

RandomizedSearchCV 최적의 파라미터 :  {'n_estimators': 3000, 'min_samples_split': 3, 'min_samples_leaf': 1, 'max_features': 4, 'max_depth': 19}
MSE : 6.2690


In [24]:
# Random Search (LightGBM)
param = {'max_depth' : list(range(8, 30)),
         'n_estimators' : [int(x) for x in np.linspace(start = 1000, stop = 3000, num = 30)],
         'reg_alpha' : [x for x in np.linspace(0.01,1,30)],
         'reg_lambda' : [x for x in np.linspace(0.01,1,30)],
         'boosting' : ['gbdt','dart','goss'],
         'learning_rate' : [x for x in np.linspace(0.001,1,30)]
}
models = LGBMRegressor(random_state =5)

Randomsearch_rmse(models)

RandomizedSearchCV 최적의 파라미터 :  {'reg_lambda': 0.11241379310344828, 'reg_alpha': 0.48793103448275865, 'n_estimators': 1275, 'max_depth': 17, 'learning_rate': 0.035448275862068966, 'boosting': 'gbdt'}
MSE : 6.2468


In [25]:
# Random Search (LightGBM)
param = {'max_depth' : list(range(4, 25)),
         'n_estimators' : [int(x) for x in np.linspace(start = 100, stop = 3000, num = 50)],
         'reg_alpha' : [x for x in np.linspace(0.01,1,25)],
         'reg_lambda' : [x for x in np.linspace(0.01,1,25)],
         'boosting' : ['gbdt','dart','goss'],
         'learning_rate' : [x for x in np.linspace(0.001,1,25)],
         'bagging_fraction' : [x for x in np.linspace(0.001,1,25)]     
}
models = LGBMRegressor(random_state =5)

Randomsearch_rmse(models)

RandomizedSearchCV 최적의 파라미터 :  {'reg_lambda': 0.01, 'reg_alpha': 0.8350000000000001, 'n_estimators': 2467, 'max_depth': 13, 'learning_rate': 0.5421250000000001, 'boosting': 'dart', 'bagging_fraction': 0.41725}
MSE : 6.2199


### LightGBM RandomForest Voting

In [27]:
models = [RandomForestRegressor(n_estimators=3000, min_samples_leaf=1, min_samples_split=3 , max_features=4, max_depth=19, random_state =5),

         LGBMRegressor(max_depth=13, n_estimators=2467, learning_rate =0.5421250000000001, boosting='dart',
                       reg_alpha=0.8350000000000001, reg_lambda=0.01, bagging_fraction=0.41725, random_state =5)]

voting_model(models)

0 <class 'sklearn.ensemble._forest.RandomForestRegressor'>
1 <class 'lightgbm.sklearn.LGBMRegressor'>
MSE : 6.1086
weight MSE : 6.2199


## Airkorea 24시간이동평균  
## RMSE 측정

In [31]:
def pm24(pm25, C12):
        if pm25<30 :
            return(pm25)
        elif pm25>=30 :
            if (pm25/C12) <0.9 or (pm25/C12)> 1.7:
                return(pm25)
            else : return(0.75*pm25)

In [20]:
airkorea_24h = df.copy()
airkorea_24h['C12'] = airkorea_24h['pm25'].rolling(window=13).mean()
airkorea_24h.reset_index(drop=True, inplace=True)
airkorea_24h['Cai'] = airkorea_24h.apply(lambda x : pm24(x['pm25'],x['C12']), axis=1)
airkorea_24h['C4']  = airkorea_24h['Cai'].rolling(window=5).mean()
airkorea_24h.reset_index(drop=True, inplace=True)
airkorea_24h.dropna(inplace=True)
airkorea_24h['C24E']  = round((airkorea_24h['C12']*12 + airkorea_24h['C4']*12)/24)
print("MSE : {:.4f}".format((mean_squared_error(airkorea_24h['pm25'], airkorea_24h['C24E']))**0.5))

MSE : 10.7705


## Categorical

## Airkorea 미세먼지 기준 적용하여 진행  
## 좋음(0-50), 보통(51-100), 나쁨(101-250), 매우나쁨(250-)

In [21]:
def airkorea_categori(x) :
    if 15>= x>=0:
        return('0')
    elif 35 >= x >= 16 :
        return('1')
    elif 75 >=x >= 36:
        return('2')
    else: return('3') 
    
def base_accuracy(model):
    model.fit(x_train,y_train)
    y_pred = model.predict(x_test)
    print("ACC : ", accuracy_score(y_test, y_pred))
    print("f1-score : ", f1_score(y_test, y_pred, average='macro'))
    
def categorical_modeling(models):
    model = VotingClassifier(estimators=[(f'{m}' , m) for m in models], voting='hard')
 
    model = model.fit(x_train, y_train)

    #예측 진행
    y_pred = model.predict(x_test) 
    df_predict = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
    # ACC, F1-score check
    print("ACC : ", accuracy_score(y_test, y_pred))
    print("f1-score : ", f1_score(y_test, y_pred, average='macro'))
    
def Randomsearch_categori(models):
    model = RandomizedSearchCV(estimator= models, param_distributions = param, 
                               n_iter=500, cv = 3, scoring= 'accuracy')
    model.fit(x_train, y_train)

    print("RandomizedSearchCV 최적의 파라미터 : ", model.best_params_)
    best_model = model.best_estimator_

    y_pred = best_model.predict(x_test)
    print("ACC : ", accuracy_score(y_test, y_pred))
    print("f1-score : ", f1_score(y_test, y_pred, average='macro'))

In [23]:
df_categori = speed_scale.copy()
df_categori['PM_categorical'] = df_categori['pm25'].apply(airkorea_categori) # categori 생성
df_categori['PM_categorical'] = df_categori['PM_categorical'].astype('category')
df_categori.dropna(inplace=True)
df_categori.head()

Unnamed: 0,so2,co,o3,no2,pm10,temp,wind_direction,wind_speed,humid,date,pm25,season,rain_yn,traffic,pm25_window,PM_categorical
3,0.012633,0.178571,0.18109,0.113861,0.057107,0.290424,0.054792,0.097328,0.52393,2017-12-01 03:00:00,13.0,겨울,0.0,2.0,13.25,0
4,0.011968,0.190476,0.177885,0.108911,0.056472,0.279827,0.277639,0.074427,0.561937,2017-12-01 04:00:00,12.0,겨울,0.0,2.0,13.0,0
5,0.011968,0.190476,0.168269,0.116337,0.057107,0.274333,0.288472,0.055344,0.576858,2017-12-01 05:00:00,16.0,겨울,0.0,2.0,13.5,1
6,0.011968,0.190476,0.165064,0.118812,0.057107,0.269231,0.295625,0.051527,0.584459,2017-12-01 06:00:00,13.0,겨울,0.0,2.0,13.5,0
7,0.011968,0.190476,0.139423,0.163366,0.055838,0.265699,0.317153,0.05916,0.552646,2017-12-01 07:00:00,13.0,겨울,0.0,2.0,13.5,0


In [24]:
data = df_categori.copy()
data.dropna(inplace=True)
X = data.drop(['date',"pm25",'PM_categorical'], axis=1)
Y = data["PM_categorical"]
mse = []

# one-hot encoding
# object 변수만 해당
for i in ["season",'rain_yn','traffic']:
    X[i] = pd.get_dummies(X[i])
# trian / test split 0.8/ 0.2
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)

In [25]:
# ACC Check
model = RandomForestClassifier(random_state =5)
base_accuracy(model)

model = GradientBoostingClassifier(random_state =5)
base_accuracy(model)

model = XGBClassifier(objective ='multi:softprob',random_state =5)
base_accuracy(model)

model = LGBMClassifier(random_state =5)
base_accuracy(model)

ACC :  0.8438579956016337
f1-score :  0.8438085620986613
ACC :  0.8391454602576186
f1-score :  0.8291397320116763
ACC :  0.839459629280553
f1-score :  0.8331090109286836
ACC :  0.839459629280553
f1-score :  0.83495032063809


In [28]:
# Random Search CV (xgboost)
param = {'max_depth': list(range(2, 25)),
         'n_estimator' : list(range(10, 3000, 30)),
         'min_child_weight' : list(np.linspace(1, 10, 15)),
         'subsample' : list(np.linspace(0, 1, 7))
}
models = XGBClassifier(objective='multi:softprob',random_state =5)
Randomsearch_categori(models)

RandomizedSearchCV 최적의 파라미터 :  {'subsample': 0.3333333333333333, 'n_estimator': 1960, 'min_child_weight': 4.857142857142858, 'max_depth': 2}
ACC :  0.8391454602576186
f1-score :  0.8349085211925797


In [27]:
# Random Search CV (xgboost)
param = {'max_depth': list(range(4, 25)),
         'n_estimator' : list(range(10, 3000, 20)),
         'min_child_weight' : list(np.linspace(1, 12, 20)),
         'subsample' : list(np.linspace(0, 1, 15)),
         'reg_alpha' : [x for x in np.linspace(0.01,1,25)],
         'reg_lambda' : [x for x in np.linspace(0.01,1,25)]
}
models = XGBClassifier(objective='multi:softprob',random_state =5)

Randomsearch_categori(models)

RandomizedSearchCV 최적의 파라미터 :  {'subsample': 0.42857142857142855, 'reg_lambda': 0.01, 'reg_alpha': 0.2575, 'n_estimator': 1990, 'min_child_weight': 3.3157894736842106, 'max_depth': 4}
ACC :  0.8378887841658812
f1-score :  0.8323472763394253


In [41]:
# RandomForest
param = {'max_depth' : [int(x) for x in range(5,25)],
         'n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 3000, num = 20)],
         'min_samples_leaf': np.arange(1,10,1),
         'min_samples_split' : np.arange(1,10,1),
         'max_features' : list(range(1,x_train.shape[1]))
}
models = RandomForestClassifier(random_state =5)

Randomsearch_categori(models)

RandomizedSearchCV 최적의 파라미터 :  {'n_estimators': 1378, 'min_samples_split': 3, 'min_samples_leaf': 3, 'max_features': 2, 'max_depth': 24}
ACC :  0.8448005026704367
f1-score :  0.8413348095954294


In [42]:
# RandomForest
param = {'max_depth' : [int(x) for x in range(5,25)],
         'n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 3000, num = 20)],
         'min_samples_leaf': list(range(1,2)),
         'min_samples_split' : list(range(1,5)),
         'bootstrap' : [True, False]
}
models = RandomForestClassifier(random_state =5)

Randomsearch_categori(models)

RandomizedSearchCV 최적의 파라미터 :  {'n_estimators': 1231, 'min_samples_split': 4, 'min_samples_leaf': 1, 'max_depth': 21, 'bootstrap': True}
ACC :  0.8482563619227145
f1-score :  0.8466582120575071


In [43]:
# LightGBM
param = {'max_depth' : list(range(4, 25)),
         'n_estimators' : [int(x) for x in np.linspace(start = 200, stop = 3000, num = 20)],
         'reg_alpha' : [x for x in np.linspace(0.001,1,25)],
         'reg_lambda' : [x for x in np.linspace(0.001,1,25)]      
}
models = LGBMClassifier(random_state =5)

Randomsearch_categori(models)

RandomizedSearchCV 최적의 파라미터 :  {'reg_lambda': 0.875125, 'reg_alpha': 0.875125, 'n_estimators': 200, 'max_depth': 5}
ACC :  0.8410304743952246
f1-score :  0.8380284599237111


In [44]:
# LightGBM
param = {'max_depth' : list(range(3, 30)),
         'n_estimators' : [int(x) for x in np.linspace(start = 100, stop = 3000, num = 20)],
         'reg_alpha' : [x for x in np.linspace(0.001,1,30)],
         'reg_lambda' : [x for x in np.linspace(0.001,1,30)],
         'learning_rate' : [x for x in np.linspace(0.001,1,30)]
}
models = LGBMClassifier(objective = 'multiclass',random_state =5)

Randomsearch_categori(models)

RandomizedSearchCV 최적의 파라미터 :  {'reg_lambda': 0.001, 'reg_alpha': 0.45887500000000003, 'n_estimators': 218, 'max_depth': 4, 'learning_rate': 0.12587500000000001, 'boosting': 'dart'}
ACC :  0.8385171222117499
f1-score :  0.8337364523371089


In [None]:
# LightGBM
param = {'max_depth' : list(range(2, 30)),
         'n_estimators' : [int(x) for x in np.linspace(start = 100, stop = 3000, num = 30)],
         'reg_alpha' :[x for x in np.linspace(0.001,1,30)],
         'reg_lambda' : [x for x in np.linspace(0.001,1,30)],
         'learning_rate' : [x for x in np.linspace(0.001,1,30)],
         'bagging_fraction' : [x for x in np.linspace(0.001,1,30)]     
}
models = LGBMClassifier(objective = 'multiclass',random_state =5)

Randomsearch_categori(models)

In [None]:
param = {'max_depth' : list(range(16, 24)),
         'n_estimators' : [int(x) for x in np.linspace(start = 100, stop = 2500, num = 50)],
         'reg_alpha' : list(np.arange(0.001, 0.2, 0.04)),
         'reg_lambda' : list(np.arange(0.001, 0.1, 0.04)),
         'learning_rate' : list(np.arange(0.1, 0.3, 0.01))   
}

model = LGBMClassifier(objective='reg:squarederror', random_state=5)

lgb_model = GridSearchCV(model, param_grid = param, cv = 3)
lgb_model.fit(x_train, y_train)

print("GridSearchCV 최적의 파라미터 : ", lgb_model.best_params_)
print("GridSearchCV 최고 정확도 : ", lgb_model.best_score_)

best_model = lgb_model.best_estimator_

pred = best_model.predict(x_test)
print("RMSE : {0:.4f}".format((mean_squared_error(y_test, y_pred))**0.5))

In [20]:
model = LGBMClassifier(max_depth=5, n_estimators=200, reg_alpha=0.875125, reg_lambda=0.875125, bagging_fraction= 0.1675,  random_state =5)
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print("ACC : ", accuracy_score(y_test, y_pred))
print("f1-score : ", f1_score(y_test, y_pred, average='macro'))

ACC :  0.842287150486962
f1-score :  0.8382515682023672


In [None]:
# LightGBM
param = {'max_depth' : list(range(4, 25)),
         'n_estimators' : [int(x) for x in np.linspace(start = 100, stop = 3000, num = 50)],
         'reg_alpha' : [x for x in np.linspace(0.01,1,25)],
         'reg_lambda' : [x for x in np.linspace(0.01,1,25)],
         'boosting' : ['gbdt','dart','goss'],
         'learning_rate' : [x for x in np.linspace(0.001,1,25)],
         'bagging_fraction' : [x for x in np.linspace(0.001,1,25)]     
}
models = LGBMClassifier(random_state =5)

Randomsearch_categori(models)

In [48]:
# add lightGBM
models = [RandomForestClassifier(n_estimators=1231, min_samples_leaf=1, min_samples_split =4 ,max_depth=21, 
                                 bootstrap= True, random_state =5)]

models += [LGBMClassifier(max_depth=5, n_estimators=200, reg_alpha=0.875125, reg_lambda=0.875125, bagging_fraction= 0.1675,  random_state =5)]

# models += [XGBClassifier(max_depth=6, n_estimators=1330, min_child_weight= 3.5714285714285716,  subsample= 0.8333333333333333,  random_state =5)]


categorical_modeling(models)

ACC :  0.844172164624568
f1-score :  0.8409354331236714


In [None]:
target_names = df_categori['PM_categorical'].unique().tolist()
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

In [44]:
model = RandomForestClassifier(n_estimators=1231, min_samples_leaf=1, min_samples_split =4 ,max_depth=21, 
                                 bootstrap= True, random_state =5)
model.fit(x_train,y_train)
y_pred = model.predict(x_test)
print("ACC : ", accuracy_score(y_test, y_pred))
print("f1-score : ", f1_score(y_test, y_pred, average='macro'))

ACC :  0.8485705309456487
f1-score :  0.8474498700543873


In [45]:
target_names = df_categori['PM_categorical'].unique().tolist()
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

           0       0.87      0.85      0.86       846
           1       0.83      0.85      0.84      1308
           2       0.85      0.86      0.86       853
           3       0.92      0.77      0.84       176

    accuracy                           0.85      3183
   macro avg       0.87      0.83      0.85      3183
weighted avg       0.85      0.85      0.85      3183



In [35]:
# add lightGBM
models = [RandomForestClassifier(n_estimators=1231, min_samples_leaf=1, min_samples_split =4 ,max_depth=21, 
                                 bootstrap= True, random_state =5)]

models += [LGBMClassifier(reg_lambda = 0.875125,reg_alpha = 0.875125, n_estimators=200 , max_depth=5, random_state=5)]

# categorical_modeling(models)

In [37]:
votingC = VotingClassifier(estimators=[(f'{m}' , m) for m in models], voting='soft')

votingC = votingC.fit(x_train, y_train)

#예측 진행
y_pred = votingC.predict(x_test) 
df_predict = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
# ACC, F1-score check
print("ACC : ", accuracy_score(y_test, y_pred))
print("f1-score : ", f1_score(y_test, y_pred, average='macro'))

ACC :  0.844172164624568
f1-score :  0.8409354331236714


In [41]:
target_names = df_categori['PM_categorical'].unique().tolist()
print(metrics.classification_report(y_test, y_pred, target_names=target_names))

              precision    recall  f1-score   support

           0       0.85      0.87      0.86       846
           1       0.83      0.84      0.83      1308
           2       0.85      0.85      0.85       853
           3       0.92      0.74      0.82       176

    accuracy                           0.84      3183
   macro avg       0.86      0.83      0.84      3183
weighted avg       0.84      0.84      0.84      3183



## Airkorea 24시간예측이동평균(정확도)

In [35]:
cate_24h = df_categori.copy()
cate_24h['C12'] = cate_24h['pm25'].rolling(window=13).mean()
cate_24h.reset_index(drop=True, inplace=True)
cate_24h['Cai'] = cate_24h.apply(lambda x : pm24(x['pm25'],x['C12']), axis=1)
cate_24h['C4']  = cate_24h['Cai'].rolling(window=5).mean()
cate_24h.reset_index(drop=True, inplace=True)
cate_24h.dropna(inplace=True)
cate_24h['C24E']  = round((cate_24h['C12']*12 + cate_24h['C4']*12)/24)
cate_24h['C24E'] = cate_24h['C24E'].apply(airkorea_categori)
print("ACC : ", accuracy_score(cate_24h['PM_categorical'], cate_24h['C24E']))

ACC :  0.729668532612114
