## 업 샘플링 랜덤 포레스트

In [1]:
# 라이브러리 로드
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import seaborn as sns
%matplotlib inline

In [2]:
# Load the up-sampled dataset from disk.
# NOTE(review): the frame shows an 'Unnamed: 0' column, which looks like a
# row index that was written into the CSV when it was saved - confirm.
CSV_PATH = './data/up_sampling.csv'
df = pd.read_csv(CSV_PATH, encoding='utf-8')

In [3]:
# Show the loaded DataFrame as the cell output for a quick visual check.
df

Unnamed: 0,일시,WIND_SPEED,GUST_WIND,SIG_WAVE,MAX_WAVE,HPA,특보강도
0,2012-04-21 18:00:00,5.5,12.7,1.1,1.7,1007.0,1.0
1,2018-08-17 07:00:00,10.6,15.2,0.7,1.1,1012.9,1.0
2,2018-06-10 12:00:00,6.5,15.2,0.8,1.2,1005.4,1.0
3,2018-05-20 12:00:00,10.7,18.8,0.8,1.1,1019.9,1.0
4,2015-11-25 09:00:00,6.4,13.3,0.9,1.4,1019.2,1.0
...,...,...,...,...,...,...,...
138971,2020-12-29 22:00:00,12.5,15.9,1.0,1.7,1013.4,0.0
138972,2020-12-29 23:00:00,13.3,17.3,1.1,1.6,1013.1,0.0
138973,2020-12-30 00:00:00,13.6,16.8,1.1,1.8,1013.5,0.0
138974,2020-12-30 01:00:00,12.3,15.8,1.1,1.8,1014.1,0.0


In [4]:
# Build the model inputs: the five measurement columns become the feature
# matrix and the '특보강도' column becomes the label vector.
feature_columns = ['WIND_SPEED', 'GUST_WIND', 'SIG_WAVE', 'MAX_WAVE', 'HPA']
label_column = '특보강도'

data = df[feature_columns].to_numpy()
target = df[label_column].to_numpy()

In [5]:
# Split into training and test sets: 20% of the rows are held out for
# testing, and the seed is fixed so the split is reproducible.
split_parts = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)
train_input, test_input, train_target, test_target = split_parts

In [6]:
# Cross-validate a random forest on the training set. Comparing the mean
# training score with the mean validation score reveals overfitting: a large
# gap means the model fits the training folds much better than unseen folds.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
scores = cross_validate(
    rf,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)

mean_train_score = np.mean(scores['train_score'])
mean_validation_score = np.mean(scores['test_score'])
print(mean_train_score, mean_validation_score)

0.9999640223061702 0.9841248425975895


In [7]:
# Print the random forest's feature importances.
# TODO: compare these against a single decision tree's importances.
# The forest is fitted on the full training set here before its
# importances are read.
rf.fit(train_input, train_target)
importances = rf.feature_importances_
print(importances)

[0.14910333 0.36452282 0.17113899 0.13360738 0.18162748]


In [8]:
# Print the out-of-bag (OOB) score: with oob_score=True the forest exposes
# an `oob_score_` attribute once it has been fitted.
rf = RandomForestClassifier(
    oob_score=True,
    n_jobs=-1,
    random_state=42,
).fit(train_input, train_target)

oob_accuracy = rf.oob_score_
print(oob_accuracy)


0.9872728908076992


In [10]:
# Train a random forest on the training split and evaluate its predictive
# accuracy on the separate, held-out test split.
#
# n_jobs=-1 brings this forest in line with the other forests in this
# notebook so training and prediction use all CPU cores; it only affects
# parallelism. random_state is deliberately left at 0 so the fitted model,
# and therefore the reported accuracy, stays the same as before.
rf_clf = RandomForestClassifier(n_jobs=-1, random_state=0)
rf_clf.fit(train_input, train_target)

# Predict labels for the test rows and compare them with the true labels.
pred = rf_clf.predict(test_input)
accuracy = accuracy_score(test_target, pred)

print('랜덤 포레스트 정확도: {0:.4f}'.format(accuracy))

랜덤 포레스트 정확도: 0.9879
