## 다운 샘플링 랜덤 포레스트

In [1]:
# 라이브러리 로드
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import seaborn as sns
%matplotlib inline

In [2]:
# Load the down-sampled dataset from the project's local data directory.
csv_path = './data/down_sampling.csv'
df = pd.read_csv(csv_path, encoding='utf-8')

In [11]:
# Display the loaded DataFrame to inspect its columns and a sample of its rows.
df

Unnamed: 0,일시,WIND_SPEED,GUST_WIND,SIG_WAVE,MAX_WAVE,HPA,특보강도
0,2013-09-07 15:00:00,4.6,10.4,0.5,0.8,1015.7,0.0
1,2011-04-13 05:00:00,5.2,7.7,0.2,0.4,1016.1,0.0
2,2019-01-29 01:00:00,9.8,13.2,1.1,1.8,1023.6,0.0
3,2012-06-08 05:00:00,2.9,8.3,0.1,0.2,1005.3,0.0
4,2016-07-11 16:00:00,1.9,6.7,0.1,0.2,1004.1,0.0
...,...,...,...,...,...,...,...
13737,2020-12-30 20:00:00,14.4,19.5,2.0,3.1,1018.9,1.0
13738,2020-12-30 21:00:00,13.0,20.6,2.0,3.5,1020.4,1.0
13739,2020-12-30 22:00:00,10.7,16.3,2.0,3.4,1021.3,1.0
13740,2020-12-30 23:00:00,13.9,20.4,2.3,3.8,1021.5,1.0


In [7]:
# Build the model inputs: the selected measurement columns become the feature
# matrix and the '특보강도' column becomes the label vector.
feature_columns = ['WIND_SPEED', 'GUST_WIND', 'SIG_WAVE', 'MAX_WAVE', 'HPA']
label_column = '특보강도'

data = df[feature_columns].to_numpy()
target = df[label_column].to_numpy()

In [8]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Run cross-validation on the training split and compare the mean train score
# with the mean validation score to check for overfitting.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(
    rf,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)
mean_train_score = scores['train_score'].mean()
mean_valid_score = scores['test_score'].mean()
print(mean_train_score, mean_valid_score)

# => This looks heavily overfit..?

0.9999317742483731 0.8631854333655673


In [10]:
# Fit the random forest on the training split and print its feature importances.
# TODO: compare these against a single decision tree.
importances = rf.fit(train_input, train_target).feature_importances_
print(importances)

[0.15611841 0.34996029 0.18674847 0.13793552 0.16923731]


In [12]:
# Rebuild the forest with out-of-bag scoring enabled, fit it, and print the OOB score.
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    oob_score=True,
).fit(train_input, train_target)
oob_score = rf.oob_score_
print(oob_score)

0.8672791776585099


In [13]:
# Train a random forest and measure its accuracy on the separate test split.
rf_clf = RandomForestClassifier(random_state=0).fit(train_input, train_target)

# Predict the held-out rows and score the predictions against the true labels.
pred = rf_clf.predict(test_input)
accuracy = accuracy_score(y_true=test_target, y_pred=pred)

report = '랜덤 포레스트 정확도: {0:.4f}'.format(accuracy)
print(report)

랜덤 포레스트 정확도: 0.8698


## 종합 코드

In [None]:
# 라이브러리 로드
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.inspection import permutation_importance
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

import time
import warnings
warnings.filterwarnings('ignore')

from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

import seaborn as sns
%matplotlib inline

# Load the down-sampled dataset from the project's local data directory.
csv_path = './data/down_sampling.csv'
df = pd.read_csv(csv_path, encoding='utf-8')

# Dataset: the selected measurement columns become the feature matrix and the
# '특보강도' column becomes the label vector.
feature_columns = ['WIND_SPEED', 'GUST_WIND', 'SIG_WAVE', 'MAX_WAVE', 'HPA']
label_column = '특보강도'

data = df[feature_columns].to_numpy()
target = df[label_column].to_numpy()

# Split into a training set and a test set (20% held out); the fixed seed keeps
# the split repeatable.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Run cross-validation on the training split and compare the mean train score
# with the mean validation score to check for overfitting.
rf = RandomForestClassifier(random_state=42, n_jobs=-1)
scores = cross_validate(
    rf,
    train_input,
    train_target,
    n_jobs=-1,
    return_train_score=True,
)
mean_train_score = scores['train_score'].mean()
mean_valid_score = scores['test_score'].mean()
print(mean_train_score, mean_valid_score)

# Fit the random forest on the training split and print its feature importances.
# TODO: compare these against a single decision tree.
importances = rf.fit(train_input, train_target).feature_importances_
print(importances)

# Rebuild the forest with out-of-bag scoring enabled, fit it, and print the OOB score.
rf = RandomForestClassifier(
    random_state=42,
    n_jobs=-1,
    oob_score=True,
).fit(train_input, train_target)
oob_score = rf.oob_score_
print(oob_score)

# Train a random forest and measure its accuracy on the separate test split.
rf_clf = RandomForestClassifier(random_state=0).fit(train_input, train_target)

# Predict the held-out rows and score the predictions against the true labels.
pred = rf_clf.predict(test_input)
accuracy = accuracy_score(y_true=test_target, y_pred=pred)

report = '랜덤 포레스트 정확도: {0:.4f}'.format(accuracy)
print(report)