# 1. 라이브러리

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import glob

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.linear_model import LogisticRegression
from xgboost import XGBRegressor
from sklearn.svm import SVR

from sklearn.metrics import classification_report, f1_score, r2_score, mean_squared_error

import warnings
warnings.filterwarnings('ignore')

# 2. 데이터 불러오기

In [2]:
df = pd.read_csv('./data/df_damage.csv')

df.drop(columns=['지점_번호', '지점명', '시간', '연도', '월', '일', '계절', '원인'],inplace=True)

# 3. 모델 설계

In [3]:
x = df.iloc[:, :-6]
y = df.iloc[:, -1]

x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.25, random_state=42)

In [4]:
xgb= XGBRegressor()

xgb_param_grid={
    'n_estimators' : [200, 300, 400, 500],
    'learning_rate' : [0.01, 0.05, 0.15],
    'max_depth' : [3,5, 10, 15],
    'gamma' : [2, 3],
    'colsample_bytree' : [0.8, 0.9],
}
gridsearch=GridSearchCV(XGBRegressor(), param_grid=xgb_param_grid, scoring='f1_macro', n_jobs=-1, verbose=2)
gridsearch.fit(x_train, y_train)

best = gridsearch.best_estimator_
best.fit(x_train, y_train)
y_pred = best.predict(x_test)
print('r2 score : ', r2_score(y_test, y_pred))

Fitting 5 folds for each of 192 candidates, totalling 960 fits
r2 score :  0.5997050165078193


# 4. 예측

In [5]:
print('r2 score : ', r2_score(y_test, y_pred))
print('mse : ', mean_squared_error(y_test, y_pred))

r2 score :  0.5997050165078193
mse :  1.070468260516186e+16


In [6]:
# 조정된 결정계수
def adj_r2(r2, n, p):
    return 1 - (1-r2)*(n-1) / (n-p-1)

In [7]:
adj_r2(r2_score(y_test, y_pred), len(x_test), len(x_test.columns))

0.5982801809300624

In [8]:
np.array(y_test)

array([0., 0., 0., ..., 0., 0., 0.])

In [9]:
y_pred

array([ -187318.47,  -187318.47,  -187318.47, ...,  -187318.47,
       16241922.  ,  -187318.47], dtype=float32)