#### 라이브러리 정의

In [None]:
import pandas as pd
import dask.dataframe as dd
import numpy as np

# - 파이썬에서 사용되는 기본 시각화 라이브러리
import matplotlib.pyplot as plt

from scipy.stats import spearmanr
from sklearn.model_selection import train_test_split

from sklearn import set_config
import joblib  # 모델 저장용

### Display and plotting configuration
# - Show scikit-learn estimators as plain text instead of the HTML diagram.
#   NOTE: this only changes how estimators are rendered in the notebook;
#   it does not silence warning messages.
from sklearn import set_config

set_config(display="text")

# - Matplotlib: use a font that can render Korean labels, and keep the
#   minus sign from rendering as a broken glyph when that font is active.
#   NOTE(review): "Malgun Gothic" is normally only installed on Windows —
#   confirm the font exists on the machine running this notebook.
plt.rcParams.update(
    {
        "font.family": "Malgun Gothic",
        "axes.unicode_minus": False,
    }
)

#### 데이터 읽어오기

In [2]:
# Load the merged Australia fire dataset and print its column/dtype summary.
australia_data = pd.read_csv(
    filepath_or_buffer="./data/australia_merged_fire.csv",
)
australia_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1175962 entries, 0 to 1175961
Data columns (total 29 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   latitude               1175962 non-null  float64
 1   longitude              1175962 non-null  float64
 2   brightness             1175962 non-null  float64
 3   bright_t31             1175962 non-null  float64
 4   frp                    1175962 non-null  float64
 5   T2M                    1175962 non-null  float64
 6   WS2M                   1175962 non-null  float64
 7   RH2M                   1175962 non-null  float64
 8   PRECTOTCORR            1175962 non-null  float64
 9   confidence_h           1175962 non-null  float64
 10  confidence_l           1175962 non-null  float64
 11  confidence_n           1175962 non-null  float64
 12  daynight_D             1175962 non-null  float64
 13  daynight_N             1175962 non-null  float64
 14  year              

In [3]:
# Per-column count of missing values (isna is the canonical name of isnull).
australia_data.isna().sum()

latitude                 0
longitude                0
brightness               0
bright_t31               0
frp                      0
T2M                      0
WS2M                     0
RH2M                     0
PRECTOTCORR              0
confidence_h             0
confidence_l             0
confidence_n             0
daynight_D               0
daynight_N               0
year                     0
month                    0
day                      0
season                   0
weekday                  0
WS2M_RH2M_interaction    0
high_temperature         0
precipitation_flag       0
T2M_binned_medium        0
T2M_binned_high          0
RH2M_binned_medium       0
RH2M_binned_high         0
acq_date                 0
geometry                 0
predicted_area_km2       0
dtype: int64

#### 스피어만 상관관계 검증

In [4]:
# Spearman rank-correlation screen of every float64 feature against the target.
# Columns of any other dtype are not covered by this check.
TARGET_COLUMN = "predicted_area_km2"
SIGNIFICANCE_LEVEL = 0.05            # p-value cut-off for "significant"
STRONG_CORRELATION_THRESHOLD = 0.2   # |rho| at or above this is reported as "strong"

# Independent variables (X): float64 columns only, target excluded
numeric_columns = australia_data.select_dtypes(include=['float64']).columns
X = australia_data[numeric_columns.drop(TARGET_COLUMN)]
y = australia_data[TARGET_COLUMN]

print("스피어만 상관관계 분석 결과 (결측값 제거 후):")
print("-" * 50)

for column in X.columns:
    # A constant column has no defined rank correlation; spearmanr would warn
    # and return NaN, which the comparisons below would silently report as
    # "not significant". Report it explicitly instead.
    if X[column].nunique(dropna=True) <= 1:
        print(f"({column})는 값이 일정하여 상관계수를 계산할 수 없다")
        continue

    statistic, pvalue = spearmanr(X[column], y)

    # Any other undefined result (e.g. NaN propagated from the inputs)
    if np.isnan(statistic) or np.isnan(pvalue):
        print(f"({column})는 상관계수를 계산할 수 없다 (NaN)")
        continue

    is_significant = pvalue < SIGNIFICANCE_LEVEL
    if is_significant and abs(statistic) >= STRONG_CORRELATION_THRESHOLD:
        print(f"상관관계 계수 {statistic:.3f}, p-value {pvalue:.4f} : ({column})는 predicted_area_km2에 대해 유의미하다 (강한 상관관계)")
    elif is_significant:
        print(f"상관관계 계수 {statistic:.3f}, p-value {pvalue:.4f} : ({column})는 predicted_area_km2에 대해 약한 상관관계 (유의미)")
    else:
        print(f"상관관계 계수 {statistic:.3f}, p-value {pvalue:.4f} : ({column})는 predicted_area_km2에 대해 유의미하지 않다")


스피어만 상관관계 분석 결과 (결측값 제거 후):
--------------------------------------------------
상관관계 계수 -0.112, p-value 0.0000 : (latitude)는 predicted_area_km2에 대해 약한 상관관계 (유의미)
상관관계 계수 -0.121, p-value 0.0000 : (longitude)는 predicted_area_km2에 대해 약한 상관관계 (유의미)
상관관계 계수 0.759, p-value 0.0000 : (brightness)는 predicted_area_km2에 대해 유의미하다 (강한 상관관계)
상관관계 계수 0.606, p-value 0.0000 : (bright_t31)는 predicted_area_km2에 대해 유의미하다 (강한 상관관계)
상관관계 계수 0.896, p-value 0.0000 : (frp)는 predicted_area_km2에 대해 유의미하다 (강한 상관관계)
상관관계 계수 0.115, p-value 0.0000 : (T2M)는 predicted_area_km2에 대해 약한 상관관계 (유의미)
상관관계 계수 0.074, p-value 0.0000 : (WS2M)는 predicted_area_km2에 대해 약한 상관관계 (유의미)
상관관계 계수 0.053, p-value 0.0000 : (RH2M)는 predicted_area_km2에 대해 약한 상관관계 (유의미)
상관관계 계수 0.004, p-value 0.0000 : (PRECTOTCORR)는 predicted_area_km2에 대해 약한 상관관계 (유의미)
상관관계 계수 0.305, p-value 0.0000 : (confidence_h)는 predicted_area_km2에 대해 유의미하다 (강한 상관관계)
상관관계 계수 0.159, p-value 0.0000 : (confidence_l)는 predicted_area_km2에 대해 약한 상관관계 (유의미)
상관관계 계수 -0.348, p-valu

  statistic, pvalue = spearmanr(X[value], y)


#### 훈련(6) : 검증(2) : 테스트(2) 데이터 분리

In [5]:
# Feature columns fed to the models. The order is kept as-is because it
# defines the column order of the training matrix.
# Labels follow the Spearman cell's own rule: "strong" means |rho| >= 0.2.
train_cols = [
    "frp",           # strong (0.896 in the recorded output)
    "brightness",    # strong (0.759)
    "bright_t31",    # strong (0.606)
    "confidence_h",  # strong (0.305)
    "confidence_l",  # weak (0.159 is below the 0.2 threshold)
    "confidence_n",  # rho = -0.348 in the recorded output
    "daynight_D",    # NOTE(review): result not visible in the recorded output — confirm
    "daynight_N",    # NOTE(review): result not visible in the recorded output — confirm
    "latitude",      # weak
    "longitude",     # weak
    "T2M",           # weak
    "WS2M",          # weak
    "RH2M",          # weak
    "PRECTOTCORR",   # weak
    # NOTE(review): no Spearman result is visible for the columns below;
    # the check only covers float64 columns — confirm they were tested.
    "year",
    "month",
    "day",
    "season",
    "weekday",
    "WS2M_RH2M_interaction",
    "high_temperature",
    "precipitation_flag",
]

# Feature matrix and regression target
train = australia_data[train_cols]
target = australia_data["predicted_area_km2"]

# Both splits share the same seed and shuffling so the partition is repeatable.
split_options = {"random_state": 42, "shuffle": True}

# Step 1: keep 60% for training, set 40% aside as a hold-out pool
train_input, holdout_input, train_target, holdout_target = train_test_split(
    train, target, test_size=0.4, **split_options
)

# Step 2: cut the hold-out pool in half -> 20% validation, 20% test
val_input, test_input, val_target, test_target = train_test_split(
    holdout_input, holdout_target, test_size=0.5, **split_options
)

# Report (features, target) shapes for train / validation / test
for features, labels in (
    (train_input, train_target),
    (val_input, val_target),
    (test_input, test_target),
):
    print(features.shape, labels.shape)

(705577, 22) (705577,)
(235192, 22) (235192,)
(235193, 22) (235193,)


#### 한 개의 모델(RandomForest)로 기준 성능 확인

In [6]:
from sklearn.ensemble import RandomForestRegressor

# Baseline: one RandomForestRegressor with default hyper-parameters.
# - random_state is fixed (same seed as the data split) so the reported
#   scores can be reproduced on a re-run.
# - n_jobs=-1 builds the trees on all available cores; it does not change
#   the fitted model when random_state is fixed.
rf = RandomForestRegressor(random_state=42, n_jobs=-1)
rf.fit(train_input, train_target)

# For regressors, score() is the coefficient of determination (R^2).
train_score = rf.score(train_input, train_target)
val_score = rf.score(val_input, val_target)
test_score = rf.score(test_input, test_target)

# The last field is the train-minus-validation gap, used as the overfitting indicator.
print(f"train_score: {train_score:.4f}, val_score: {val_score:.4f}, test_score: {test_score:.4f}, 과적합여부: {train_score - val_score:.4f}")

train_score: 0.9847, val_score: 0.8892, test_score: 0.8932, 과적합여부: 0.0954


#### 전체 모델 훈련 및 검증 평가

In [7]:
# Project-local helper class; its implementation is not visible in this notebook.
from model_class import Models

# Run the full model comparison on the train / validation / test splits.
# NOTE(review): judging by the recorded output, total_models tunes and trains
# RandomForest, HistGradientBoosting and XGB once per scaler option, prints
# train/validation/test scores, and saves each fitted model — confirm against
# model_class.
models = Models()
models.total_models(train_input, train_target, val_input, val_target, test_input, test_target)

Tuning and training RandomForest with None...
Best Parameters for RandomForest: {'n_estimators': 200, 'min_samples_split': 10, 'max_depth': 10}
-*** RandomForest with None ***-
훈련: 0.9137, 검증: 0.9032, 테스트: 0.9065, 과적합여부: 0.0105
사용 가능한 모델입니다 (일반화).

Saved RandomForest with None as final model.
Tuning and training HistGradientBoosting with None...
Best Parameters for HistGradientBoosting: {'max_iter': 100, 'max_depth': 3, 'learning_rate': 0.05}
-*** HistGradientBoosting with None ***-
훈련: 0.9024, 검증: 0.9024, 테스트: 0.9057, 과적합여부: -0.0000
과소적합으로 사용 불가능한 모델입니다.

Saved HistGradientBoosting with None as final model.
Tuning and training XGB with None...
Best Parameters for XGB: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.05}
-*** XGB with None ***-
훈련: 0.9025, 검증: 0.9022, 테스트: 0.9053, 과적합여부: 0.0002
사용 가능한 모델입니다 (일반화).

Saved XGB with None as final model.
Tuning and training RandomForest with Standard...
Best Parameters for RandomForest: {'n_estimators': 200, 'min_

#### 전체 결과를 데이터 프레임으로 확인 및 결과 저장

In [8]:
# Fetch the accumulated evaluation results and display them as the cell output.
# NOTE(review): the recorded output suggests one row per model/scaler
# combination with score, MAE, MSE and R^2 columns — confirm in model_class.
results_df = models.get_results()
results_df

Unnamed: 0,model_nm,Scaler,GSCV,train_score,val_score,test_score,과적합여부,사용,train_mae,train_mse,train_r2,val_mae,val_mse,val_r2,test_mae,test_mse,test_r2
0,RandomForest,,Y,0.913722,0.903247,0.906453,0.010475,Y,0.127366,0.168843,0.913722,0.130121,0.189629,0.903247,0.129811,0.185708,0.906453
1,HistGradientBoosting,,Y,0.902401,0.902437,0.905728,-3.6e-05,N,0.132231,0.190998,0.902401,0.131788,0.191216,0.902437,0.131458,0.187146,0.905728
2,XGB,,Y,0.902487,0.902244,0.90526,0.000243,Y,0.132191,0.190831,0.902487,0.131807,0.191594,0.902244,0.131566,0.188076,0.90526
3,RandomForest,Standard,Y,0.913721,0.903248,0.906442,0.010474,Y,0.127366,0.168845,0.913721,0.130118,0.189627,0.903248,0.129816,0.185729,0.906442
4,HistGradientBoosting,Standard,Y,0.902401,0.902437,0.905728,-3.6e-05,N,0.132231,0.190998,0.902401,0.131788,0.191216,0.902437,0.131458,0.187146,0.905728
5,XGB,Standard,Y,0.902487,0.902244,0.90526,0.000243,Y,0.132191,0.19083,0.902487,0.131807,0.191594,0.902244,0.131565,0.188074,0.90526
6,RandomForest,MinMax,Y,0.913723,0.903246,0.906448,0.010478,Y,0.127367,0.168841,0.913723,0.130122,0.189631,0.903246,0.129819,0.185716,0.906448
7,HistGradientBoosting,MinMax,Y,0.902401,0.902437,0.905728,-3.6e-05,N,0.132231,0.190998,0.902401,0.131788,0.191216,0.902437,0.131458,0.187146,0.905728
8,XGB,MinMax,Y,0.902487,0.902244,0.90526,0.000243,Y,0.132191,0.190831,0.902487,0.131807,0.191594,0.902244,0.131566,0.188076,0.90526
9,RandomForest,Robust,Y,0.913722,0.90325,0.906445,0.010473,Y,0.127365,0.168843,0.913722,0.130118,0.189623,0.90325,0.129819,0.185723,0.906445


In [9]:
# Keep only the rows whose "사용" (usable) flag is "Y", then display them.
is_usable = results_df["사용"].eq("Y")
filtered_df = results_df.loc[is_usable]
filtered_df

Unnamed: 0,model_nm,Scaler,GSCV,train_score,val_score,test_score,과적합여부,사용,train_mae,train_mse,train_r2,val_mae,val_mse,val_r2,test_mae,test_mse,test_r2
0,RandomForest,,Y,0.913722,0.903247,0.906453,0.010475,Y,0.127366,0.168843,0.913722,0.130121,0.189629,0.903247,0.129811,0.185708,0.906453
2,XGB,,Y,0.902487,0.902244,0.90526,0.000243,Y,0.132191,0.190831,0.902487,0.131807,0.191594,0.902244,0.131566,0.188076,0.90526
3,RandomForest,Standard,Y,0.913721,0.903248,0.906442,0.010474,Y,0.127366,0.168845,0.913721,0.130118,0.189627,0.903248,0.129816,0.185729,0.906442
5,XGB,Standard,Y,0.902487,0.902244,0.90526,0.000243,Y,0.132191,0.19083,0.902487,0.131807,0.191594,0.902244,0.131565,0.188074,0.90526
6,RandomForest,MinMax,Y,0.913723,0.903246,0.906448,0.010478,Y,0.127367,0.168841,0.913723,0.130122,0.189631,0.903246,0.129819,0.185716,0.906448
8,XGB,MinMax,Y,0.902487,0.902244,0.90526,0.000243,Y,0.132191,0.190831,0.902487,0.131807,0.191594,0.902244,0.131566,0.188076,0.90526
9,RandomForest,Robust,Y,0.913722,0.90325,0.906445,0.010473,Y,0.127365,0.168843,0.913722,0.130118,0.189623,0.90325,0.129819,0.185723,0.906445
11,XGB,Robust,Y,0.902487,0.902244,0.90526,0.000243,Y,0.132191,0.19083,0.902487,0.131807,0.191594,0.902244,0.131565,0.188074,0.90526
