In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score


In [2]:
train = pd.read_csv('use_this.csv')
train.head()

Unnamed: 0,stn4contest,v01,v02,v03,v04,v05,v06,v07,v08,v09,vv,class_interval,year,month,day,hour
0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,12
1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,15
2,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,18
3,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,1,21
4,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,2020,5,2,0


In [3]:
# 필요한 열 선택 및 새로운 특징 추가
features = train[['stn4contest', 'v01', 'v02', 'v03', 'v04', 'v05', 'v06', 'v07', 'v08', 'v09', 'year', 'month', 'day', 'hour']]
target = train['vv']

# 특징들만 스케일링
scaler = MinMaxScaler()
features_scaled = scaler.fit_transform(features.drop(columns=['stn4contest']))
features_scaled_df = pd.DataFrame(features_scaled, columns=features.columns[1:])
features_scaled_df['stn4contest'] = features['stn4contest'].values
features_scaled_df['vv'] = target.values  # y 값을 그대로 사용

# 관측소 기준으로 데이터 분리
train_data = features_scaled_df[features_scaled_df['stn4contest'] <= 16]
test_data = features_scaled_df[features_scaled_df['stn4contest'] > 16]

X_train = train_data.drop(columns=['vv']).values
y_train = train_data['vv'].values
X_test = test_data.drop(columns=['vv']).values
y_test = test_data['vv'].values

## LightGBM

In [10]:
import lightgbm as lgb
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# LightGBM 하이퍼파라미터 그리드 설정
lgb_params = {
    'num_leaves': [31, 50, 100],
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [100, 200, 500],
    'max_depth': [-1, 10, 20],
    'feature_fraction': [0.6, 0.8, 1.0],
    'bagging_fraction': [0.6, 0.8, 1.0]
}

# LightGBM 모델 정의
lgb_model = lgb.LGBMRegressor()

# GridSearchCV 설정
lgb_grid = GridSearchCV(lgb_model, lgb_params, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# 모델 학습
lgb_grid.fit(X_train, y_train)

# 최적 하이퍼파라미터 및 성능 출력
print(f"Best parameters found: {lgb_grid.best_params_}")
print(f"Best RMSE: {np.sqrt(-lgb_grid.best_score_)}")

# 최적 모델로 예측 및 성능 평가
lgb_best_model = lgb_grid.best_estimator_
y_pred = lgb_best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("LightGBM Model Performance Metrics:")
print(f"{'Metric':<10}{'Value':<20}")
print(f"{'-'*30}")
print(f"{'RMSE':<10}{rmse:<20.4f}")
print(f"{'MAE':<10}{mae:<20.4f}")
print(f"{'MSE':<10}{mse:<20.4f}")
print(f"{'R²':<10}{r2:<20.4f}")

Fitting 3 folds for each of 729 candidates, totalling 2187 fits
Best parameters found: {'bagging_fraction': 0.6, 'feature_fraction': 0.6, 'learning_rate': 0.01, 'max_depth': 10, 'n_estimators': 200, 'num_leaves': 100}
Best RMSE: 3.4910006957205826
LightGBM Model Performance Metrics:
Metric    Value               
------------------------------
RMSE      3.1925              
MAE       0.8822              
MSE       10.1918             
R²        0.3817              


In [20]:
import lightgbm as lgb

# 최적 하이퍼파라미터 설정
best_params = {
    'bagging_fraction': 0.6,
    'feature_fraction': 0.6,
    'learning_rate': 0.01,
    'max_depth': 10,
    'n_estimators': 200,
    'num_leaves': 100
}

# 모델 초기화 및 학습
lgb_model =lgb.LGBMRegressor(**best_params, random_state=42)
lgb_model.fit(X_train, y_train)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002754 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2360
[LightGBM] [Info] Number of data points in the train set: 66300, number of used features: 14
[LightGBM] [Info] Start training from score 0.834257


In [21]:
import pickle
with open('lgb_best_model.pkl', 'wb') as file:
    pickle.dump(lgb_model, file)

## Random Forest

In [4]:
from sklearn.ensemble import RandomForestRegressor

# RandomForest 하이퍼파라미터 그리드 설정
rf_params = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2']
}

# RandomForest 모델 정의
rf_model = RandomForestRegressor(random_state=42)

# GridSearchCV 설정
rf_grid = GridSearchCV(rf_model, rf_params, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# 모델 학습
rf_grid.fit(X_train, y_train)

# 최적 하이퍼파라미터 및 성능 출력
print(f"Best parameters found: {rf_grid.best_params_}")
print(f"Best RMSE: {np.sqrt(-rf_grid.best_score_)}")

# 최적 모델로 예측 및 성능 평가
rf_best_model = rf_grid.best_estimator_
y_pred = rf_best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Random Forest Model Performance Metrics:")
print(f"{'Metric':<10}{'Value':<20}")
print(f"{'-'*30}")
print(f"{'RMSE':<10}{rmse:<20.4f}")
print(f"{'MAE':<10}{mae:<20.4f}")
print(f"{'MSE':<10}{mse:<20.4f}")
print(f"{'R²':<10}{r2:<20.4f}")

NameError: name 'GridSearchCV' is not defined

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# RandomForest 하이퍼파라미터 그리드 설정
best_params = {
    'max_depth': 10,
    'max_features': 'sqrt',
    'min_samples_leaf': 4,
    'min_samples_split': 10,
    'n_estimators': 500
}

# 모델 초기화 및 학습
rf_model = RandomForestRegressor(**best_params, random_state=42)
rf_model.fit(X_train, y_train)

# 모델 평가
y_pred = rf_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("Random Forest Model Performance Metrics:")
print(f"{'Metric':<10}{'Value':<20}")
print(f"{'-'*30}")
print(f"{'RMSE':<10}{rmse:<20.4f}")
print(f"{'MAE':<10}{mae:<20.4f}")
print(f"{'MSE':<10}{mse:<20.4f}")
print(f"{'R²':<10}{r2:<20.4f}")

import joblib

Random Forest Model Performance Metrics:
Metric    Value               
------------------------------
RMSE      3.2128              
MAE       0.8416              
MSE       10.3221             
R²        0.3738              


NameError: name 'rf_best_model' is not defined

In [17]:
import pickle
with open('rf_best_model.pkl', 'wb') as file:
    pickle.dump(rf_model, file)

In [13]:
from catboost import CatBoostRegressor

# CatBoost 하이퍼파라미터 그리드 설정
cat_params = {
    'iterations': [100, 200, 500],
    'learning_rate': [0.01, 0.1],
    'depth': [4, 6, 10],
    'l2_leaf_reg': [1, 3, 5, 7, 9],
    'bagging_temperature': [0.5, 1, 2]
}

# CatBoost 모델 정의
cat_model = CatBoostRegressor(verbose=0)

# GridSearchCV 설정
cat_grid = GridSearchCV(cat_model, cat_params, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)

# 모델 학습
cat_grid.fit(X_train, y_train)

# 최적 하이퍼파라미터 및 성능 출력
print(f"Best parameters found: {cat_grid.best_params_}")
print(f"Best RMSE: {np.sqrt(-cat_grid.best_score_)}")

# 최적 모델로 예측 및 성능 평가
cat_best_model = cat_grid.best_estimator_
y_pred = cat_best_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print("CatBoost Model Performance Metrics:")
print(f"{'Metric':<10}{'Value':<20}")
print(f"{'-'*30}")
print(f"{'RMSE':<10}{rmse:<20.4f}")
print(f"{'MAE':<10}{mae:<20.4f}")
print(f"{'MSE':<10}{mse:<20.4f}")
print(f"{'R²':<10}{r2:<20.4f}")

Fitting 3 folds for each of 270 candidates, totalling 810 fits
Best parameters found: {'bagging_temperature': 0.5, 'depth': 6, 'iterations': 100, 'l2_leaf_reg': 9, 'learning_rate': 0.1}
Best RMSE: 3.4768424669118545
CatBoost Model Performance Metrics:
Metric    Value               
------------------------------
RMSE      3.2581              
MAE       0.8728              
MSE       10.6154             
R²        0.3560              


In [16]:
import joblib

# 모델 저장
joblib.dump(lgb_best_model, 'lgb_best_model.pkl')
joblib.dump(rf_best_model, 'rf_best_model.pkl')
joblib.dump(cat_best_model, 'cat_best_model.pkl')

print("모델이 성공적으로 저장되었습니다.")

모델이 성공적으로 저장되었습니다.


In [None]:
import joblib

joblib.dump(rf_best_model, 'rf_best_model.pkl')