In [None]:
import pandas as pd
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.font_manager as fm
from sklearn.preprocessing import LabelEncoder, StandardScaler
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
import tempfile

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Colab environment: mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Korean font setup: install the Nanum font packages via a shell command, then make
# NanumGothic matplotlib's default font family.
!apt-get -qq install fonts-nanum*

# NOTE(review): matplotlib.font_manager is already imported as `fm` in the notebook's import cell.
import matplotlib.font_manager as fm
font_path = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
# Register the freshly installed font file so the family name used below can be resolved.
fm.fontManager.addfont(font_path)
plt.rc('font', family='NanumGothic')

Selecting previously unselected package fonts-nanum.
(Reading database ... 124926 files and directories currently installed.)
Preparing to unpack .../fonts-nanum_20200506-1_all.deb ...
Unpacking fonts-nanum (20200506-1) ...
Selecting previously unselected package fonts-nanum-coding.
Preparing to unpack .../fonts-nanum-coding_2.5-3_all.deb ...
Unpacking fonts-nanum-coding (2.5-3) ...
Selecting previously unselected package fonts-nanum-eco.
Preparing to unpack .../fonts-nanum-eco_1.000-7_all.deb ...
Unpacking fonts-nanum-eco (1.000-7) ...
Selecting previously unselected package fonts-nanum-extra.
Preparing to unpack .../fonts-nanum-extra_20200506-1_all.deb ...
Unpacking fonts-nanum-extra (20200506-1) ...
Setting up fonts-nanum-extra (20200506-1) ...
Setting up fonts-nanum (20200506-1) ...
Setting up fonts-nanum-coding (2.5-3) ...
Setting up fonts-nanum-eco (1.000-7) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...


In [None]:
# Load the training and test CSV files from the project folder on the mounted Drive.
train = pd.read_csv('/content/drive/MyDrive/KUBIG_2025_WINTER_ML/train.csv')
test = pd.read_csv('/content/drive/MyDrive/KUBIG_2025_WINTER_ML/test.csv')

## 결측치 처리 - 배터리용량 mean

In [None]:
# Missing-value handling: fill gaps in the battery-capacity column ('배터리용량')
# with the training-set mean.
# The mean is computed once, from the observed training values (NaN is skipped by
# Series.mean), before either frame is modified. Previously the test fill value was
# recomputed from the already-imputed train column, which made it depend on statement
# order and could differ from the train fill value by floating-point rounding.
battery_mean = train['배터리용량'].mean()
train = train.assign(배터리용량=train['배터리용량'].fillna(battery_mean))
test = test.assign(배터리용량=test['배터리용량'].fillna(battery_mean))

In [None]:
# Separate the target column from the predictors. The test frame is copied so the
# column assignments made to x_test later on do not touch the loaded `test` frame.
target_col = '가격(백만원)'
y_train = train[target_col]
x_train = train.drop(columns=[target_col])
x_test = test.copy()

### 범주형 데이터 인코딩 - LabelEncoder

In [None]:
# Label-encode the categorical (object-dtype) columns.
# Each encoder is fitted on the training column only; values that occur solely in the
# test column are appended to the encoder's classes (in sorted order) before the test
# column is transformed, so they receive codes after all training categories.
categorical_features = [name for name, dtype in x_train.dtypes.items() if dtype == 'object']

for col in categorical_features:
    le = LabelEncoder()
    x_train[col] = le.fit_transform(x_train[col])
    # Sorted unique test values that the fitted encoder has not seen.
    unseen = np.setdiff1d(np.unique(x_test[col]), le.classes_)
    if unseen.size:
        le.classes_ = np.concatenate([le.classes_, unseen])
    x_test[col] = le.transform(x_test[col])

### scaling

In [None]:
# Standardize the features: the scaler is fitted on the training features only and the
# same fitted transformation is then applied to both the training and the test features.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Hyperparameter grid: 2 * 3 * 3 * 3 = 54 candidate combinations.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Exhaustive grid search with 5-fold cross-validation, scored by negative MSE.
rf_model = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf_model,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    verbose=2,
)

# Train: run the search (fit returns the search object itself) and keep the best estimator.
best_model = grid_search.fit(x_train_scaled, y_train).best_estimator_

Fitting 5 folds for each of 54 candidates, totalling 270 fits


In [None]:
# RMSE of the selected model on the data it was trained on
# (an in-sample figure, not a held-out estimate).
y_pred_train = best_model.predict(x_train_scaled)
mse_train = mean_squared_error(y_train, y_pred_train)
rmse_train = np.sqrt(mse_train)
print(f"훈련 데이터 RMSE: {rmse_train:.4f}")

훈련 데이터 RMSE: 1.0674


In [None]:
# 테스트 데이터 예측 수행
pred = best_model.predict(x_test_scaled)

# 결과 저장
submit = pd.read_csv('/content/drive/MyDrive/KUBIG_2025_WINTER_ML/sample_submission.csv')
submit['가격(백만원)'] = pred
submit.to_csv('/content/drive/MyDrive/KUBIG_2025_WINTER_ML/submission_rf_0219.csv', index=False)

print("제출 파일이 성공적으로 저장되었습니다.")

제출 파일이 성공적으로 저장되었습니다.
