# Clean 데이터 학습
## 결측치가 없는(모델, 상태, 지역 정보가 모두 있는) 데이터로 회귀 모델 학습

### 0. 의존성 관리

In [1]:
import os, glob, math, warnings, random
from typing import List
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from PIL import Image, ImageOps

from lazypredict.Supervised import LazyRegressor

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms
from sklearn.model_selection import train_test_split
from torchvision.transforms.functional import InterpolationMode
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

### 1. 결측치가 없는 데이터로 변환

In [2]:
# CSV 로드
CSV_PATH = "../../csv/data_0826.csv"
df = pd.read_csv(CSV_PATH)

# 제거 대상 컬럼 지정
required_cols = ["model", "condition"]

# 결측치 제거 전 행 수
print("Before:", len(df))

# 해당 컬럼들 중 하나라도 결측치(NaN)이면 제거
df = df.dropna(subset=required_cols)

# 결측치 제거 후 행 수
print("After:", len(df))

# 확인
print(df[required_cols].isna().sum())

# clean data 다시 저장
df.to_csv("../../csv/data_regression_clean.csv", index=False)

Before: 5127
After: 1391
model        0
condition    0
dtype: int64


### 2. 데이터 벡터화(첫번째 이미지 + 모델 + 모델 등급 + 상태 + 지역) 하여 train set / test set 구분

In [None]:
# -*- coding: utf-8 -*-
# pip install torch torchvision pandas pillow numpy scikit-learn pyarrow
import os, glob, warnings
from typing import Optional
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from PIL import Image, ImageOps

import torch
import torch.nn as nn
from torchvision import models, transforms
from torchvision.transforms.functional import InterpolationMode

# ========= 사용자 설정 =========
CSV_PATH   = "../../csv/data_regression_clean.csv"   # CSV: id, location, model, model_type, condition
IMG_ROOT   = "../../data/regression_clean_images"    # 각 id 폴더가 존재
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# ========= 530×690으로 자르는 함수 =========
def make_rect_530x690(img: Image.Image) -> Image.Image:
    """
    입력 이미지 -> 중앙 크롭으로 타깃 비율(530:690) 맞춘 뒤, 정확히 530×690으로 리사이즈.
    * 왜 중앙 크롭? 패딩보다 왜곡/여백 없이 정보 밀도를 유지하기 쉬움.
    """
    img = ImageOps.exif_transpose(img)  # EXIF 회전 보정
    w, h = img.size
    target_w, target_h = 530, 690
    target_ratio = target_w / target_h   # ≈ 0.768 (세로가 더 긴 비율)
    cur_ratio = w / h

    if cur_ratio > target_ratio:
        # 현재가 더 가로로 넓음 → 가로를 잘라서 비율 맞추기
        new_w = int(h * target_ratio)
        left  = (w - new_w) // 2
        img = img.crop((left, 0, left + new_w, h))
    elif cur_ratio < target_ratio:
        # 현재가 더 세로로 김 → 세로를 잘라서 비율 맞추기
        new_h = int(w / target_ratio)
        top   = (h - new_h) // 2
        img = img.crop((0, top, w, top + new_h))
    # 이제 비율이 타깃과 동일 → 정확히 530×690으로 리사이즈
    return img.resize((target_w, target_h), Image.BICUBIC)

# ========= 임베딩용 전처리 =========
# 1) 530×690으로 맞춤(위 함수)
tfm = transforms.Compose([
    transforms.Lambda(lambda im: make_rect_530x690(im)),
    transforms.Resize(224, interpolation=InterpolationMode.BICUBIC, antialias=True),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std =[0.229, 0.224, 0.225]),
])

# ========= 임베딩 모델 준비(ResNet18, FC 제거) =========
device = "cuda" if torch.cuda.is_available() else "cpu"
resnet = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)
feature_dim = resnet.fc.in_features  # 512
resnet.fc = nn.Identity()            # 분류기 제거 → 512차 임베딩
resnet.eval().to(device)

# ========= 폴더 첫 이미지 선택 & 임베딩 =========
IMG_EXTS = (".jpg", ".jpeg", ".png")

def first_image_path(folder: str) -> Optional[str]:
    if not os.path.isdir(folder):
        return None
    paths = []
    for ext in IMG_EXTS:
        paths += glob.glob(os.path.join(folder, f"*{ext}"))
        paths += glob.glob(os.path.join(folder, f"*{ext.upper()}"))
    if not paths:
        return None
    return sorted(set(paths))[0]

@torch.no_grad()
def embed_first_image(path: Optional[str]) -> np.ndarray:
    if path is None:
        return np.zeros(feature_dim, dtype=np.float32)
    try:
        with Image.open(path) as im:
            im = im.convert("RGB")
    except Exception:
        return np.zeros(feature_dim, dtype=np.float32)
    x = tfm(im).unsqueeze(0).to(device)        # (1,3,224,224)
    feat = resnet(x).squeeze(0).cpu().numpy()  # (512,)
    return feat.astype(np.float32)

# 모든 id에 대해: 폴더의 첫 이미지 1장만 임베딩
ids = df["id"].astype(str).tolist()
img_feats = []
for i, item_id in enumerate(ids, 1):
    p0 = first_image_path(os.path.join(IMG_ROOT, item_id))
    img_feats.append(embed_first_image(p0))
    if i % 200 == 0:
        print(f"[임베딩] {i}/{len(ids)}")

img_feats = np.vstack(img_feats)                # (N, 512)
img_cols = [f"img_{k}" for k in range(img_feats.shape[1])]
df_img = pd.DataFrame(img_feats, columns=img_cols)

# ========= 벡터 결합: (원핫 4컬럼) + (이미지512) =========
df_vec = pd.concat([df.reset_index(drop=True), df_img], axis=1)

cat_cols = ["location", "model", "model_type", "condition"]
for c in cat_cols:
    df_vec[c] = df_vec[c].astype(str).str.strip()  # 간단 정규화

X_cat = pd.get_dummies(df_vec[cat_cols], drop_first=False)
X_img = df_vec[img_cols]
X_all = pd.concat([X_cat, X_img], axis=1)

print("원핫 feature 수:", X_cat.shape[1])
print("이미지 feature 수:", X_img.shape[1])     # 512
print("총 feature 수   :", X_all.shape[1])      # 564 = 원핫 + 512


[임베딩] 200/1391
[임베딩] 400/1391
[임베딩] 600/1391
[임베딩] 800/1391
[임베딩] 1000/1391
[임베딩] 1200/1391
원핫 feature 수: 52
이미지 feature 수: 512
총 feature 수   : 564


In [4]:
# ========= 4) Train / Test split & 학습/평가 (HGBR + log 타깃) =========
import numpy as np, math
from sklearn.model_selection import train_test_split
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib

# 안전체크: price 컬럼 존재
if "price" not in df_vec.columns:
    raise ValueError("CSV에 'price' 컬럼이 필요합니다. df_vec에 price가 없습니다.")

# 입력/타깃 준비
X = X_all.values.astype(np.float32)              # (N, D)
y = df_vec["price"].astype(float).values         # (N,)

# 학습/검증 분리
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_SEED
)

print("X_train:", X_train.shape, "X_test:", X_test.shape)

# 모델: 트리 계열 회귀 + 로그 타깃 감싸기(스케일 안정화)
base_reg = HistGradientBoostingRegressor(
    learning_rate=0.08,
    max_leaf_nodes=31,
    min_samples_leaf=20,
    random_state=RANDOM_SEED
)

model = TransformedTargetRegressor(
    regressor=base_reg,
    func=np.log1p,        # y -> log1p(y)로 학습
    inverse_func=np.expm1 # 예측을 원래 스케일로 복원
)

# 학습
model.fit(X_train, y_train)

# 예측 & 평가
pred = model.predict(X_test)
mae  = mean_absolute_error(y_test, pred)
rmse = math.sqrt(mean_squared_error(y_test, pred))
r2   = r2_score(y_test, pred)

print(f"[Eval] MAE={mae:,.2f} | RMSE={rmse:,.2f} | R²={r2:.4f}")

# (선택) 단순 평균 예측과 비교해 보기 (baseline)
y_mean_pred = np.full_like(y_test, y_train.mean(), dtype=float)
rmse_mean = math.sqrt(mean_squared_error(y_test, y_mean_pred))
r2_mean   = r2_score(y_test, y_mean_pred)
print(f"[Baseline: 평균] RMSE={rmse_mean:,.2f} | R²={r2_mean:.4f}")  # R²는 항상 0.0 근처

# (선택) 모델 저장 - scikit-learn은 .joblib 권장
joblib.dump(model, "price_regressor_hgbr.joblib")
print("저장 완료: price_regressor_hgbr.joblib")

# (선택) 로드/사용 예시
# loaded = joblib.load("price_regressor_hgbr.joblib")
# y_pred = loaded.predict(X_test[:5])


X_train: (1112, 564) X_test: (279, 564)
[Eval] MAE=118,127.42 | RMSE=208,631.71 | R²=0.2294
[Baseline: 평균] RMSE=613,777.48 | R²=-5.6690
저장 완료: price_regressor_hgbr.joblib


### 3. AUTO ML 및 Optuna을 통해 모델 추천 및 하이퍼 파라미터 추천

In [None]:
# ====== AutoML: LazyRegressor ======
lzr = LazyRegressor(verbose=0, ignore_warnings=True, custom_metric=None, random_state=42)
models_df, preds_df = lzr.fit(X_train, X_test, y_train, y_test)

# 성능표 확인 (RMSE/MAE/R2 등). 상위 10개만 보기
print(models_df.head(10))

  0%|          | 0/42 [00:00<?, ?it/s]

### 4. 벡터화된 데이터로 추천 모델 학습

In [None]:
# ========= 4) PyTorch Dataset/Dataloader =========
class TabImageDS(Dataset):
    def __init__(self, X, y):
        self.X = torch.from_numpy(X)  # float32
        self.y = torch.from_numpy(y).view(-1, 1)  # (N, 1)
    def __len__(self):
        return self.X.shape[0]
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = TabImageDS(X_train, y_train)
test_ds  = TabImageDS(X_test,  y_test)
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
test_loader  = DataLoader(test_ds,  batch_size=BATCH_SIZE, shuffle=False)

# ========= 5) 간단한 MLP 회귀 모델 =========
in_dim = X_all.shape[1]
class PriceRegressor(nn.Module):
    def __init__(self, in_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )
    def forward(self, x):
        return self.net(x)

model = PriceRegressor(in_dim).to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LR)

# ========= 6) 학습 루프 =========
def evaluate(loader):
    model.eval()
    preds, gts = [], []
    with torch.no_grad():
        for xb, yb in loader:
            xb, yb = xb.to(device), yb.to(device)
            out = model(xb)
            preds.append(out.squeeze(1).cpu().numpy())
            gts.append(yb.squeeze(1).cpu().numpy())
    preds = np.concatenate(preds)
    gts   = np.concatenate(gts)
    mae  = mean_absolute_error(gts, preds)
    rmse = math.sqrt(((gts - preds) ** 2).mean())
    r2   = r2_score(gts, preds)
    return mae, rmse, r2

best_rmse = float("inf")
for epoch in range(1, EPOCHS + 1):
    model.train
    total_loss = 0.0
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        optimizer.zero_grad()
        out = model(                                            asdasdasdasdaddaasdasdadsdasdssssxb)
        loss = criterion(out, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item() * xb.size(0)

    train_mae, train_rmse, train_r2 = evaluate(train_loader)
    val_mae,   val_rmse,   val_r2   = evaluate(test_loader)
    print(f"[{epoch:02d}] loss={total_loss/len(train_ds):.4f} "
        f"| Train RMSE={train_rmse:.2f} R2={train_r2:.3f} "
        f"| Val RMSE={val_rmse:.2f} R2={val_r2:.3f}")

    # 간단한 베스트 체크(검증 RMSE 개선 시 가중치 임시 저장)
    if val_rmse < best_rmse:
        best_rmse = val_rmse
        torch.save(model.state_dict(), "price_regressor_best.pth")

# 마지막 에폭 가중치도 저장(원하면 둘 중 하나만 쓰면 됨)
torch.save(model.state_dict(), "price_regressor.pth")
print("가중치 저장 완료: price_regressor_best.pth / price_regressor.pth")

# ========= 7) 로드 방법(예시) =========
# model = PriceRegressor(in_dim)
# model.load_state_dict(torch.load("price_regressor_best.pth", map_location="cpu"))
# model.eval()

571
[01] loss=374563173588081.7500 | Train RMSE=19353632.51 R2=-0.002 | Val RMSE=269349.38 R2=-0.860
[02] loss=374563004046904.8750 | Train RMSE=19353624.71 R2=-0.002 | Val RMSE=269252.85 R2=-0.859
[03] loss=374562681069171.6250 | Train RMSE=19353616.90 R2=-0.002 | Val RMSE=269097.77 R2=-0.857
[04] loss=374562264847921.5625 | Train RMSE=19353603.90 R2=-0.002 | Val RMSE=268857.79 R2=-0.854
[05] loss=374561713573858.6250 | Train RMSE=19353581.36 R2=-0.002 | Val RMSE=268495.03 R2=-0.849
[06] loss=374560827935637.5625 | Train RMSE=19353552.76 R2=-0.002 | Val RMSE=268022.85 R2=-0.842
[07] loss=374559506068968.1250 | Train RMSE=19353517.21 R2=-0.002 | Val RMSE=267422.88 R2=-0.834
[08] loss=374557996616715.0000 | Train RMSE=19353460.87 R2=-0.002 | Val RMSE=266507.16 R2=-0.821
[09] loss=374556140479976.1250 | Train RMSE=19353394.98 R2=-0.002 | Val RMSE=265393.96 R2=-0.806
[10] loss=374553125522677.9375 | Train RMSE=19353315.23 R2=-0.002 | Val RMSE=264037.35 R2=-0.788
[11] loss=374549900398518.