In [2]:
import pandas as pd
import numpy as np

# Load the raw train / test tables.
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

# Separate the target: pop() removes "SalePrice" from `train` in place
# and returns it as a Series.
y = train.pop("SalePrice")

# Stack train on top of test so every preprocessing step is applied to both
# at once. The index is rebuilt, so the first len(y) rows are the train rows.
full = pd.concat((train, test), ignore_index=True)
print(full.shape)  # (2919, 80)

(2919, 80)


In [3]:
# Numeric imputation, one fill value per column.
# NOTE: the LotFrontage median is taken over the combined train + test rows
# of `full`, not over the train rows alone.
numeric_fill_values = {
    "LotFrontage": full["LotFrontage"].median(),
    "GarageYrBlt": 0,
    "MasVnrArea": 0,
}

for column_name, fill_value in numeric_fill_values.items():
    full[column_name] = full[column_name].fillna(fill_value)

In [4]:
# Columns whose missing entries are replaced by the literal string "None".
# NOTE(review): presumably NaN in these columns means "feature not present"
# rather than "unknown" -- confirm against the dataset's data description.
fill_none = [
    "Alley",
    "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
    "FireplaceQu",
    "GarageType", "GarageFinish", "GarageQual", "GarageCond",
    "PoolQC",
    "Fence",
    "MiscFeature",
    "MasVnrType",
]

full[fill_none] = full[fill_none].fillna("None")

# Columns whose missing entries are replaced by the column's most frequent value.
fill_mode = [
    "Electrical",
    "KitchenQual",
    "Exterior1st", "Exterior2nd",
    "SaleType",
    "MSZoning",
    "Functional",
]

# Each mode depends only on its own column, so all modes can be computed
# first and applied afterwards. mode() may return several values; the first
# one is used.
most_frequent = {name: full[name].mode().iloc[0] for name in fill_mode}

for name, value in most_frequent.items():
    full[name] = full[name].fillna(value)

In [5]:
# Log-transform the target: y_log = ln(1 + SalePrice). `y` itself is left
# untouched; np.expm1 is the mathematical inverse if values on the original
# scale are needed again.
y_log = np.log1p(y)  # log(1 + y)

In [6]:
# One-hot encode the categorical columns of the combined frame; numeric
# columns pass through unchanged. The result replaces `full`, so the original
# string-valued columns are no longer available after this cell.
full = pd.get_dummies(full)  # encodes every object-dtype column automatically
print(full.shape)  # confirm the increase in the number of columns

(2919, 303)


In [7]:
# `full` is the combined train + test frame.
# Flag every column that still holds at least one missing value.
has_missing = full.isna().any(axis=0)
missing_cols = full.columns[has_missing]
print("결측치가 남아 있는 열들:", list(missing_cols))

# Number of missing values in each of those columns.
remaining_counts = full.loc[:, missing_cols].isna().sum()
print(remaining_counts)

결측치가 남아 있는 열들: ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'GarageCars', 'GarageArea']
BsmtFinSF1      1
BsmtFinSF2      1
BsmtUnfSF       1
TotalBsmtSF     1
BsmtFullBath    2
BsmtHalfBath    2
GarageCars      1
GarageArea      1
dtype: int64


In [None]:
# Basement / garage numeric columns that the audit above reported as still
# containing missing values; these are filled with 0.
cols_fill_zero = [
    "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF",
    "BsmtFullBath", "BsmtHalfBath",
    "GarageCars", "GarageArea",
]

# Fill all listed columns in one assignment.
full[cols_fill_zero] = full[cols_fill_zero].fillna(0)

In [None]:
# `full` was built as train rows followed by test rows, so the number of
# target values marks the boundary between the two.
n_train_rows = len(y)
X_train, X_test = full.iloc[:n_train_rows], full.iloc[n_train_rows:]

# Write both feature matrices to disk without the row index.
for frame, csv_path in (
    (X_train, "./data/X_train.csv"),
    (X_test, "./data/X_test.csv"),
):
    frame.to_csv(csv_path, index=False)