In [22]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

plt.style.use('fivethirtyeight')
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
train = pd.read_csv('./train.csv')

In [23]:
# 결측치 확인하기
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   int64  
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [24]:
# 결측치 처리 
cols_none = [
    'PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu', 
    'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond',
    'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'MasVnrType'
]

# 0으로 채워야 하는 수치형 컬럼들
cols_zero = [
    'GarageArea', 'GarageCars', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
    'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath', 'MasVnrArea'
]

for col in cols_none:
    train[col] = train[col].fillna('None')

for col in cols_zero:
    train[col] = train[col].fillna(0)

In [33]:
# 예시 1 Quality Map : 4, 3 ,2 ,1 할당 하기. (파이썬)
test = train.copy()
quality_map = {
    'Ex': 4,  # Excellent
    'Gd': 3,  # Good
    'Ta': 2,  # Average (Typical/Average)
    'Fa': 1,  # Fair
    'Po': 0   # Poor (있다면)
}

# test 데이터에도 같은 방식으로
test['ExterQual'] = test['ExterQual'].map(quality_map)

In [34]:
# 나머지 수치형 결측치 (예: LotFrontage)는 중앙값으로 대체
test['LotFrontage'] = test['LotFrontage'].fillna(test['LotFrontage'].median())

# 나머지 범주형 결측치 (예: Electrical)는 최빈값으로 대체
test['Electrical'] = test['Electrical'].fillna(test['Electrical'].mode()[0])

In [35]:
# 3. Object 컬럼을 숫자로 변환 (Label Encoding)
# 의미 없는 부분.
# 범주형(object) 컬럼만 추출
object_cols = test.select_dtypes(include=['object']).columns

le_dict = {}  # 매핑 저장용
for col in object_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col])
    
    # 1. 변환된 값들 확인 (평균/최빈값)
    print(f"{col}: 평균={train[col].mean():.2f}, 최빈값={test[col].mode()[0]}")
    
    # 2. 매핑 저장
    le_dict[col] = dict(zip(le.classes_, range(len(le.classes_))))

print("\n=== Label Encoding 매핑 ===")
for col, mapping in le_dict.items():
    print(f"{col}: {mapping}")

In [38]:
# 3. 수치형 데이터만 선택 (상관계수 계산을 위해)
# 문자열(Object) 데이터가 섞여있으면 corr()에서 에러가 날 수 있으므로 수치형만 추출합니다.
numeric_df = test.select_dtypes(include=[np.number])

In [40]:
# 4. 상관계수 계산
corr_series = numeric_df.corr()['SalePrice'].sort_values(ascending=False)

In [41]:
# 5. 결과 출력 (상위 15개 핵심 변수)
print("--- SalePrice와 상관관계 ---")
print(corr_series) 

--- SalePrice와 상관관계 ---
SalePrice       1.000000
OverallQual     0.790982
GrLivArea       0.708624
Neighborhood    0.660349
GarageCars      0.640409
                  ...   
GarageType     -0.415283
GarageFinish   -0.425684
KitchenQual    -0.589189
BsmtQual       -0.593734
ExterQual            NaN
Name: SalePrice, Length: 81, dtype: float64
