# In [None]:
# 這是一個Jupyter Notebook，可以在notebooks/目錄下創建

# %% [markdown]
# # 房價數據探索分析
# 
# 這是期中報告的EDA部分

# %%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning so the cell output stays uncluttered.
# NOTE(review): this blanket filter also hides deprecation and
# missing-glyph warnings; consider narrowing it once the notebook is stable.
warnings.filterwarnings('ignore')

# Font candidates for the Chinese plot labels. Matplotlib picks the first
# family in this list that is installed, so several common CJK-capable
# families are listed before the final fallback. Which of them exist depends
# on the machine; the platform notes below are typical, not guaranteed.
plt.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',    # typically macOS
    'Heiti TC',            # typically macOS
    'Microsoft JhengHei',  # typically Windows
    'Noto Sans CJK TC',    # typically Linux
    'Noto Sans CJK JP',    # name Matplotlib often reports for the Noto CJK collection
    'WenQuanYi Zen Hei',   # typically Linux
    'DejaVu Sans',         # last resort: ships with Matplotlib but has no CJK glyphs
]
# Draw negative tick labels with the ASCII hyphen instead of the Unicode
# minus sign, which many CJK fonts do not contain.
plt.rcParams['axes.unicode_minus'] = False

# %%
# Load the training and test sets.
# The paths are relative, so they are resolved against the current working
# directory of the kernel.
train_csv = '../data/train.csv'
test_csv = '../data/test.csv'

train = pd.read_csv(train_csv)
test = pd.read_csv(test_csv)

# Report (rows, columns) for both frames as a quick sanity check.
print(f"訓練集: {train.shape}")
print(f"測試集: {test.shape}")

# %%
# Show the first few rows of the training set.
# The bare name on the last line is what the notebook renders as cell output.
first_rows = train.head()
first_rows

# %%
# Basic descriptive statistics of the training set.
# The bare name on the last line is what the notebook renders as cell output.
summary_stats = train.describe()
summary_stats

# %%
# Missing-value analysis: count the nulls per column and keep only the
# columns that actually have some.
missing = train.isnull().sum()
missing = missing[missing > 0]
print(f"有 {len(missing)} 個特徵有缺失值")

# Draw the chart only when at least one column has missing values; an empty
# Series gives the bar plot nothing to draw.
if not missing.empty:
    plt.figure(figsize=(10, 6))
    # Ascending sort followed by tail(20) keeps the 20 largest counts.
    missing.sort_values(ascending=True).tail(20).plot(kind='barh')
    plt.title('缺失值最多的20個特徵')
    plt.tight_layout()
    plt.show()

# %%
# Target variable analysis: histogram of SalePrice next to the histogram of
# its log1p transform, side by side in one figure.
fig, (ax_raw, ax_log) = plt.subplots(1, 2, figsize=(12, 4))

sale_price = train['SalePrice']

# Left panel: prices as recorded.
ax_raw.hist(sale_price, bins=50, edgecolor='black', alpha=0.7)
ax_raw.set_title('SalePrice 原始分佈')
ax_raw.set_xlabel('價格 ($)')
ax_raw.set_ylabel('頻數')

# Right panel: prices after log(1 + x).
ax_log.hist(np.log1p(sale_price), bins=50, edgecolor='black', alpha=0.7, color='orange')
ax_log.set_title('SalePrice 對數轉換後分佈')
ax_log.set_xlabel('log(價格)')
ax_log.set_ylabel('頻數')

fig.tight_layout()
plt.show()

# %%
# Correlation analysis over the numeric columns only.
numeric_cols = train.select_dtypes(include=[np.number]).columns
correlation = train[numeric_cols].corr()

# Correlation of every numeric column with SalePrice, strongest positive first.
corr_with_price = correlation['SalePrice'].sort_values(ascending=False)
print("與SalePrice最相關的10個特徵:")
# SalePrice correlates perfectly with itself, so it would occupy the first of
# the ten printed slots and leave room for only nine real features. Exclude
# it from the printout; corr_with_price itself is left unchanged.
print(corr_with_price.drop('SalePrice').head(10))

# %%
# Visualise key features: one scatter plot of each feature against SalePrice.
key_features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']

fig, axes = plt.subplots(2, 3, figsize=(15, 10))
axes = axes.flatten()

# zip stops at the shorter sequence, so a feature is never drawn without a
# matching subplot.
for ax, feature in zip(axes, key_features):
    ax.scatter(train[feature], train['SalePrice'], alpha=0.3)
    ax.set_xlabel(feature)
    ax.set_ylabel('SalePrice')
    ax.set_title(f'{feature} vs SalePrice')

# Remove the subplots that received no feature.
for unused_ax in axes[len(key_features):]:
    fig.delaxes(unused_ax)

plt.tight_layout()
plt.show()