In [None]:
# 1. 匯入套件
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Chart styling: apply seaborn's "whitegrid" style to the plots drawn below.
sns.set(style="whitegrid")

# 2. Load the data
df = pd.read_csv('data/housing.csv')

# --- Quick EDA (exploratory data analysis) ---
print("資料集形狀:", df.shape)
print("\n欄位資訊:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line after the report.
df.info()

# Visualization: place every row at its longitude/latitude and colour it by
# median_house_value, giving a geospatial view of the price distribution.
fig, ax = plt.subplots(figsize=(10, 6))
sc = ax.scatter(
    df['longitude'],
    df['latitude'],
    c=df['median_house_value'],
    cmap="jet",
    alpha=0.4,
    label='Price',
)
# The colorbar maps the point colours back to house values.
fig.colorbar(sc, ax=ax, label='Median House Value')
ax.set_title("California Housing Prices - Geospatial Data")
ax.set_xlabel("Longitude")
ax.set_ylabel("Latitude")
ax.legend()
plt.show()

# --- Data preprocessing ---

# Missing values: total_bedrooms has a few gaps; fill them with the column median.
# NOTE: the median is computed on the full dataset, i.e. before the train/test
# split below, so test rows also contribute to the imputation value.
median_bedrooms = df['total_bedrooms'].median()
# Assign the filled column back instead of calling fillna(inplace=True) on the
# df['total_bedrooms'] selection: that form is chained assignment, which pandas
# warns about and which leaves df unchanged when Copy-on-Write is active.
df['total_bedrooms'] = df['total_bedrooms'].fillna(median_bedrooms)

# Categorical variable: ocean_proximity (would need numeric / one-hot encoding).
# To keep the demo simple this non-numeric column is dropped for now
# (the more advanced approach is to use OneHotEncoder).
df_numeric = df.drop('ocean_proximity', axis=1)

# Define the features (X) and the target (y).
X = df_numeric.drop('median_house_value', axis=1)
y = df_numeric['median_house_value']

# Split into training and test sets (80% train, 20% test); the fixed
# random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Modeling ---

# Create the linear regression model and fit it on the training split in a
# single expression; fit() returns the estimator, so `model` is the trained model.
model = LinearRegression().fit(X_train, y_train)

# --- Evaluation ---

# Predict the target for the held-out test features.
predictions = model.predict(X_test)

# Compute RMSE (root mean squared error) as the square root of the MSE.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)

print(f"\n模型評估結果 (RMSE): ${rmse:,.2f}")
# Derive the summary figure from the computed RMSE (1 萬 = 10,000) rather than
# hard-coding it, so the sentence stays true if the data or the split changes.
print(f"這代表我們的預測平均誤差大約是 {rmse / 10_000:.1f} 萬美元左右。")