In [42]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

读取数据集

In [43]:
housing_data = pd.read_csv('housing.csv')
housing_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20640 entries, 0 to 20639
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   longitude           20640 non-null  float64
 1   latitude            20640 non-null  float64
 2   housing_median_age  20640 non-null  float64
 3   total_rooms         20640 non-null  float64
 4   total_bedrooms      20433 non-null  float64
 5   population          20640 non-null  float64
 6   households          20640 non-null  float64
 7   median_income       20640 non-null  float64
 8   median_house_value  20640 non-null  float64
 9   ocean_proximity     20640 non-null  object 
dtypes: float64(9), object(1)
memory usage: 1.6+ MB


**longitude**：经度，表示住房所在位置的经度坐标。   
**latitude**：纬度，表示住房所在位置的纬度坐标。   
**housing_median_age**：中位数房龄，表示该地区住房的中位年龄。  
**total_rooms**：总房间数，表示该地区所有住房中的房间总数。  
**total_bedrooms**：总卧室数，表示该地区所有住房中的卧室总数。  
**population**：人口数，表示该地区的人口数量。    
**households**：家庭数量，表示该地区的家庭总数。   
**median_income**：中位数收入，表示该地区家庭的中位收入水平。    
**median_house_value**：中位数房价，表示该地区住房的中位价值。    
**ocean_proximity**：海洋邻近度，表示住房与海洋的距离。  

In [44]:
len(set(housing_data['ocean_proximity']))

5

因为第9列ocean_proximity是类别变量，且ocean_proximity的类别只有5种，不会造成维度爆炸  
所以这里选择使用独热编码处理ocean_proximity

In [45]:
# 对第9列（'ocean_proximity'）进行独热编码
ocean_proximity_dummies = pd.get_dummies(housing_data['ocean_proximity'], prefix='ocean_proximity')

# 将独热编码后的DataFrame与原始DataFrame合并
data_encoded = pd.concat([housing_data.drop('ocean_proximity', axis=1), ocean_proximity_dummies], axis=1)

In [46]:
data_encoded

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value,ocean_proximity_<1H OCEAN,ocean_proximity_INLAND,ocean_proximity_ISLAND,ocean_proximity_NEAR BAY,ocean_proximity_NEAR OCEAN
0,-122.23,37.88,41.0,880.0,129.0,322.0,126.0,8.3252,452600.0,False,False,False,True,False
1,-122.22,37.86,21.0,7099.0,1106.0,2401.0,1138.0,8.3014,358500.0,False,False,False,True,False
2,-122.24,37.85,52.0,1467.0,190.0,496.0,177.0,7.2574,352100.0,False,False,False,True,False
3,-122.25,37.85,52.0,1274.0,235.0,558.0,219.0,5.6431,341300.0,False,False,False,True,False
4,-122.25,37.85,52.0,1627.0,280.0,565.0,259.0,3.8462,342200.0,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20635,-121.09,39.48,25.0,1665.0,374.0,845.0,330.0,1.5603,78100.0,False,True,False,False,False
20636,-121.21,39.49,18.0,697.0,150.0,356.0,114.0,2.5568,77100.0,False,True,False,False,False
20637,-121.22,39.43,17.0,2254.0,485.0,1007.0,433.0,1.7000,92300.0,False,True,False,False,False
20638,-121.32,39.43,18.0,1860.0,409.0,741.0,349.0,1.8672,84700.0,False,True,False,False,False


缺失total_bedrooms的样本只有207个，数量比较少，所以我选择直接删除。

In [47]:
data_encoded = data_encoded.dropna(subset=['total_bedrooms'])
data_encoded.shape

(20433, 14)

In [54]:
data_encoded.info()

<class 'pandas.core.frame.DataFrame'>
Index: 20433 entries, 0 to 20639
Data columns (total 14 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   longitude                   20433 non-null  float64
 1   latitude                    20433 non-null  float64
 2   housing_median_age          20433 non-null  float64
 3   total_rooms                 20433 non-null  float64
 4   total_bedrooms              20433 non-null  float64
 5   population                  20433 non-null  float64
 6   households                  20433 non-null  float64
 7   median_income               20433 non-null  float64
 8   median_house_value          20433 non-null  float64
 9   ocean_proximity_<1H OCEAN   20433 non-null  bool   
 10  ocean_proximity_INLAND      20433 non-null  bool   
 11  ocean_proximity_ISLAND      20433 non-null  bool   
 12  ocean_proximity_NEAR BAY    20433 non-null  bool   
 13  ocean_proximity_NEAR OCEAN  20433 no

In [None]:
X = data_encoded.drop(columns='median_house_value').to_numpy().astype(np.float64)
y = data_encoded['median_house_value'].to_numpy().astype(np.float64)
y = y.reshape(-1, 1)
X_mean = np.mean(X, axis=0)  
X_std = np.std(X, axis=0)   
X_norm = (X - X_mean) / X_std
# 划分数据集
X_train,X_test,y_train,y_test = train_test_split(X_norm,y,test_size=0.3,random_state=1)

最开始训练的损失函数和评估的loss函数是一样的，都是MSE损失函数。
$$\text{MSE}(y, \hat{y}) = \frac{1}{n} \sum_{i=1}^{n} (y_i - \hat{y}_i)^2$$

评估模型效果时发现，MSE函数的评估结果是非常大的。但是实际上，预测结果的误差是非常小的。  
这时我发觉对于房屋价格来说，我们更关心的是预测结果的相对误差，而不是绝对误差。于是，我将评估函数修改为RMSE函数。
$$\text{RMSE}(y, \hat{y}) = \sqrt{\frac{1}{n} \sum_{i=1}^{n} \left( \log(\max(\hat{y}_i, 1)) - \log(y_i) \right)^2 }$$


In [None]:
def linear(x, w, b):
    return x @ w + b
def loss(y,y_hat):
    return (y_hat-y.reshape(y_hat.shape))**2/2
def loss_rmse(y,y_hat):
    y_hat = np.clip(y_hat,1,float('inf'))
    y = np.log(y)
    y_hat = np.log(y_hat)
    rmse = np.sqrt(loss(y,y_hat))
    return rmse
def gradient_descent(x, y_true, y_pred, lr):
    global w, b
    m = len(x)
    w_gradient = x.T @ (y_pred.reshape(-1,1) - y_true) / m
    b_gradient = np.sum((y_pred - y_true)) / m
    w -= lr * w_gradient
    b -= lr * b_gradient
def data_iter(batch_size,x,y):
    num = len(x)
    index = np.arange(num)
    np.random.shuffle(index)
    for i in range(0,num,batch_size):
        j = np.array(index[i:min(i+batch_size,num)])
        yield x[j],y[j]
def train(X,y,lr,epochs,batch_size):
    for epoch in range(epochs):
        for X_train,y_train in data_iter(batch_size,X,y):
            y_pred = linear(X_train,w,b)
            train_loss = loss_rmse(y_train,y_pred)
            gradient_descent(X_train,y_train,y_pred,lr)
        print(f'epoch {epoch + 1}, loss {float(train_loss.mean()):f}')

初始化参数

In [50]:
w = np.ones((X.shape[1],1))
b = 0
lr = 0.1
epochs = 1000
batch_size = 1024

训练模型

In [51]:
train(X_train,y_train,lr,epochs,batch_size)

epoch 1, loss 0.275057
epoch 2, loss 0.209466
epoch 3, loss 0.191361
epoch 4, loss 0.198966
epoch 5, loss 0.195352
epoch 6, loss 0.199757
epoch 7, loss 0.187300
epoch 8, loss 0.184548
epoch 9, loss 0.196144
epoch 10, loss 0.192675
epoch 11, loss 0.192550
epoch 12, loss 0.199309
epoch 13, loss 0.199179
epoch 14, loss 0.195025
epoch 15, loss 0.195215
epoch 16, loss 0.210212
epoch 17, loss 0.191262
epoch 18, loss 0.198712
epoch 19, loss 0.190011
epoch 20, loss 0.182241
epoch 21, loss 0.192053
epoch 22, loss 0.182418
epoch 23, loss 0.195583
epoch 24, loss 0.208423
epoch 25, loss 0.185807
epoch 26, loss 0.193655
epoch 27, loss 0.191856
epoch 28, loss 0.201894
epoch 29, loss 0.196200
epoch 30, loss 0.193966
epoch 31, loss 0.206925
epoch 32, loss 0.185589
epoch 33, loss 0.183081
epoch 34, loss 0.200343
epoch 35, loss 0.198714
epoch 36, loss 0.211567
epoch 37, loss 0.206156
epoch 38, loss 0.190350
epoch 39, loss 0.215952
epoch 40, loss 0.191861
epoch 41, loss 0.204245
epoch 42, loss 0.205416
e

评估模型

In [53]:
def r_squared(y_true, y_pred):
    ss_res = np.sum((y_pred - y_true) ** 2)
    ss_tot = np.sum((y_true - np.mean(y_true)) ** 2)
    r2 = 1 - (ss_res / ss_tot)
    return r2

y_pred_test = linear(X_test, w, b)

# 计算测试集上的 R^2
r2_score = r_squared(y_test, y_pred_test)
print(f'R^2: {r2_score}')

R^2: 0.658811212850126


In [57]:
correlation_matrix = data_encoded.corr()
print("相关系数矩阵：")
# 提取房价中位数与其他特征的相关系数
median_house_value_correlation = correlation_matrix['median_house_value']
print(median_house_value_correlation)

相关系数矩阵：
longitude                    -0.045398
latitude                     -0.144638
housing_median_age            0.106432
total_rooms                   0.133294
total_bedrooms                0.049686
population                   -0.025300
households                    0.064894
median_income                 0.688355
median_house_value            1.000000
ocean_proximity_<1H OCEAN     0.257614
ocean_proximity_INLAND       -0.484787
ocean_proximity_ISLAND        0.023525
ocean_proximity_NEAR BAY      0.160526
ocean_proximity_NEAR OCEAN    0.140378
Name: median_house_value, dtype: float64
