In [153]:
import hashlib
import os
import tarfile
import zipfile
import requests

#@save
# Registry mapping a dataset name to its (url, sha1_hash) pair; later cells fill it in.
DATA_HUB = {}
# Base URL that every dataset file name is appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

In [154]:
def download(name, cache_dir = os.path.join('.', 'data/amazonaws_house_price_data')):
    """Download the file registered under `name` in DATA_HUB and return its local path.

    A file already present in `cache_dir` is reused when its SHA-1 digest matches
    the digest stored in DATA_HUB; otherwise it is downloaded (again).

    Args:
        name: Key into DATA_HUB; the value is a (url, sha1_hash) pair.
        cache_dir: Directory that holds the downloaded files. Created if missing.

    Returns:
        Path of the local file inside `cache_dir`.

    Raises:
        AssertionError: If `name` is not registered in DATA_HUB.
        requests.HTTPError: If the server answers with an error status code.
    """
    assert name in DATA_HUB,f"{name} 不存在与{DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]  # unpack the registered (url, hash) pair
    # On a fresh checkout the cache directory does not exist yet; without this
    # the open(fname, 'wb') below raises FileNotFoundError.
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            # Hash in 1 MiB chunks so large files are never fully loaded into memory.
            while True:
                data = f.read(1048576)
                if not data:
                    break
                sha1.update(data)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail loudly on 4xx/5xx instead of saving an error page as the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Actually stream the body to disk; r.content would buffer all of it.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname
        

In [155]:
#还需实现两个实用函数： 一个将下载并解压缩一个zip或tar文件， 另一个是将本书中使用的所有数据集从DATA_HUB下载到缓存目录中。

def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive registered in DATA_HUB and extract it.

    The archive is extracted into the directory that contains the downloaded file.

    Args:
        name: Key into DATA_HUB identifying the archive.
        folder: Optional sub-directory name (relative to the extraction
            directory) to return instead of the default data directory.

    Returns:
        `<extraction dir>/<folder>` when `folder` is given, otherwise the archive
        path with its last extension removed.

    Raises:
        AssertionError: If the file extension is not .zip, .tar or .gz.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        open_archive = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        open_archive = tarfile.open
    else:
        # Raised explicitly (same exception type as before) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # NOTE(review): extractall trusts member paths inside the archive; only use
    # this with archives from a trusted source.
    with open_archive(fname, 'r') as fp:  # closes the archive handle after extraction
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Fetch every dataset registered in DATA_HUB via download()."""
    for dataset_name in DATA_HUB.keys():
        download(dataset_name)

In [156]:
# 如果没有安装pandas，请取消下一行的注释
# !pip install pandas

%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

In [157]:
# Register the Kaggle house-price training and test CSVs as (url, sha1_hash) pairs.
DATA_HUB.update({  #@save
    'kaggle_house_train': (
        DATA_URL + 'kaggle_house_pred_train.csv',
        '585e9cc93e70b39160e7921475f9bcd7d31219ce',
    ),
    'kaggle_house_test': (
        DATA_URL + 'kaggle_house_pred_test.csv',
        'fa19780a7b011d9b009e8bff8e99922a8ee2eb90',
    ),
})

In [158]:
# Download (or reuse the cached copy of) each CSV and load it as a DataFrame.
train_data, test_data = (
    pd.read_csv(download(dataset_name))
    for dataset_name in ('kaggle_house_train', 'kaggle_house_test')
)

In [159]:
# Show (rows, columns) of the training frame and of the test frame, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(1460, 81)
(1459, 80)


In [160]:
# Preview the first four rows, restricted to the first four and last three columns.
print(train_data.iloc[:4, [0, 1, 2, 3, -3, -2, -1]])


   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000


In [161]:
# Drop the leading Id column from both frames and the trailing target column
# (SalePrice) from the training frame, then stack the training rows on top of
# the test rows to form one feature table.
all_features = pd.concat([
    train_data.iloc[:, 1:-1],
    test_data.iloc[:, 1:],
])

In [162]:
# Shape of the combined feature table: (training rows + test rows, feature columns).
all_features.shape

(2919, 79)

数据预处理

In [163]:
# Standardize with statistics of the combined train+test features; if the test
# data were not available, the mean and standard deviation could be computed
# from the training data alone.
# select_dtypes(include='number') picks columns by numeric kind rather than by
# "not the object dtype", so text columns stored with a dedicated string dtype
# are not mistaken for numeric ones.
numeric_features = all_features.select_dtypes(include='number').columns
# Rescale every numeric column to zero mean and unit standard deviation.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / x.std())
# After standardization every column mean is 0, so filling missing values with 0
# is the same as imputing the column mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [173]:
# One-hot encode the non-numeric columns; dummy_na=True also adds a "<column>_nan"
# indicator column for missing values.
all_features = pd.get_dummies(all_features, dummy_na=True)
# Every column that is not one of the numeric features is a freshly created
# indicator column; cast those to integers.
obj_lis = [column for column in all_features.columns if column not in numeric_features]
all_features[obj_lis] = all_features[obj_lis].astype(int)

Unnamed: 0,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,...,SaleType_Oth,SaleType_WD,SaleType_nan,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleCondition_nan
0,0.067320,-0.184443,-0.217841,0.646073,-0.507197,1.046078,0.896679,0.523038,0.580708,-0.29303,...,0,1,0,0,0,0,0,1,0,0
1,-0.873466,0.458096,-0.072032,-0.063174,2.187904,0.154737,-0.395536,-0.569893,1.177709,-0.29303,...,0,1,0,0,0,0,0,1,0,0
2,0.067320,-0.055935,0.137173,0.646073,-0.507197,0.980053,0.848819,0.333448,0.097840,-0.29303,...,0,1,0,0,0,0,0,1,0,0
3,0.302516,-0.398622,-0.078371,0.646073,-0.507197,-1.859033,-0.682695,-0.569893,-0.494771,-0.29303,...,0,1,0,1,0,0,0,0,0,0
4,0.067320,0.629439,0.518814,1.355319,-0.507197,0.947040,0.753100,1.381770,0.468770,-0.29303,...,0,1,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2.419286,-2.069222,-1.043758,-1.481667,1.289537,-0.043338,-0.682695,-0.569893,-0.968860,-0.29303,...,0,1,0,0,0,0,0,1,0,0
1455,2.419286,-2.069222,-1.049083,-1.481667,-0.507197,-0.043338,-0.682695,-0.569893,-0.415757,-0.29303,...,0,1,0,1,0,0,0,0,0,0
1456,-0.873466,3.884968,1.246594,-0.772420,1.289537,-0.373465,0.561660,-0.569893,1.717643,-0.29303,...,0,1,0,1,0,0,0,0,0,0
1457,0.655311,-0.312950,0.034599,-0.772420,-0.507197,0.682939,0.370221,-0.569893,-0.229194,-0.29303,...,0,1,0,0,0,0,0,1,0,0


In [179]:
# .values exposes the DataFrame as a NumPy array, which is then converted to
# float32 tensors for training.
n_train = train_data.shape[0]
# The first n_train rows of all_features came from the training CSV ...
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
# ... and the remaining rows came from the test CSV. (Slicing [:n_train] here
# would silently reuse the training rows as the test set.)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)

# Labels as a column vector of shape (n_train, 1).
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1,1), dtype=torch.float32)

In [182]:
# Training setup: number of input features (columns of train_features) and the
# mean-squared-error criterion.
in_feature = train_features.size(1)
loss = nn.MSELoss()

def get_net():
    """Build a new linear model mapping `in_feature` inputs to a single output."""
    return nn.Sequential(nn.Linear(in_feature, 1))

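### 房价就像股票价格一样，我们关心的是相对数量，而不是绝对数量。因此，我们更关心相对误差 

衡量相对误差的一种方法是比较价格对数之间的差异：若 $|\log y - \log \hat{y}| \le \delta$，则 $e^{-\delta} \le \frac{\hat{y}}{y} \le e^{\delta}$，即预测值与真实值的比值被限制在固定范围内。因此下面使用对数均方根误差作为评价指标。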

- 对数变换可以压缩数据范围：较大的值经过对数变换后增长较慢，而较小的值增长较快。这有助于减小数据中极端值的影响，因为它们在对数尺度上相对较小。

- 对数变换可以线性化关系：某些关系在对数尺度上可以更接近线性。这使得一些数据在对数尺度上更容易建模和分析。

- 对数变换可以减小离群值的影响：偏态分布通常包括离群值，这些值可能对分析和建模产生负面影响。对数变换可以使离群值更接近其他数据点，减小它们的影响。

In [None]:
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable mapping `features` to predictions.
        features: Input tensor passed to `net`.
        labels: Tensor of positive target values. Assumed to have the same shape
            as `net(features)`; a mismatch would broadcast instead of failing.

    Returns:
        The log-RMSE as a Python float.
    """
    # To keep the logarithm stable, every prediction below 1 is set to 1.
    clipped_preds = torch.clamp(net(features), min=1.0)
    log_diff = torch.log(clipped_preds) - torch.log(labels)
    rmse = torch.sqrt(torch.mean(log_diff ** 2))
    return rmse.item()
    
    