In [10]:
import hashlib
import os
import tarfile
import zipfile
import requests

# Registry of downloadable datasets: name -> (url, sha1_hash); filled in by later cells.
#@save
DATA_HUB = {}
# Common URL prefix that the dataset file names are appended to.
DATA_URL = 'http://d2l-data.s3-accelerate.amazonaws.com/'

In [11]:
def download(name, cache_dir=os.path.join('..', 'data')):  #@save
    """Download a file registered in DATA_HUB and return its local path.

    If a file with the expected name already exists in ``cache_dir`` and its
    SHA-1 digest matches the registered hash, the download is skipped.

    Args:
        name: Key into DATA_HUB; the entry is a ``(url, sha1_hash)`` pair.
        cache_dir: Directory for the downloaded file; created if missing.

    Returns:
        Local file path: ``cache_dir`` joined with the last segment of the URL.

    Raises:
        AssertionError: If ``name`` is not a key of DATA_HUB.
        requests.HTTPError: If the server responds with an error status.
    """
    assert name in DATA_HUB, f"{name} 不存在于 {DATA_HUB}"
    url, sha1_hash = DATA_HUB[name]
    os.makedirs(cache_dir, exist_ok=True)
    fname = os.path.join(cache_dir, url.split('/')[-1])
    if os.path.exists(fname):
        # Hash the cached file in 1 MiB blocks so large files are not
        # loaded into memory at once.
        sha1 = hashlib.sha1()
        with open(fname, 'rb') as f:
            for block in iter(lambda: f.read(1048576), b''):
                sha1.update(block)
        if sha1.hexdigest() == sha1_hash:
            return fname  # cache hit
    print(f'正在从{url}下载{fname}...')
    with requests.get(url, stream=True, verify=True) as r:
        # Fail before touching the local file so an HTTP error page is never
        # saved as if it were the dataset.
        r.raise_for_status()
        with open(fname, 'wb') as f:
            # Actually stream the body instead of buffering it via r.content.
            for chunk in r.iter_content(chunk_size=1048576):
                f.write(chunk)
    return fname

In [12]:
def download_extract(name, folder=None):  #@save
    """Download a zip/tar archive registered in DATA_HUB and extract it.

    The archive is extracted into the directory that contains the downloaded
    file.

    Args:
        name: Key into DATA_HUB, passed to ``download``.
        folder: Optional sub-directory name to return instead of the default.

    Returns:
        ``base_dir/folder`` when ``folder`` is given; otherwise the archive
        path with its final extension removed (``os.path.splitext`` strips
        only the last suffix, so ``x.tar.gz`` yields ``x.tar``).

    Raises:
        AssertionError: If the file extension is not ``.zip``, ``.tar`` or
            ``.gz``.
    """
    fname = download(name)
    base_dir = os.path.dirname(fname)
    data_dir, ext = os.path.splitext(fname)
    if ext == '.zip':
        opener = zipfile.ZipFile
    elif ext in ('.tar', '.gz'):
        opener = tarfile.open
    else:
        # Explicit raise (same exception type as before) so the check is not
        # stripped when Python runs with -O.
        raise AssertionError('只有zip/tar文件可以被解压缩')
    # NOTE(review): extractall trusts the member paths stored in the archive;
    # only use this with archives from a trusted source.
    with opener(fname, 'r') as fp:  # close the archive handle when done
        fp.extractall(base_dir)
    return os.path.join(base_dir, folder) if folder else data_dir

def download_all():  #@save
    """Fetch every dataset currently registered in DATA_HUB."""
    for dataset_name in DATA_HUB.keys():
        download(dataset_name)

In [13]:
# 如果你没有安装pandas，请取消下一行的注释
# !pip install pandas

%matplotlib inline
import numpy as np
import pandas as pd
import torch
from torch import nn
from d2l import torch as d2l

In [14]:
# Register the house-price training CSV as a (url, sha1_hash) pair; `download`
# compares the hash against the SHA-1 hex digest of any cached copy.
DATA_HUB['kaggle_house_train'] = (  #@save
    DATA_URL + 'kaggle_house_pred_train.csv',
    '585e9cc93e70b39160e7921475f9bcd7d31219ce')

# Same registration for the test CSV.
DATA_HUB['kaggle_house_test'] = (  #@save
    DATA_URL + 'kaggle_house_pred_test.csv',
    'fa19780a7b011d9b009e8bff8e99922a8ee2eb90')

In [15]:
# Download (or reuse the cached copy of) both CSV files and load them as DataFrames.
train_data, test_data = (
    pd.read_csv(download(dataset_key))
    for dataset_key in ('kaggle_house_train', 'kaggle_house_test'))

In [16]:
# Show (rows, columns) for the training and test frames, one per line.
print(train_data.shape, test_data.shape, sep='\n')

(1460, 81)
(1459, 80)


In [25]:
# Preview the first six rows: columns 1-6 plus the last two columns.
print(train_data.iloc[:6, [*range(1, 7), -2, -1]])

   MSSubClass MSZoning  LotFrontage  LotArea Street Alley SaleCondition  \
0          60       RL         65.0     8450   Pave   NaN        Normal   
1          20       RL         80.0     9600   Pave   NaN        Normal   
2          60       RL         68.0    11250   Pave   NaN        Normal   
3          70       RL         60.0     9550   Pave   NaN       Abnorml   
4          60       RL         84.0    14260   Pave   NaN        Normal   
5          50       RL         85.0    14115   Pave   NaN        Normal   

   SalePrice  
0     208500  
1     181500  
2     223500  
3     140000  
4     250000  
5     143000  


In [18]:
# Stack train and test feature columns row-wise. Column 0 is dropped from both
# frames, and the last training column is dropped as well (the test frame has
# one column fewer).
all_features = pd.concat(
    [train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]], axis=0)

In [19]:
# Select the names of the columns whose dtype is not `object`; these are
# treated as the numeric features.
numeric_features = all_features.dtypes.loc[
    lambda column_dtypes: column_dtypes != 'object'].index
numeric_features

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [20]:
# Standardize every numeric column: subtract its mean, then divide by its
# standard deviation.
all_features[numeric_features] = all_features[numeric_features].pipe(
    lambda block: (block - block.mean()) / block.std())

In [22]:
# Inspect the first 30 rows after standardization (missing values are still NaN here).
all_features.iloc[:30,:]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.06732,RL,-0.184443,-0.217841,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,0.157619,WD,Normal
1,-0.873466,RL,0.458096,-0.072032,Pave,,Reg,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,-0.446848,-0.602858,WD,Normal
2,0.06732,RL,-0.055935,0.137173,Pave,,IR1,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,1.026577,0.157619,WD,Normal
3,0.302516,RL,-0.398622,-0.078371,Pave,,IR1,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,-1.363335,WD,Abnorml
4,0.06732,RL,0.629439,0.518814,Pave,,IR1,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,2.131647,0.157619,WD,Normal
5,-0.167877,RL,0.672275,0.50043,Pave,,IR1,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,MnPrv,Shed,1.144116,1.394934,0.918095,WD,Normal
6,-0.873466,RL,0.243916,-0.010665,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,0.658221,-0.602858,WD,Normal
7,0.06732,RL,,0.027119,Pave,,IR1,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,Shed,0.52727,1.76329,0.918095,WD,Normal
8,-0.167877,RM,-0.784145,-0.513264,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-0.815205,0.157619,WD,Abnorml
9,3.124875,RL,-0.826981,-0.348436,Pave,,Reg,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,,-0.089577,-1.920274,0.157619,WD,Normal


In [23]:
# Replace missing numeric values with 0. The columns were standardized above,
# so 0 corresponds to each column's mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)

In [24]:
# Inspect the first 30 rows again, now that missing numeric values are filled with 0.
all_features.iloc[:30,:]

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,0.06732,RL,-0.184443,-0.217841,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,0.157619,WD,Normal
1,-0.873466,RL,0.458096,-0.072032,Pave,,Reg,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,-0.446848,-0.602858,WD,Normal
2,0.06732,RL,-0.055935,0.137173,Pave,,IR1,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,1.026577,0.157619,WD,Normal
3,0.302516,RL,-0.398622,-0.078371,Pave,,IR1,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,,-0.089577,-1.551918,-1.363335,WD,Abnorml
4,0.06732,RL,0.629439,0.518814,Pave,,IR1,Lvl,AllPub,FR2,...,-0.285886,-0.063139,,,,-0.089577,2.131647,0.157619,WD,Normal
5,-0.167877,RL,0.672275,0.50043,Pave,,IR1,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,MnPrv,Shed,1.144116,1.394934,0.918095,WD,Normal
6,-0.873466,RL,0.243916,-0.010665,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,0.658221,-0.602858,WD,Normal
7,0.06732,RL,0.0,0.027119,Pave,,IR1,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,Shed,0.52727,1.76329,0.918095,WD,Normal
8,-0.167877,RM,-0.784145,-0.513264,Pave,,Reg,Lvl,AllPub,Inside,...,-0.285886,-0.063139,,,,-0.089577,-0.815205,0.157619,WD,Abnorml
9,3.124875,RL,-0.826981,-0.348436,Pave,,Reg,Lvl,AllPub,Corner,...,-0.285886,-0.063139,,,,-0.089577,-1.920274,0.157619,WD,Normal


In [26]:
# (rows, columns) of the combined feature frame before one-hot encoding.
all_features.shape

(2919, 79)

In [27]:
# One-hot encode the categorical columns; dummy_na=True adds a separate
# indicator column for missing values.
all_features = pd.get_dummies(all_features, dummy_na=True)
# The column count grows because each category becomes its own column.
all_features.shape

(2919, 331)

In [34]:
# Peek at columns 35-44 of the first two rows to see the generated indicator columns.
all_features.iloc[:2,35:45]

Unnamed: 0,YrSold,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,MSZoning_nan,Street_Grvl,Street_Pave,Street_nan
0,0.157619,0,0,0,1,0,0,0,1,0
1,-0.602858,0,0,0,1,0,0,0,1,0


In [13]:
# The first n_train rows of all_features come from the training set, the rest
# from the test set (see the concat that built all_features).
n_train = train_data.shape[0]
# Cast to float32 in NumPy first: when the frame mixes float and bool columns
# (the default get_dummies output on newer pandas), `.values` is an object
# array that torch.tensor cannot convert.
train_features = torch.tensor(
    all_features[:n_train].values.astype('float32'), dtype=torch.float32)
test_features = torch.tensor(
    all_features[n_train:].values.astype('float32'), dtype=torch.float32)
# Keep the original misspelled name as an alias so existing references still work.
test_featrues = test_features
# Labels as a column vector of shape (n_train, 1).
train_labels = torch.tensor(train_data.SalePrice.values.reshape(-1, 1),
                            dtype=torch.float32)

In [15]:
# Mean squared error criterion, shared by the training and evaluation helpers.
loss = nn.MSELoss()
# Input width of the model = number of feature columns.
in_features = train_features.size(1)

def get_net():
    """Return a fresh single-layer linear model mapping in_features inputs to 1 output."""
    return nn.Sequential(nn.Linear(in_features, 1))

In [16]:
# Measure relative error, (y - y_hat) / y, by comparing logarithms of the
# predictions and the labels.
def log_rmse(net, features, labels):
    """Return the root-mean-squared error between log-predictions and log-labels.

    Args:
        net: Callable model; ``net(features)`` yields the predictions.
        features: Input tensor passed to ``net``.
        labels: Target tensor; its logarithm is taken, so values must be positive.

    Returns:
        The error as a Python float.
    """
    # Clamp predictions to at least 1 so the logarithm is defined and non-negative.
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds), torch.log(labels)))
    return rmse.item()