In [25]:
import torch
import d2l.torch as d2l
import pandas as pd
import numpy as np

In [26]:
path = './kaggle_house_data/house-prices-advanced-regression-techniques/train.csv'
all_features = pd.read_csv(path)
label = all_features['SalePrice']
all_features = all_features.iloc[:,1:-1]

# 若无法获得测试数据，则可根据训练数据计算均值和标准差
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# 在标准化数据之后，所有均值消失，因此我们可以将缺失值设置为0
all_features[numeric_features] = all_features[numeric_features].fillna(0)
all_features = pd.get_dummies(all_features, dummy_na=True)
# print(all_features.columns.values)

all_features = torch.tensor(all_features.to_numpy(dtype=np.float32), dtype=torch.float32)
label = torch.tensor(label.values, dtype=torch.float32).reshape(-1, 1)

all_features, label = all_features.cuda(), label.cuda()
all_features.shape

torch.Size([1460, 331])

In [27]:
from torch import nn
from torch.utils import data
class Net(nn.Module):
    def __init__(self, num_input, num_output) -> None:
        super().__init__()
        self.layer1 = nn.Linear(num_input, num_output)
    
    def forward(self, x):
        o = self.layer1(x)

        return o

In [28]:
lr = 0.01
batch_size = 24

train_set = data.TensorDataset(all_features, label)
train_iter = data.DataLoader(train_set, batch_size, shuffle=True, num_workers=4)
net = Net(331, 1).cuda()
loss = nn.MSELoss(reduction='none')
updater = torch.optim.Adam(net.parameters(), lr=lr)

def init_weights(m):
        if type == nn.Linear:
            nn.init.normal_(m.weight, std=0.01)
net.apply(init_weights)

Net(
  (layer1): Linear(in_features=331, out_features=1, bias=True)
)

In [29]:
def log_rmse(net, features, labels):
    # 为了在取对数时进一步稳定该值，将小于1的值设置为1
    clipped_preds = torch.clip(net(features), 1, float('inf'))
    return torch.sqrt(2 * loss(torch.log(clipped_preds), torch.log(labels)).mean())

In [30]:
epoch = 50
for i in range(epoch):
    for x,y in train_iter:
        updater.zero_grad()
        l = loss(net(x), y)
        l.sum().backward()
        updater.step()
    print(log_rmse(net, all_features, label))


tensor(17.0140, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(16.8088, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(16.2238, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(15.8168, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(15.5040, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(15.2417, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(15.0219, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(14.8338, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(14.6657, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(14.5178, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(14.3814, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(14.2572, device='cuda:0', grad_fn=<SqrtBackward0>)
tensor(14.1449, device='cuda:0', grad_fn=<SqrtBackward0>)


KeyboardInterrupt: 

In [16]:
net = torch.load('./net.pth')

In [23]:
for i in range(10):
    print(net(all_features[i,:]), label[i])

tensor([222393.3438], device='cuda:0', grad_fn=<AddBackward0>) tensor([208500.], device='cuda:0')
tensor([184124.0312], device='cuda:0', grad_fn=<AddBackward0>) tensor([181500.], device='cuda:0')
tensor([231334.9062], device='cuda:0', grad_fn=<AddBackward0>) tensor([223500.], device='cuda:0')
tensor([176943.7500], device='cuda:0', grad_fn=<AddBackward0>) tensor([140000.], device='cuda:0')
tensor([287953.5312], device='cuda:0', grad_fn=<AddBackward0>) tensor([250000.], device='cuda:0')
tensor([172592.], device='cuda:0', grad_fn=<AddBackward0>) tensor([143000.], device='cuda:0')
tensor([266915.], device='cuda:0', grad_fn=<AddBackward0>) tensor([307000.], device='cuda:0')
tensor([228467.2812], device='cuda:0', grad_fn=<AddBackward0>) tensor([200000.], device='cuda:0')
tensor([133980.2500], device='cuda:0', grad_fn=<AddBackward0>) tensor([129900.], device='cuda:0')
tensor([92646.1172], device='cuda:0', grad_fn=<AddBackward0>) tensor([118000.], device='cuda:0')
