In [1]:
import pandas as pd
import numpy as np

import torch

In [2]:
train_data = pd.read_csv("../house-prices-advanced-regression-techniques/train.csv")
test_data = pd.read_csv("../house-prices-advanced-regression-techniques/test.csv")

In [3]:
print(train_data.iloc[0:4, [0, 1, 2, 3, -3, -2, -1]])
print(train_data.shape)
print(test_data.shape)

   Id  MSSubClass MSZoning  LotFrontage SaleType SaleCondition  SalePrice
0   1          60       RL         65.0       WD        Normal     208500
1   2          20       RL         80.0       WD        Normal     181500
2   3          60       RL         68.0       WD        Normal     223500
3   4          70       RL         60.0       WD       Abnorml     140000
(1460, 81)
(1459, 80)


In [4]:
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
all_features.shape

(2919, 79)

In [5]:
numerical_features = all_features.dtypes[all_features.dtypes != 'object'].index
all_features[numerical_features] = all_features[numerical_features].apply(lambda x: (x - x.mean())/(x.std()))
all_features[numerical_features] = all_features[numerical_features].fillna(0)

In [6]:
all_features = pd.get_dummies(all_features, dummy_na=True)
all_features = all_features.astype(np.float32)
all_features.shape

(2919, 330)

In [7]:
from torch.utils import data

def data_iter(data_array, batch_size, is_train):
    dataset = data.TensorDataset(*data_array)
    dataloader = data.DataLoader(dataset, batch_size=batch_size, shuffle=is_train)
    return dataloader

In [8]:
train_data_num = train_data.shape[0]

train_features = torch.tensor(all_features.iloc[:train_data_num, :].values, dtype=torch.float32)
test_features = torch.tensor(all_features.iloc[train_data_num:, :].values, dtype=torch.float32)
train_label = torch.tensor(train_data.iloc[:, -1].values, dtype=torch.float32)

batch_size = 64

dataloader = data_iter((train_features, train_label.reshape(-1, 1)), batch_size=batch_size, is_train=True)

In [14]:
import torch.nn as nn

input_feature = train_features.shape[1]

def xavier_init_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

class MyModel(nn.Module):
    def __init__(self, input_feature):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_feature, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, x):
        return self.net(x)


model = MyModel(input_feature)
model.apply(xavier_init_weights)

optimer = torch.optim.Adam(model.parameters(), lr=0.01)


In [15]:
epoch_num = 500
loss = nn.MSELoss()

from tqdm import tqdm

log = {"train_loss":[]}

for epoch in tqdm(range(epoch_num)):
    total_loss = 0
    for X, y in dataloader:
        y_hat = model(X)
        l = loss(y_hat, y.reshape(-1, 1))
        optimer.zero_grad()
        l.backward()
        optimer.step()
        total_loss += l.item()
    
    total_loss /= train_features.shape[0]
    log["train_loss"].append(total_loss)
    if epoch%10 == 0:
        print(total_loss)



  4%|▎         | 18/500 [00:00<00:05, 83.66it/s]

614759843.4191781
513584432.3945205


  7%|▋         | 37/500 [00:00<00:05, 87.13it/s]

281868010.9589041
102295188.33972603


 11%|█         | 55/500 [00:00<00:05, 85.55it/s]

39273773.89589041
26930389.30410959


 15%|█▍        | 73/500 [00:00<00:04, 87.09it/s]

23696249.468493152
21998808.89863014


 18%|█▊        | 91/500 [00:01<00:05, 81.70it/s]

20762737.293150686
19722160.679452054


 24%|██▍       | 119/500 [00:01<00:04, 86.54it/s]

18849690.915068492
18081037.852054793


 26%|██▌       | 128/500 [00:01<00:04, 86.30it/s]

17501200.635616437
16908312.89863014


 31%|███       | 155/500 [00:01<00:04, 83.49it/s]

17001476.339726027
15965970.147945205


 35%|███▌      | 175/500 [00:02<00:03, 87.92it/s]

15514937.161643835
15188515.06849315


 39%|███▉      | 195/500 [00:02<00:03, 90.18it/s]

14828018.991780821
14527207.473972602


 43%|████▎     | 215/500 [00:02<00:03, 90.86it/s]

14241677.78630137
14063351.06849315


 47%|████▋     | 235/500 [00:02<00:02, 89.38it/s]

13727840.460273972
13488380.767123288


 51%|█████     | 253/500 [00:02<00:02, 88.97it/s]

13260563.112328768
13135040.263013698


 54%|█████▍    | 271/500 [00:03<00:02, 89.21it/s]

12961176.964383561
12739380.252054794


 58%|█████▊    | 291/500 [00:03<00:02, 90.80it/s]

12558612.208219178
12532338.487671234


 62%|██████▏   | 311/500 [00:03<00:02, 89.35it/s]

12565034.115068493
12181537.852054795


 66%|██████▌   | 330/500 [00:03<00:02, 79.83it/s]

12055698.367123287
11970292.591780823


 72%|███████▏  | 359/500 [00:04<00:01, 87.27it/s]

11855114.750684932
11723761.78630137


 75%|███████▌  | 377/500 [00:04<00:01, 87.74it/s]

11621985.939726027
11559375.682191782


 79%|███████▉  | 395/500 [00:04<00:01, 87.53it/s]

11442571.419178082
11379527.156164384


 83%|████████▎ | 414/500 [00:04<00:00, 88.62it/s]

11327502.619178083
11717975.14520548


 87%|████████▋ | 433/500 [00:04<00:00, 89.28it/s]

11153312.942465754
11140841.150684932


 91%|█████████ | 453/500 [00:05<00:00, 90.91it/s]

11034983.287671233
10962826.334246576


 95%|█████████▍| 473/500 [00:05<00:00, 88.83it/s]

11171221.293150686
10857353.49041096


 98%|█████████▊| 491/500 [00:05<00:00, 88.39it/s]

10859466.695890412
10759745.183561644


100%|██████████| 500/500 [00:05<00:00, 87.10it/s]


In [16]:
loss = nn.MSELoss()
in_features = train_features.shape[1]

def get_net():
    net = nn.Sequential(nn.Linear(in_features,1))
    return net

In [17]:
def log_rmse(net, features, labels):
    # 为了在取对数时进一步稳定该值，将小于1的值设置为1
    clipped_preds = torch.clamp(net(features), 1, float('inf'))
    rmse = torch.sqrt(loss(torch.log(clipped_preds),
                           torch.log(labels)))
    return rmse.item()

In [26]:
def train(net, train_features, train_labels, test_features, test_labels,
          num_epochs, learning_rate, weight_decay, batch_size):
    train_ls, test_ls = [], []
    train_iter = data_iter((train_features, train_labels), batch_size, True)
    # 这里使用的是Adam优化算法
    optimizer = torch.optim.Adam(net.parameters(),
                                 lr = learning_rate,
                                 weight_decay = weight_decay)
    for epoch in range(num_epochs):
        total_loss = 0
        for X, y in train_iter:
            optimizer.zero_grad()
            l = loss(net(X), y)
            l.backward()
            optimizer.step()
            total_loss += l.item()
        train_ls.append(log_rmse(net, train_features, train_labels))
        print(total_loss)
        if test_labels is not None:
            test_ls.append(log_rmse(net, test_features, test_labels))
    return train_ls, test_ls

In [23]:
def get_k_fold_data(k, i, X, y):
    assert k > 1
    fold_size = X.shape[0] // k
    X_train, y_train = None, None
    for j in range(k):
        idx = slice(j * fold_size, (j + 1) * fold_size)
        X_part, y_part = X[idx, :], y[idx]
        if j == i:
            X_valid, y_valid = X_part, y_part
        elif X_train is None:
            X_train, y_train = X_part, y_part
        else:
            X_train = torch.cat([X_train, X_part], 0)
            y_train = torch.cat([y_train, y_part], 0)
    return X_train, y_train, X_valid, y_valid

In [27]:
def k_fold(k, X_train, y_train, num_epochs, learning_rate, weight_decay,
           batch_size):
    train_l_sum, valid_l_sum = 0, 0
    for i in range(k):
        data = get_k_fold_data(k, i, X_train, y_train)
        net = get_net()
        train_ls, valid_ls = train(net, *data, num_epochs, learning_rate,
                                   weight_decay, batch_size)
        train_l_sum += train_ls[-1]
        valid_l_sum += valid_ls[-1]
        print(f'折{i + 1}，训练log rmse{float(train_ls[-1]):f}, '
              f'验证log rmse{float(valid_ls[-1]):f}')
    return train_l_sum / k, valid_l_sum / k

In [28]:
k, num_epochs, lr, weight_decay, batch_size = 5, 100, 5, 0, 64
train_l, valid_l = k_fold(k, train_features, train_label, num_epochs, lr,
                          weight_decay, batch_size)
print(f'{k}-折验证: 平均训练log rmse: {float(train_l):f}, '
      f'平均验证log rmse: {float(valid_l):f}')

  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


736747591680.0
702423980032.0
683098701824.0
644280659968.0
633627729920.0
605345622016.0
593776306176.0
553800847360.0
529852409856.0
506667507712.0
497439578112.0
479125628928.0
459017316352.0
442792247296.0
422855540736.0
413948108800.0
394569330688.0
372553231360.0
363968534528.0
358403741696.0
338520762368.0
325576989696.0
325537537024.0
304816826880.0
293318575104.0
289311612928.0
270889105920.0
262550835200.0
253933498368.0
243052049920.0
235914853376.0
226863068672.0
225653523456.0
213580354048.0
206204158208.0
203655651840.0
209634328064.0
191427623936.0
184975895296.0
185184509952.0
175490392832.0
177194488832.0
167488952576.0
169233084416.0
160406641792.0
157948566016.0
154282614784.0
153389716480.0
155087692288.0
149814015488.0
146357076992.0
148239803136.0
150376041984.0
144046568960.0
168666627072.0
141628624384.0
137597031936.0
146048007424.0
152802543104.0
134941509888.0
130461767168.0
128631552128.0
129331160832.0
130009022464.0
132768347648.0
128991219200.0
1259174097