# 实战Kaggle比赛：预测房价

In [38]:
import torch
from torch import nn, optim
from torch.utils.data import Subset, TensorDataset, DataLoader
import pytorch_lightning as pl

In [3]:
def add_to_class(Class):
    """Return a decorator that registers a function as an attribute of ``Class``.

    Args:
        Class: The class that receives the decorated object, stored under the
            object's own ``__name__``.

    Returns:
        A decorator that attaches its argument to ``Class`` and returns that
        same argument unchanged.
    """
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
        # Hand the object back so the decorated module-level name stays bound
        # to it instead of being rebound to None.
        return obj
    return wrapper

In [4]:
import numpy as np
import pandas as pd

In [8]:
# Load the Kaggle house-price training and test CSV files from ../data.
train_data = pd.read_csv("../data/kaggle_house_pred_train.csv")
test_data = pd.read_csv("../data/kaggle_house_pred_test.csv")
# Final expression of the cell: displays both frame shapes.
train_data.shape, test_data.shape

((1460, 81), (1459, 80))

In [11]:
# Stack train and test features into one frame. The first column is dropped
# from both (presumably an Id column -- confirm against the CSV) and the last
# column is dropped from train (presumably the SalePrice label).
all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
# Columns whose dtype is not 'object' are treated as numeric.
numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
# Standardize each numeric column: x <- (x - mean) / std.
all_features[numeric_features] = all_features[numeric_features].apply(
    lambda x: (x - x.mean()) / (x.std()))
# Standardized columns have mean 0, so filling NaN with 0 imputes the mean.
all_features[numeric_features] = all_features[numeric_features].fillna(0)
# One-hot encode the remaining columns; dummy_na=True adds an indicator
# column for missing values.
all_features = pd.get_dummies(all_features, dummy_na=True, dtype=int)
# Final expression of the cell: displays the encoded feature shape.
all_features.shape

(2919, 330)

In [13]:
# The training rows were concatenated first, so the first n_train rows of
# all_features belong to the training set and the rest to the test set.
n_train = train_data.shape[0]
train_features = torch.tensor(all_features[:n_train].values, dtype=torch.float32)
test_features = torch.tensor(all_features[n_train:].values, dtype=torch.float32)
# Labels are reshaped into a single column, one row per training sample.
train_labels = torch.tensor(
    train_data.SalePrice.values.reshape(-1, 1), dtype=torch.float32)
# Final expression of the cell: displays the three tensor shapes.
train_features.shape, test_features.shape, train_labels.shape

(torch.Size([1460, 330]), torch.Size([1459, 330]), torch.Size([1460, 1]))

In [20]:
from sklearn.model_selection import KFold

In [32]:
# Split the training rows into 5 folds and take only the first split.
kf = KFold(5)
train_indices, val_indices = next(kf.split(train_features))
# Index-based views over the shared tensors for this split.
current_train_features = Subset(train_features, train_indices)
current_train_labels = Subset(train_labels, train_indices)
current_val_features = Subset(train_features, val_indices)
current_val_labels = Subset(train_labels, val_indices)
# Kept as the final expression so the notebook actually displays the sizes.
len(train_indices), len(val_indices)

(1168, 292)

In [17]:
import os.path

In [74]:
class KaggleDataModel(pl.LightningDataModule):
  """Lightning data module for the Kaggle house-price CSV files.

  The constructor only records configuration; no file is read here.
  """

  def __init__(self, batch_size=64, data_dir="../data",
               train_csv='kaggle_house_pred_train.csv',
               test_csv='kaggle_house_pred_test.csv',
               num_folds=5, kfold=True, num_workers=8):
    """Store the loader settings and resolve both CSV paths under ``data_dir``."""
    super().__init__()
    # DataLoader settings.
    self.batch_size = batch_size
    self.num_workers = num_workers
    # Cross-validation settings; the active fold index starts at 0.
    self.kfold = kfold
    self.num_folds = num_folds
    self.current_fold = 0
    # Locations of the two CSV files.
    self.train_data_path, self.test_data_path = (
        os.path.join(data_dir, csv_name) for csv_name in (train_csv, test_csv))

In [144]:
@add_to_class(KaggleDataModel)
def setup(self, stage=None):
  """Load both CSV files, preprocess the features and store the splits.

  Args:
    stage: ``"fit"`` prepares the training/validation data, ``"test"``
      prepares the test features, and ``None`` prepares both.

  For the fit stage this sets ``train_features``, ``train_labels``,
  ``val_features`` and ``val_labels`` (pandas objects; the validation pair
  is ``None`` when ``kfold`` is off). For the test stage it sets
  ``test_features`` (a DataFrame).

  Raises:
    IndexError: If ``current_fold`` is not a valid fold index for
      ``num_folds``.
  """
  # Read the raw data.
  train_data = pd.read_csv(self.train_data_path)
  test_data = pd.read_csv(self.test_data_path)
  # Standardize numeric columns, x <- (x - mean) / std, using the combined
  # train and test features.
  all_features = pd.concat((train_data.iloc[:, 1:-1], test_data.iloc[:, 1:]))
  numeric_features = all_features.dtypes[all_features.dtypes != 'object'].index
  all_features[numeric_features] = all_features[numeric_features].apply(
      lambda x: (x - x.mean()) / (x.std()))
  # After standardization the column mean is 0, so this imputes the mean.
  all_features[numeric_features] = all_features[numeric_features].fillna(0)
  # One-hot encode the discrete columns.
  all_features = pd.get_dummies(all_features, dummy_na=True, dtype=int)
  # Build the splits; the training rows come first in all_features.
  n_train = train_data.shape[0]
  if stage == "fit" or stage is None:
    train_features = all_features[:n_train]
    train_labels = train_data.SalePrice
    if self.kfold:
      # K-fold validation: pick the split that belongs to the active fold
      # rather than always the first one.
      kf = KFold(self.num_folds)
      splits = list(kf.split(train_features))
      train_indices, val_indices = splits[self.current_fold]
      self.train_features = train_features.iloc[train_indices]
      self.train_labels = train_labels.iloc[train_indices]
      self.val_features = train_features.iloc[val_indices]
      self.val_labels = train_labels.iloc[val_indices]
    else:
      # Train on everything; no dedicated validation split.
      self.train_features = train_features
      self.train_labels = train_labels
      self.val_features = None
      self.val_labels = None
  if stage == "test" or stage is None:
    # Kept as a DataFrame, like the training features, so consumers can use
    # `.values` on it.
    self.test_features = all_features[n_train:]

In [153]:
@add_to_class(KaggleDataModel)
def train_dataloader(self):
  """Build the shuffled DataLoader over the stored training split."""
  inputs = torch.tensor(self.train_features.values, dtype=torch.float32)
  # Labels become a single column, one row per sample.
  targets = torch.tensor(self.train_labels.values, dtype=torch.float32).reshape(-1,1)
  dataset = TensorDataset(inputs, targets)
  return DataLoader(
    dataset,
    batch_size=self.batch_size, shuffle=True,
    pin_memory=True, num_workers=self.num_workers)
  
@add_to_class(KaggleDataModel)
def val_dataloader(self):
  """Build the unshuffled validation DataLoader.

  Uses the stored validation split when there is one. Otherwise a 10%
  sample of the training split stands in for it; features and labels are
  both drawn with ``frac=0.1`` and ``random_state=1``.
  """
  if self.val_features is not None:
    features, labels = self.val_features, self.val_labels
  else:
    features = self.train_features.sample(frac=0.1, random_state=1)
    labels = self.train_labels.sample(frac=0.1, random_state=1)
  inputs = torch.tensor(features.values, dtype=torch.float32)
  # Labels become a single column, one row per sample.
  targets = torch.tensor(labels.values, dtype=torch.float32).reshape(-1,1)
  return DataLoader(
    TensorDataset(inputs, targets),
    batch_size=self.batch_size, shuffle=False,
    pin_memory=True, num_workers=self.num_workers)
    
@add_to_class(KaggleDataModel)
def test_dataloader(self):
  """Build the unshuffled DataLoader over the stored test features.

  Each batch holds the features only; there are no labels in the dataset.
  """
  # test_features may be a NumPy array or a DataFrame. np.asarray accepts
  # both, whereas `.values` exists only on pandas objects.
  inputs = torch.tensor(np.asarray(self.test_features), dtype=torch.float32)
  return DataLoader(
    TensorDataset(inputs),
    batch_size=self.batch_size, shuffle=False,
    pin_memory=True, num_workers=self.num_workers)

In [101]:
class KaggleModel(pl.LightningModule):
  """Lightning module holding the hyperparameters of the house-price model.

  The constructor only stores configuration on the instance.
  """

  def __init__(self, batch_size=64, learning_rate=1, weight_decay=1e-4, hidden_size=1024,
               input_size=330, output_size=1):
    """Store the hyperparameters.

    Args:
      batch_size: Stored on the instance as ``batch_size``.
      learning_rate: Stored as ``lr``.
      weight_decay: Stored as ``weight_decay``.
      hidden_size: Stored as ``hidden_size``.
      input_size: Number of input features. The default of 330 is the value
        that used to be hard-coded here.
      output_size: Number of outputs. The default of 1 is the value that
        used to be hard-coded here.
    """
    super().__init__()
    self.batch_size = batch_size
    self.hidden_size = hidden_size
    self.lr = learning_rate
    self.weight_decay = weight_decay
    self.input_size = input_size
    self.output_size = output_size

In [102]:
@add_to_class(KaggleModel)
def init_weight(self, m):
  """Kaiming-normal initialize the weight of ``m`` when it is an ``nn.Linear``.

  Modules of any other type are left untouched.
  """
  if not isinstance(m, nn.Linear):
    return
  nn.init.kaiming_normal_(m.weight, nonlinearity='relu')

In [113]:
@add_to_class(KaggleModel)
def setup(self, stage=None):
  """Build the network on the first call; later calls keep the existing one.

  Args:
    stage: Accepted for the Lightning hook signature; not used here.

  The network is Linear(input_size, hidden_size) -> ReLU ->
  Linear(hidden_size, output_size), initialized through ``init_weight``.
  """
  # This hook can run more than once on the same instance (e.g. once for
  # fitting and again for testing). Rebuilding the network on each call
  # would replace already-trained weights with fresh random ones.
  if getattr(self, "net", None) is not None:
    return
  self.net = nn.Sequential(
    nn.Linear(self.input_size, self.hidden_size),
    nn.ReLU(),
    nn.Linear(self.hidden_size, self.output_size)
  )
  self.net.apply(self.init_weight)

In [104]:
@add_to_class(KaggleModel)
def forward(self, X):
  """Run ``X`` through ``self.net`` and return the result."""
  outputs = self.net(X)
  return outputs

In [105]:
import torch.nn.functional as F

In [158]:
def log_mse_loss(preds, labels):
  """Return the root of the mean squared error between log(preds) and log(labels).

  Predictions are clamped to a minimum of 1 before the logarithm is taken,
  so their log is never negative. Labels are used as given.
  """
  log_preds = preds.clamp(min=1, max=float('inf')).log()
  log_labels = labels.log()
  return F.mse_loss(log_preds, log_labels).sqrt()

In [160]:
@add_to_class(KaggleModel)
def training_step(self, batch):
  """Compute the loss of one ``(features, labels)`` batch and log it as ``train_loss``."""
  inputs, targets = batch
  batch_loss = log_mse_loss(self(inputs), targets)
  self.log("train_loss", batch_loss)
  return batch_loss

In [108]:
@add_to_class(KaggleModel)
def validation_step(self, batch):
  """Compute the loss of one ``(features, labels)`` batch and log it as ``val_loss``."""
  inputs, targets = batch
  batch_loss = log_mse_loss(self(inputs), targets)
  self.log("val_loss", batch_loss)
  return batch_loss

In [109]:
@add_to_class(KaggleModel)
def test_step(self, batch):
  """Evaluate one test batch.

  Args:
    batch: A sequence whose first element is the feature tensor and whose
      optional second element is the label tensor.

  Returns:
    The loss (also logged as ``test_loss``) when labels are present.
    Otherwise the raw predictions, because no loss can be computed
    without labels.
  """
  features, *rest = batch
  preds = self(features)
  if not rest:
    # Feature-only batch, e.g. from a dataset built without labels. A strict
    # two-value unpack would raise ValueError here.
    return preds
  labels = rest[0]
  loss = log_mse_loss(preds, labels)
  self.log("test_loss", loss)
  return loss

In [115]:
@add_to_class(KaggleModel)
def configure_optimizers(self):
  """Create an Adam optimizer over all parameters with the stored lr and weight decay."""
  params = self.parameters()
  optimizer = optim.Adam(params, lr=self.lr, weight_decay=self.weight_decay)
  return optimizer

In [111]:
# Select PyTorch's 'high' internal precision mode for float32 matrix
# multiplications (global setting for the whole session).
torch.set_float32_matmul_precision('high')

In [None]:
# Train one freshly initialized model per cross-validation fold.
data = KaggleDataModel()

for fold in range(data.num_folds):
  # Record the active fold on the data module before it prepares its splits.
  data.current_fold = fold
  data.setup()
  # A new Trainer per fold: a Trainer keeps its epoch counter after fit(),
  # so one that has already reached max_epochs would stop immediately on the
  # following folds.
  trainer = pl.Trainer(max_epochs=50, log_every_n_steps=1)
  model = KaggleModel(batch_size=256, learning_rate=1, weight_decay=1e-4, hidden_size=1024)
  trainer.fit(model, datamodule=data)

In [None]:
# Final run: k-fold splitting is turned off, so the data module keeps the whole
# training set as the training split.
data = KaggleDataModel(kfold=False)
model = KaggleModel(batch_size=64, learning_rate=10, weight_decay=1e-4, hidden_size=1024)
# Train for at most 50 epochs, logging metrics at every step.
trainer = pl.Trainer(max_epochs=50, log_every_n_steps=1)
trainer.fit(model, datamodule=data)