In [1]:
import time
import os.path

import pandas as pd
import pytorch_lightning as pl
import torch
import torch.nn.functional as F
from pytorch_lightning.callbacks import ModelCheckpoint
from torch import nn, optim
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Set PyTorch's float32 matmul precision mode to "medium" for the whole session.
torch.set_float32_matmul_precision("medium")

In [3]:
from torchmetrics.regression import MeanSquaredLogError

In [None]:
from sklearn.model_selection import KFold

In [4]:
def add_to_class(Class):
    """Return a decorator that registers a function as a method of ``Class``.

    The decorated object is attached to ``Class`` under its own ``__name__``
    and is also returned, so the decorated module-level name stays bound to
    the function instead of being rebound to ``None``.
    """
    def wrapper(obj):
        setattr(Class, obj.__name__, obj)
        # A decorator's return value is what the decorated name ends up bound
        # to, so hand the object back unchanged.
        return obj
    return wrapper

In [28]:
class KaggleData(pl.LightningDataModule):
    """Data module for the Kaggle house-price CSV files.

    Reads the train and test CSVs, standardizes the numeric columns, one-hot
    encodes the remaining columns, and exposes the result as ``TensorDataset``
    objects for fitting and prediction.

    Args:
        batch_size: Batch size passed to every ``DataLoader``.
        data_dir: Directory that contains both CSV files.
        train_csv: File name of the training CSV inside ``data_dir``.
        test_csv: File name of the test CSV inside ``data_dir``.
        num_workers: ``num_workers`` passed to every ``DataLoader``.
    """

    def __init__(self, batch_size=64, data_dir="../data",
                 train_csv='kaggle_house_pred_train.csv',
                 test_csv='kaggle_house_pred_test.csv',
                 num_workers=8):
        super().__init__()
        self.train_data_path = os.path.join(data_dir, train_csv)
        self.test_data_path = os.path.join(data_dir, test_csv)
        self.batch_size = batch_size
        self.num_workers = num_workers

    def _load_and_preprocess(self):
        """Read both CSVs and build ``self.all_features``.

        Sets ``origin_train_data``, ``origin_test_data``, ``test_data_id`` and
        ``all_features`` on the instance.
        """
        # Read the raw data.
        self.origin_train_data = pd.read_csv(self.train_data_path)
        self.origin_test_data = pd.read_csv(self.test_data_path)
        self.test_data_id = self.origin_test_data["Id"]

        # Stack train and test features so both get the same preprocessing.
        # The first column of each frame and the last column of the training
        # frame are dropped (presumably Id and SalePrice -- confirm against the CSVs).
        all_features = pd.concat((self.origin_train_data.iloc[:, 1:-1], self.origin_test_data.iloc[:, 1:]))

        # Standardize numeric columns: x <- (x - mean) / std.
        numeric_features = all_features.dtypes[all_features.dtypes != "object"].index
        all_features[numeric_features] = all_features[numeric_features].apply(
            lambda x: (x - x.mean()) / (x.std())
        )
        # After standardization the column mean is 0, so missing values become 0.
        all_features[numeric_features] = all_features[numeric_features].fillna(0)

        # One-hot encode the discrete columns; NaN gets its own indicator column.
        self.all_features = pd.get_dummies(all_features, dummy_na=True, dtype=int)

    def prepare_data(self):
        """Load and preprocess the CSV files (see ``_load_and_preprocess``)."""
        self._load_and_preprocess()

    def setup(self, stage=None):
        """Build the datasets for ``stage``.

        ``"fit"`` builds ``self.train_data``, ``"predict"`` builds
        ``self.predict_data``; ``None`` builds both.
        """
        # Lightning may run prepare_data() on a single process only, so state
        # assigned there is not guaranteed to exist here. Load on demand.
        if not hasattr(self, "all_features"):
            self._load_and_preprocess()

        # The first n_train rows of all_features are the training rows.
        n_train = self.origin_train_data.shape[0]
        if stage == "fit" or stage is None:
            train_features = self.all_features[:n_train].values
            train_labels = self.origin_train_data.SalePrice.values
            self.train_data = TensorDataset(
                torch.tensor(train_features, dtype=torch.float32),
                # Labels are reshaped to a column so they are (N, 1).
                torch.tensor(train_labels, dtype=torch.float32).reshape(-1, 1),
            )
        if stage == "predict" or stage is None:
            predict_features = self.all_features[n_train:].values
            self.predict_data = TensorDataset(torch.tensor(predict_features, dtype=torch.float32))

    def train_dataloader(self):
        """Return a shuffled loader over ``self.train_data``."""
        return DataLoader(self.train_data, batch_size=self.batch_size,
                            shuffle=True, pin_memory=True, num_workers=self.num_workers)

    def predict_dataloader(self):
        """Return an in-order loader over ``self.predict_data``."""
        return DataLoader(
            self.predict_data, batch_size=self.batch_size,
            shuffle=False, pin_memory=True, num_workers=self.num_workers)

In [29]:
class KaggleData_KFold(KaggleData):
    """``KaggleData`` variant that holds out one K-fold split of the training rows.

    Args:
        num_folds: Number of folds passed to ``KFold``.
        current_fold: 0-based index of the fold used as the validation/test split.
        **kwargs: Forwarded to ``KaggleData.__init__`` (``batch_size``,
            ``data_dir``, ``train_csv``, ``test_csv``, ``num_workers``).
    """

    def __init__(self, num_folds=5, current_fold=0, **kwargs):
        super().__init__(**kwargs)
        self.num_folds = num_folds
        self.current_fold = current_fold

    @staticmethod
    def _to_dataset(features, labels):
        """Convert a feature DataFrame and label Series into a ``TensorDataset``."""
        # Go through .values (as the base class does): torch.tensor() is given
        # plain numpy arrays rather than pandas objects.
        return TensorDataset(
            torch.tensor(features.values, dtype=torch.float32),
            torch.tensor(labels.values, dtype=torch.float32).reshape(-1, 1),
        )

    def setup(self, stage=None):
        """Build the datasets for ``stage`` from the selected fold.

        ``"fit"`` builds ``train_data`` and ``val_data``, ``"validate"`` builds
        ``val_data``, ``"test"`` builds ``test_data`` (the held-out fold) and
        ``"predict"`` defers to the base class. ``None`` builds all of them.

        Raises:
            ValueError: If ``current_fold`` is not in ``[0, num_folds)``.
        """
        if stage == "predict" or stage is None:
            # Prediction data does not depend on the fold split.
            super().setup("predict")
            if stage == "predict":
                return

        if not 0 <= self.current_fold < self.num_folds:
            raise ValueError(
                f"current_fold must be in [0, {self.num_folds}), got {self.current_fold}"
            )

        n_train = self.origin_train_data.shape[0]
        features = self.all_features[:n_train]
        labels = self.origin_train_data.SalePrice

        # K-fold validation: pick the split that belongs to the current fold.
        kf = KFold(self.num_folds)
        train_indices, val_indices = list(kf.split(features))[self.current_fold]

        if stage == "fit" or stage is None:
            self.train_data = self._to_dataset(
                features.iloc[train_indices], labels.iloc[train_indices])
        if stage in ("fit", "validate") or stage is None:
            self.val_data = self._to_dataset(
                features.iloc[val_indices], labels.iloc[val_indices])
        if stage == "test" or stage is None:
            # The held-out fold doubles as the test set.
            self.test_data = self._to_dataset(
                features.iloc[val_indices], labels.iloc[val_indices])

    def val_dataloader(self):
        """Return an in-order loader over ``self.val_data``."""
        return DataLoader(
            self.val_data, batch_size=self.batch_size,
            shuffle=False, pin_memory=True, num_workers=self.num_workers)

    def test_dataloader(self):
        """Return an in-order loader over ``self.test_data``."""
        return DataLoader(
            self.test_data, batch_size=self.batch_size,
            shuffle=False, pin_memory=True, num_workers=self.num_workers)

In [32]:
# Build the data module with its default arguments and run the two hooks by
# hand: prepare_data() reads and preprocesses the CSVs, and setup() with no
# stage builds both the training and the prediction datasets.
data = KaggleData()
data.prepare_data()
data.setup()

In [39]:
# Pull a single batch from the training loader for inspection.
dummy_batch = next(iter(data.train_dataloader()))

In [40]:
# Number of tensors in the batch (one for features, one for labels).
len(dummy_batch)

2

In [41]:
# First example of the batch: x is its feature tensor, y its label tensor.
x, y = dummy_batch[0][0], dummy_batch[1][0]

In [42]:
# Shapes of one example's feature tensor and label tensor.
x.shape, y.shape

(torch.Size([330]), torch.Size([1]))

In [46]:
class KaggleClassifier(pl.LightningModule):
    """Lightning module trained and evaluated with mean squared log error.

    No ``forward`` is defined in this block; ``self(features)`` relies on one
    being supplied elsewhere. NOTE(review): confirm where it is defined.

    Args:
        lr: Learning rate passed to ``optim.SGD``.
        momentum: Momentum passed to ``optim.SGD``.
        weight_decay: Weight decay passed to ``optim.SGD``.
    """

    def __init__(self, lr=0.1, momentum=0.9, weight_decay=1e-5):
        super().__init__()
        # Makes lr / momentum / weight_decay available as self.hparams.*.
        self.save_hyperparameters()
        # One metric object per stage so their states stay separate.
        self.train_loss = MeanSquaredLogError()
        self.val_loss = MeanSquaredLogError()
        self.test_loss = MeanSquaredLogError()

    def _shared_step(self, batch, metric, log_name):
        """Run the model on ``batch``, score it with ``metric`` and log the value.

        Returns the value produced by calling ``metric(preds, labels)``.
        """
        features, labels = batch
        preds = self(features)
        loss = metric(preds, labels)
        self.log(log_name, loss, prog_bar=True)
        return loss

    def training_step(self, batch):
        """Compute and log ``train_loss`` for one batch; the value is the training loss."""
        return self._shared_step(batch, self.train_loss, "train_loss")

    def validation_step(self, batch):
        """Compute and log ``val_loss`` for one batch."""
        return self._shared_step(batch, self.val_loss, "val_loss")

    def test_step(self, batch):
        """Compute and log ``test_loss`` for one batch."""
        return self._shared_step(batch, self.test_loss, "test_loss")

    def configure_optimizers(self):
        """Return an SGD optimizer built from the saved hyperparameters."""
        return optim.SGD(
            self.parameters(),
            lr=self.hparams.lr,
            momentum=self.hparams.momentum,
            weight_decay=self.hparams.weight_decay,
        )

    def on_test_end(self):
        """Log the hyperparameters together with the three loss metrics."""
        # Nothing to log to when no logger is attached.
        if self.logger is None:
            return
        # The metric attributes are named *_loss; the keys follow the same
        # naming so the logged values are labeled as what they are.
        # NOTE(review): nothing in this class resets the metrics, so compute()
        # presumably covers every batch seen so far -- confirm this is intended.
        self.logger.log_hyperparams(
            self.hparams,
            {
                "hp/train_loss": self.train_loss.compute(),
                "hp/val_loss": self.val_loss.compute(),
                "hp/test_loss": self.test_loss.compute(),
            },
        )