In [None]:
from urllib.parse import urlparse

import allrank.models.losses as losses
import numpy as np
import os
import torch
from allrank.config import Config
from allrank.data.dataset_loading import load_libsvm_dataset, create_data_loaders
from allrank.models.model import make_model
from allrank.models.model_utils import get_torch_device, CustomDataParallel
from allrank.training.train_utils import fit
from allrank.utils.command_executor import execute_command
from allrank.utils.experiments import dump_experiment_result, assert_expected_metrics
from allrank.utils.file_utils import create_output_dirs, PathsContainer, copy_local_to_gs
from allrank.utils.ltr_logging import init_logger
from allrank.utils.python_utils import dummy_context_mgr
from argparse import ArgumentParser, Namespace
from attr import asdict
from functools import partial
from pprint import pformat
from torch import optim




In [11]:
import pickle

def load_dataset(path: str = "dataset.pkl"):
    """Deserialize and return the object stored in the pickle file at ``path``.

    :param path: location of the pickle file; defaults to ``dataset.pkl``
    :return: whatever object the pickle file contains
    """
    # NOTE(review): unpickling can execute arbitrary code -- only open trusted files.
    with open(path, "rb") as handle:
        return pickle.load(handle)

# Load the pickled dataset container from its default location (dataset.pkl).
dataset = load_dataset()

# Only the first (train, validation, test) split produced by the generator is used below.
# next() raises StopIteration right here if the generator yields nothing, instead of
# silently leaving train/val/test undefined until a later cell fails with a NameError.
train, val, test = next(iter(dataset.dataset_generator(select_label='log_10')))

In [13]:
# Distinct values of the 'date' level of the training features index.
date = train.features_df.index.get_level_values('date').unique()

# One feature matrix per date: the cross-section of features_df taken at that date.
X = [train.features_df.xs(day, level='date').values for day in date]


In [None]:
# One flattened (1-D) label vector per date, iterating the same `date` index as the features.
y = [train.labels_df.xs(day, level='date').values.reshape(-1) for day in date]

In [18]:
# Inspect the label vector of the first date. Each entry of `y` was already flattened when
# the list was built, so this reshape leaves the data unchanged.
y[0].reshape(-1,)

array([-0.07053239,  0.0919701 , -0.00689545,  0.01177618, -0.01992463,
       -0.05130711, -0.08648195, -0.00557836, -0.00230574, -0.02136601,
       -0.04565357,  0.0097459 , -0.0227221 , -0.02787481, -0.08394959,
       -0.06073666, -0.1422309 , -0.09336346, -0.06856097, -0.07854841,
       -0.01173215, -0.0752501 ,  0.01155894, -0.00730819, -0.10153928,
       -0.00960287,  0.01048213,  0.02739897, -0.00765385,  0.00861954,
       -0.08136405,  0.05853666, -0.08310259, -0.02583271, -0.15612815,
        0.02573671, -0.08730435, -0.04580954, -0.06571418,  0.01272552],
      dtype=float32)

In [None]:
from torch.utils.data import DataLoader, Dataset
from sklearn.datasets import load_svmlight_file


# Notebook-wide logger; LTRDataset uses it to report how many queries were loaded.
# NOTE(review): the meaning of the empty-string argument is not visible here -- confirm against init_logger.
logger = init_logger('')

class LTRDataset(Dataset):
    """
    Learning to Rank dataset in which every distinct date forms one query (slate).

    Features and labels are read from ``meta.features_df`` and ``meta.labels_df``, sliced on the
    ``date`` level of their index, and kept in memory as one ndarray per date.
    """
    def __init__(self, meta):
        """
        :param meta: meta dataset exposing ``features_df`` and ``labels_df``, both indexed with a ``date`` level
        """
        dates = meta.features_df.index.get_level_values('date').unique()

        # The dates come from features_df and are reused for labels_df, so the two lists
        # line up position by position.
        self.X_by_date = [meta.features_df.xs(d, level='date').values for d in dates]
        self.y_by_date = [meta.labels_df.xs(d, level='date').values.reshape(-1) for d in dates]

        # For simplicity the notebook currently works with fixed-length slates, in which case every
        # slate has this length. Taking the maximum (rather than the first slate's length) keeps the
        # value truthful to its name, to the log line below and to `shape` if slate lengths ever
        # differ; an empty dataset yields 0 instead of raising IndexError.
        self.longest_query_length = max((len(slate) for slate in self.X_by_date), default=0)

        logger.info(f"loaded dataset with {len(self.X_by_date)} queries")
        logger.info(f"longest query had {self.longest_query_length} documents")

    def __len__(self):
        """
        :return: number of groups (slates) in the dataset, i.e. the number of distinct dates
        """
        return len(self.X_by_date)

    def __getitem__(self, idx):
        """
        :param idx: index of a group
        :return: ndarrays tuple containing features and labels of shapes [slate_length, features_dim] and [slate_length], respectively
        """
        return self.X_by_date[idx], self.y_by_date[idx]

    @property
    def shape(self):
        """
        :return: shape of the dataset [batch_dim, document_dim, features_dim] where batch_dim is the number of groups
            (slates) and document_dim is the length of the longest group
        :raises IndexError: if the dataset holds no groups, because features_dim is read from the first one
        """
        batch_dim = len(self)
        document_dim = self.longest_query_length
        # The feature dimension is taken from the first slate's feature matrix.
        features_dim = self[0][0].shape[-1]
        return [batch_dim, document_dim, features_dim]

In [24]:
# Wrap the training and validation splits as per-date ranking datasets.
train_ds, valid_ds = LTRDataset(train), LTRDataset(val)

# Number of queries (dates) held by each split.
print(len(train_ds), len(valid_ds))

200 100


In [25]:
from allrank.config import Config

# Experiment configuration built from config.json; the cells below read its data, model,
# optimizer, loss, lr_scheduler, detect_anomaly and training entries.
config = Config.from_json('config.json')


In [26]:
# Both loaders share the worker count and batch size taken from the data section of the config.
loader_kwargs = dict(
    num_workers=config.data.num_workers,
    batch_size=config.data.batch_size,
)
train_dl = DataLoader(train_ds, **loader_kwargs)
val_dl = DataLoader(valid_ds, **loader_kwargs)

In [27]:
from attr import asdict

import torch

from allrank.models.model import make_model
from allrank.models.model_utils import get_torch_device, CustomDataParallel

# Take the feature dimension from the dataset the model is trained on (last entry of
# LTRDataset.shape) instead of the exploratory `X` list, so this cell does not depend on
# that scratch cell having been run.
n_features = train_ds.shape[-1]

# gpu support
dev = get_torch_device()
print(f"Model training will execute on {dev.type}")

# instantiate model; recurse=False passes the top-level entries of config.model as keyword
# arguments without converting nested attrs objects to dicts
model = make_model(n_features=n_features, **asdict(config.model, recurse=False))
gpu_count = torch.cuda.device_count()
if gpu_count > 1:
    # More than one visible GPU: wrap the model so training is spread across them.
    model = CustomDataParallel(model)
    print(f"Model training will be distributed to {gpu_count} GPUs.")
# Last expression of the cell, so the notebook also displays the model structure.
model.to(dev)

Model training will execute on cpu


LTRModel(
  (input_layer): FCModel(
    (input_norm): Identity()
    (activation): Identity()
    (dropout): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0): Linear(in_features=6, out_features=64, bias=True)
    )
  )
  (encoder): Encoder(
    (layers): ModuleList(
      (0): EncoderLayer(
        (self_attn): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=64, out_features=64, bias=True)
          )
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (feed_forward): PositionwiseFeedForward(
          (w_1): Linear(in_features=64, out_features=64, bias=True)
          (w_2): Linear(in_features=64, out_features=64, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
        )
        (sublayer): ModuleList(
          (0-1): 2 x SublayerConnection(
            (norm): LayerNorm()
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
      )
    )
    (norm): LayerNorm()
  )


In [28]:
from torch import optim

# Resolve optimizer, loss and LR scheduler by the names given in the config.
optimizer_cls = getattr(optim, config.optimizer.name)
optimizer = optimizer_cls(params=model.parameters(), **config.optimizer.args)

# The loss is a function from allrank.models.losses with its configured arguments pre-bound.
loss_func = partial(getattr(losses, config.loss.name), **config.loss.args)

# The scheduler is optional: it stays None when the config gives no scheduler name.
scheduler = None
if config.lr_scheduler.name:
    scheduler_cls = getattr(optim.lr_scheduler, config.lr_scheduler.name)
    scheduler = scheduler_cls(optimizer, **config.lr_scheduler.args)

In [None]:
# Autograd anomaly detection is switched on only when the config asks for it; otherwise a
# do-nothing context manager is used so the training call is written once.
anomaly_ctx = torch.autograd.detect_anomaly() if config.detect_anomaly else dummy_context_mgr()  # type: ignore

with anomaly_ctx:
    # run training; every entry of config.training is forwarded to fit() as a keyword argument
    fit_kwargs = dict(
        model=model,
        loss_func=loss_func,
        optimizer=optimizer,
        scheduler=scheduler,
        train_dl=train_dl,
        valid_dl=val_dl,
        config=config,
        device=dev,
        output_dir='output',
        tensorboard_output_path='output',
    )
    result = fit(**fit_kwargs, **asdict(config.training))