In [1]:
from recbole.data import create_dataset, data_preparation
from recbole.config import Config

In [2]:
model = "NPE"
dataset = "ml-100k"

# Overrides applied on top of RecBole's defaults for this model/dataset pair.
config_dict = dict(
    # Evaluation protocol: order interactions by time ("TO"), then split them
    # by ratio ("RS") into 80% / 10% / 10%, without grouping.
    eval_args=dict(
        order="TO",
        split=dict(RS=[0.8, 0.1, 0.1]),
        group_by=None,
    ),
    # No negative-sampling arguments are supplied for training.
    train_neg_sample_args=None,
)

config = Config(model=model, dataset=dataset, config_dict=config_dict)

In [3]:
# Create the dataset object described by `config`.
dataset = create_dataset(config)
# `data_preparation(config, dataset)` is deliberately not used here: calling
# `dataset.build()` directly returns the split datasets themselves, so their
# interaction features can be inspected in the cells below.
# train_data, valid_data, test_data = data_preparation(config, dataset)
model_type = config["MODEL_TYPE"]
built_datasets = dataset.build()
# `build()` is expected to return exactly three datasets here (train/valid/test).
train_dataset, valid_dataset, test_dataset = built_datasets

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  feat[field].fillna(value=feat[field].mean(), inplace=True)


In [12]:
# Timestamp column of the whole dataset, for comparison with the three splits.
dataset.inter_feat.timestamp

tensor([9.1585e-07, 2.3704e-06, 3.8250e-06,  ..., 1.0000e+00, 1.0000e+00,
        1.0000e+00])

In [4]:
# Timestamp column of the training split.
train_dataset.inter_feat.timestamp

tensor([9.1585e-07, 2.3704e-06, 3.8250e-06,  ..., 7.8184e-01, 7.8184e-01,
        7.8184e-01])

In [5]:
# Timestamp column of the validation split.
valid_dataset.inter_feat.timestamp

tensor([0.7818, 0.7818, 0.7818,  ..., 0.8974, 0.8974, 0.8974])

In [6]:
# Timestamp column of the test split.
test_dataset.inter_feat.timestamp

tensor([0.8974, 0.8974, 0.8974,  ..., 1.0000, 1.0000, 1.0000])

In [8]:
# Summary of the test split's interaction fields (name, shape, device, dtype).
test_dataset.inter_feat

The batch_size of interaction: 9905
    user_id, torch.Size([9905]), cpu, torch.int64
    item_id, torch.Size([9905]), cpu, torch.int64
    rating, torch.Size([9905]), cpu, torch.float32
    timestamp, torch.Size([9905]), cpu, torch.float32
    item_length, torch.Size([9905]), cpu, torch.int64
    item_id_list, torch.Size([9905, 50]), cpu, torch.int64
    rating_list, torch.Size([9905, 50]), cpu, torch.float32
    timestamp_list, torch.Size([9905, 50]), cpu, torch.float32


In [7]:
# Sanity check: report once if any test timestamp is smaller than the one
# right before it, i.e. the test split is not in non-decreasing time order.
test_timestamps = test_dataset.inter_feat.timestamp
if (test_timestamps[1:] < test_timestamps[:-1]).any():
    print("not increased")

## Implement a time-cutoff Dataset

In [11]:
import numpy as np

from recbole.data.dataset import Dataset
from recbole.utils import (
    FeatureType,
    set_color,
)

class TimeCutoffDataset(Dataset):
    """RecBole ``Dataset`` that splits interactions chronologically at fixed cutoffs.

    Expected ``eval_args`` configuration::

        "order": "TO",
        "split": {"CO": ["1998-01-01", "1998-03-01"]},
        "group_by": None,

    ``k`` ascending cutoffs produce ``k + 1`` datasets, so two cutoffs give the
    usual train/validation/test triple and a single cutoff gives two datasets.

    Cutoffs are always given in the *raw* units of the time field, either as
    numbers or as ISO-8601 strings / ``datetime`` objects. The latter are
    converted to Unix epoch seconds (UTC), which assumes the time field stores
    Unix seconds (as ml-100k does) - confirm this for other datasets.

    Attributes:
        timestamp_min (float): Smallest raw value of the time field, recorded
            before normalization.
        timestamp_max (float): Largest raw value of the time field, recorded
            before normalization.
        timestamp_normalized (bool): Whether ``_normalize`` changed the value
            range of the time field.
    """

    def __init__(self, config):
        # Defined before ``super().__init__`` because the parent constructor is
        # expected to run the preprocessing pipeline, which calls ``_normalize``.
        self.timestamp_max, self.timestamp_min = 0.0, 0.0
        self.timestamp_normalized = False

        super().__init__(config)

    def _normalize(self):
        """Record the raw range of the time field, then run the parent normalization.

        The raw minimum/maximum are needed later to express a cutoff given in
        raw time units on the (possibly normalized) scale of the stored field.
        """
        time_field = self.time_field
        inter_feat = self.inter_feat
        has_time = (
            time_field is not None
            and inter_feat is not None
            and len(inter_feat) > 0
            and time_field in inter_feat
        )

        if has_time:
            self.timestamp_min = float(inter_feat[time_field].min())
            self.timestamp_max = float(inter_feat[time_field].max())

        result = super()._normalize()

        if has_time:
            # Detect normalization by its effect rather than by re-implementing
            # the parent's configuration logic.
            scaled = self.inter_feat[time_field]
            self.timestamp_normalized = (
                float(scaled.min()) != self.timestamp_min
                or float(scaled.max()) != self.timestamp_max
            )

        return result

    def _fill_nan(self):
        """Missing value imputation.

        For fields with type :obj:`~recbole.utils.enum_type.FeatureType.TOKEN`, missing value will be filled by
        ``[PAD]``, which indexed as 0.

        For fields with type :obj:`~recbole.utils.enum_type.FeatureType.FLOAT`, missing value will be filled by
        the average of original data.

        For sequence fields, a missing value (a bare float ``NaN``) is replaced
        by an empty array of the matching dtype.

        Note:
            This mirrors RecBole's implementation with two compatibility fixes:
            columns are reassigned instead of using chained ``inplace`` calls
            (which stop working in pandas 3.0), and ``np.float64`` replaces the
            ``np.float`` alias that was removed in NumPy 1.24.
        """
        self.logger.debug(set_color("Filling nan", "green"))

        for feat_name in self.feat_name_list:
            feat = getattr(self, feat_name)
            for field in feat:
                ftype = self.field2type[field]
                if ftype == FeatureType.TOKEN:
                    feat[field] = feat[field].fillna(value=0)
                elif ftype == FeatureType.FLOAT:
                    feat[field] = feat[field].fillna(value=feat[field].mean())
                else:
                    dtype = np.int64 if ftype == FeatureType.TOKEN_SEQ else np.float64
                    feat[field] = feat[field].apply(
                        lambda x: (
                            np.array([], dtype=dtype) if isinstance(x, float) else x
                        )
                    )

    def build(self):
        """Sort the interactions by time and split them at the configured cutoffs.

        Returns:
            list[Dataset]: The split datasets in chronological order.

        Raises:
            AssertionError: If ``eval_args['order']`` is not ``'TO'`` or the
                split configuration does not have exactly one key.
            ValueError: If ``eval_args['split']`` is missing or not a dict.
            NotImplementedError: If the split mode is not ``'CO'``.
        """
        # Pre-split benchmark files are handled entirely by the parent. Delegate
        # before touching the features: RecBole's ``Dataset.build`` performs the
        # format conversion itself, so converting here first would do it twice.
        if self.benchmark_filename_list is not None:
            return super().build()

        eval_args = self.config["eval_args"]

        # Validate the whole configuration before mutating the dataset.
        if eval_args["order"] != "TO":
            raise AssertionError("The ordering_method must be 'TO'.")

        split_args = eval_args["split"]
        if split_args is None:
            raise ValueError("The split_args in eval_args should not be None.")
        if not isinstance(split_args, dict):
            raise ValueError(f"The split_args [{split_args}] should be a dict.")

        assert len(split_args.keys()) == 1
        split_mode = list(split_args.keys())[0]
        if split_mode != "CO":
            raise NotImplementedError("The split_mode must be 'CO'.")

        self._change_feat_format()
        self.sort(by=self.time_field)

        group_field = self._resolve_group_field(eval_args["group_by"])
        return self.split_by_cuttoff(cutoff=split_args["CO"], group_by=group_field)

    def _resolve_group_field(self, group_by):
        """Translate the ``group_by`` config value into a field name (or ``None``)."""
        if group_by is None or str(group_by).lower() == "none":
            return None
        if group_by == "user":
            return self.uid_field
        # Anything else is taken to be a field name already.
        return group_by

    @staticmethod
    def _cutoff_to_raw_time(value):
        """Convert one cutoff (number, numeric string, date string, datetime) to a float.

        Numbers and purely numeric strings are returned as-is; everything else
        is parsed by ``np.datetime64`` and converted to Unix epoch seconds.
        """
        if isinstance(value, (int, float, np.integer, np.floating)):
            return float(value)
        if isinstance(value, str):
            try:
                return float(value)
            except ValueError:
                pass  # not a number: fall through to date parsing
        return float(np.datetime64(value, "s").astype(np.int64))

    def _to_time_scale(self, cutoff):
        """Express raw cutoff(s) on the scale of the stored time field.

        If the time field was normalized, the recorded raw range is used to map
        each cutoff with ``(value - min) / (max - min)``. This assumes the parent
        applies plain min-max scaling - verify against the installed RecBole.

        Returns:
            numpy.ndarray: 1-D float64 array of cutoffs.
        """
        values = cutoff if isinstance(cutoff, (list, tuple)) else [cutoff]
        raw = [self._cutoff_to_raw_time(value) for value in values]

        span = self.timestamp_max - self.timestamp_min
        if self.timestamp_normalized and span > 0:
            raw = [(value - self.timestamp_min) / span for value in raw]
        return np.asarray(raw, dtype=np.float64)

    def split_by_cuttoff(self, cutoff, group_by=None) -> list[Dataset]:
        """Split the interactions at one or more cutoff times.

        An interaction whose time is strictly before the first cutoff goes to
        the first dataset; an interaction exactly at a cutoff goes to the later
        dataset. When the time field is normalized, it is stored as float32, so
        ties exactly at a cutoff are subject to rounding.

        Args:
            cutoff: A single cutoff or a list/tuple of ascending cutoffs, in raw
                time units (number, numeric string, ISO-8601 string or datetime).
            group_by (str, optional): Field to group by, usually the user id
                field. Grouping does not change which dataset an interaction
                belongs to; it only orders the rows of each dataset group by
                group instead of globally. Defaults to ``None``.

        Returns:
            list[Dataset]: ``len(cutoffs) + 1`` datasets in chronological order,
            each holding its share of the interaction features.

        Raises:
            ValueError: If no cutoff is given, a cutoff cannot be parsed, or the
                cutoffs are not in ascending order.
        """
        self.logger.debug(f"split by cutoff date = '{cutoff}', group_by=[{group_by}]")

        boundaries = self._to_time_scale(cutoff)
        if boundaries.size == 0:
            raise ValueError("At least one cutoff is required.")
        if np.any(np.diff(boundaries) < 0):
            raise ValueError(f"The cutoffs [{cutoff}] should be in ascending order.")

        timestamps = self.inter_feat[self.time_field].numpy()
        # part_of_row[i] == number of cutoffs that are <= timestamps[i].
        part_of_row = np.searchsorted(boundaries, timestamps, side="right")

        if group_by is None:
            groups = [np.arange(len(timestamps))]
        else:
            groups = self._grouped_index(self.inter_feat[group_by].numpy())

        # One independent list per output dataset (``[[]] * n`` would alias a
        # single list n times).
        next_index = [[] for _ in range(len(boundaries) + 1)]
        for grouped_index in groups:
            grouped_index = np.asarray(grouped_index, dtype=np.int64)
            parts = part_of_row[grouped_index]
            for part, index in enumerate(next_index):
                index.extend(grouped_index[parts == part].tolist())

        self._drop_unused_col()
        next_df = [self.inter_feat[index] for index in next_index]
        next_ds = [self.copy(feat) for feat in next_df]
        return next_ds