In [5]:
from recbole.data import create_dataset, data_preparation
from recbole.config import Config

In [None]:
# Experiment identifiers handed to RecBole's Config.
model = 'NPE'
dataset = 'ml-100k'

# Evaluation overrides: "TO" ordering, a [0.8, 0.1, 0.1] "RS" split and no
# grouping; 'train_neg_sample_args' is explicitly set to None.
config_dict = dict(
    eval_args=dict(
        order="TO",
        split=dict(RS=[0.8, 0.1, 0.1]),
        group_by=None,
    ),
    train_neg_sample_args=None,
)

config = Config(model=model, dataset=dataset, config_dict=config_dict)

In [None]:
# Show the name of the timestamp field. At this point in a top-to-bottom run
# `dataset` is still the dataset *name* string, so the value is read from the
# config rather than from a dataset object.
config["TIME_FIELD"]

In [None]:
# Build the RecBole dataset described by `config`. This rebinds `dataset`,
# which until now held the dataset *name* string, to the created dataset object.
dataset = create_dataset(config)
# Parked experiments (disabled): the stock data_preparation path and a direct
# dataset.build() call that would yield the train/valid/test datasets.
# train_data, valid_data, test_data = data_preparation(config, dataset)
# model_type = config["MODEL_TYPE"]
# built_datasets = dataset.build()
# train_dataset, valid_dataset, test_dataset = built_datasets

In [None]:
# Prototype of the per-user time-cutoff split on the interaction DataFrame.
#
# For each user, ordered by time:
#   next_index[0] (train) - all interactions before the cutoff except the last
#   next_index[1] (valid) - the last interaction before the cutoff
#   next_index[2] (test)  - the first interaction at or after the cutoff
# Users with no interaction before the cutoff are skipped.

# Cutoff on the same scale as dataset.time_field
# (presumably the min-max-normalised timestamp - TODO confirm).
cutoff_conv = 0.351652

group_by = 'user_id'
grouped_inter_feat_index = dataset._grouped_index(dataset.inter_feat[group_by])

# One independent list per split; `[[]] * 3` would repeat the *same* list
# object three times, so every split would receive every index.
next_index = [[] for _ in range(3)]
for grouped_index in grouped_inter_feat_index:
    # NOTE(review): `.loc` treats the grouped positions as index labels, which
    # assumes inter_feat carries a default 0..n-1 index - confirm.
    df_each_user = dataset.inter_feat.loc[grouped_index].sort_values(dataset.time_field)

    df_before = df_each_user[df_each_user[dataset.time_field] < cutoff_conv]
    df_after = df_each_user[df_each_user[dataset.time_field] >= cutoff_conv]

    if len(df_before) == 0:
        continue

    # `df.index[...]` yields row labels; `df.iloc[-1].index` would instead
    # return the column names of that single row.
    # NOTE(review): a user with exactly one interaction before the cutoff adds
    # nothing to train or valid (kept from the original logic) - confirm intent.
    next_index[0].extend(df_before.index[:-1])
    if len(df_before) >= 2:
        next_index[1].append(df_before.index[-1])

    if len(df_after) > 0:
        next_index[2].append(df_after.index[0])


In [None]:
# Inspect the per-user frame left over from the last iteration of the split loop
# (relies on the loop variable still being alive in the kernel).
df_each_user

In [None]:
# Re-derive the time-sorted interactions of the last group visited by the split
# loop; `grouped_index` is the leftover loop variable from that cell.
dataset.inter_feat.loc[grouped_index].sort_values(dataset.time_field)

In [None]:
# Display the dataset's interaction table.
dataset.inter_feat

In [None]:
# Display the dataset's `float_like_fields` attribute.
dataset.float_like_fields

In [None]:
# Find the first feature table that contains `field`.
field = "timestamp"

assert field in dataset.fields(), f"Dataset not existed field '{field}'"

# Take the first entry explicitly. A `for ...: break` loop would leave `feat`
# undefined (or silently stale from an earlier run) if nothing were yielded.
feat = next(iter(dataset.field2feats(field)), None)
assert feat is not None, f"No feature table contains field '{field}'"

feat

In [None]:
# Display everything `field2feats` returns for `field` (set in an earlier cell).
dataset.field2feats(field)

In [None]:
# Display the interaction table of the test split.
# NOTE(review): the only assignment to `test_dataset` in the cells above is
# commented out, so this cell presumably fails on a fresh kernel - confirm.
test_dataset.inter_feat

In [None]:
group_by = 'user_id'
# Disabled variant that called `.numpy()` on the column before grouping.
# grouped_inter_feat_index = dataset._grouped_index(dataset.inter_feat[group_by].numpy())

# Display the grouping column converted with `.to_numpy()`.
dataset.inter_feat[group_by].to_numpy()

## Implement time cutoff Dataset

In [1]:
import copy
import importlib
import os
import pickle
import warnings
from typing import Literal

import numpy as np

from recbole.data.dataloader import *
from recbole.sampler import KGSampler, Sampler, RepeatableSampler
from recbole.utils import ModelType, ensure_dir, get_local_time, set_color
from recbole.utils.argument_list import dataset_arguments
from recbole.data.dataset import Dataset
from recbole.utils import (
    FeatureType,
    set_color,
)

In [8]:
# TODO: HoangLe [Jun-09]: How to replace config["MODEL_TYPE"] to TimeCutoffDataset

# Custom model-type id; a later cell stores it in config['MODEL_TYPE'].
# NOTE(review): 7 is presumably chosen so it does not clash with the values of
# recbole's ModelType - confirm.
MODELTYPE_CUTOFF = 7

class TimeCutoffDataset(Dataset):
    """Dataset whose :meth:`build` splits interactions around a cutoff timestamp.

    Within every group (typically one user) the interactions are ordered by
    ``self.time_field`` and divided at the cutoff:

    * train - every interaction before the cutoff except the last one,
    * valid - the last interaction before the cutoff (only when the group has
      at least two interactions before the cutoff),
    * test  - the first interaction at or after the cutoff.

    Groups without any interaction before the cutoff are skipped entirely.

    Attributes are documented inline in ``__init__``.
    """

    # Keys accepted under ``eval_args['split']`` to request the cutoff split.
    _CUTOFF_SPLIT_MODES = ("CO", "cutoff")

    def __init__(self, config):
        # Raw (pre-normalisation) extremes of the time field, filled by _normalize().
        # They are created before super().__init__() because the parent
        # constructor presumably runs the loading pipeline that calls
        # _normalize() - TODO confirm.
        self.timestamp_max, self.timestamp_min = 0., 0.
        # Last cutoff passed to split_by_cuttoff(): raw value and its normalised form.
        self.cutoff, self.cutoff_conv = 0., 0.

        super().__init__(config)

    def _normalize(self):
        """Record the raw min/max of the time field, then run the parent normalisation.

        The extremes are needed later to map a raw cutoff timestamp onto the
        scale of the stored time field.

        NOTE(review): this assumes the parent min-max normalises
        ``self.time_field``; if the field is left untouched the scaled cutoff
        will not be comparable with the data - confirm against the config.
        """
        time_values = self.inter_feat[self.time_field]
        self.timestamp_max = float(np.max(time_values))
        self.timestamp_min = float(np.min(time_values))

        return super()._normalize()

    def _fill_nan(self):
        """Missing value imputation.

        For fields with type :obj:`~recbole.utils.enum_type.FeatureType.TOKEN`, missing value will be filled by
        ``[PAD]``, which indexed as 0.

        For fields with type :obj:`~recbole.utils.enum_type.FeatureType.FLOAT`, missing value will be filled by
        the average of original data.

        For every other field type, entries that are a ``float`` (i.e. a NaN
        placeholder) are replaced by an empty array: ``int64`` for
        ``TOKEN_SEQ`` fields and ``float64`` otherwise.

        Note:
            This mirrors recbole's original implementation; the only difference is that results are
            assigned back to the column instead of using in-place operations, to suit pandas 3.0.
        """
        self.logger.debug(set_color("Filling nan", "green"))

        for feat_name in self.feat_name_list:
            feat = getattr(self, feat_name)
            for field in feat:
                ftype = self.field2type[field]
                if ftype == FeatureType.TOKEN:
                    feat[field] = feat[field].fillna(value=0)
                elif ftype == FeatureType.FLOAT:
                    feat[field] = feat[field].fillna(value=feat[field].mean())
                else:
                    dtype = np.int64 if ftype == FeatureType.TOKEN_SEQ else np.float64
                    feat[field] = feat[field].apply(
                        lambda x: (
                            np.array([], dtype=dtype) if isinstance(x, float) else x
                        )
                    )

    def build(self):
        """Order the interactions by time and split them around the configured cutoff.

        Reads ``eval_args`` from ``self.config``:

        * ``order`` must be ``"TO"``;
        * ``split`` must be a dict with exactly one entry whose key is ``"CO"``
          or ``"cutoff"`` and whose value is the cutoff timestamp;
        * ``group_by`` names the field to group on and must not be ``None``.

        Returns:
            list[Dataset]: training/validation/testing datasets, or whatever the
            parent ``build()`` returns when benchmark files are configured.

        Raises:
            AssertionError: ``order`` is not ``"TO"`` or ``split`` does not hold
                exactly one entry.
            ValueError: ``split`` is ``None`` or not a dict, or ``group_by`` is ``None``.
            NotImplementedError: the split key is not a supported cutoff mode.
        """
        # Benchmark data is delegated entirely to the parent, and its result is
        # returned instead of being discarded. This happens before converting
        # the features here because the parent build() is expected to perform
        # that conversion itself - TODO confirm.
        if self.benchmark_filename_list is not None:
            return super().build()

        self._change_feat_format()

        # ordering
        ordering_args = self.config["eval_args"]["order"]
        if ordering_args != "TO":
            raise AssertionError("The ordering_method must be 'TO'.")
        self.sort(by=self.time_field)

        # splitting & grouping
        split_args = self.config["eval_args"]["split"]
        if split_args is None:
            raise ValueError("The split_args in eval_args should not be None.")
        if not isinstance(split_args, dict):
            raise ValueError(f"The split_args [{split_args}] should be a dict.")
        if len(split_args) != 1:
            raise AssertionError(
                f"The split_args [{split_args}] should contain exactly one entry."
            )

        split_mode = next(iter(split_args))
        if split_mode not in self._CUTOFF_SPLIT_MODES:
            raise NotImplementedError("The split_mode must be 'CO'.")

        # Read the value stored under the key that was actually supplied.
        # NOTE: HoangLe [Jun-05]: cutoff may come with different types: string, int
        cutoff = split_args[split_mode]

        group_by = self.config["eval_args"]["group_by"]
        if group_by is None:
            raise ValueError("The group_by in eval_args should not be None.")

        return self.split_by_cuttoff(cutoff=cutoff, group_by=group_by)

    def _normalize_timestamp(self, timestamp: float) -> float:
        """Min-max scale a raw timestamp with the extremes recorded by ``_normalize``.

        Returns ``1.0`` (after logging a warning) when all raw timestamps are
        identical, since the scale is then degenerate.
        """
        mx, mn = self.timestamp_max, self.timestamp_min
        if mx == mn:
            self.logger.warning(
                f"All the same value in [{self.time_field}] from [inter_feat]."
            )
            return 1.0
        return (timestamp - mn) / (mx - mn)

    def split_by_cuttoff(self, cutoff: str|int, group_by: str) -> list[Dataset]:
        """Split the interactions by cutoff date.

        Intended to be called from :meth:`build`, i.e. after the features have
        been converted by ``_change_feat_format()``; the per-split indices are
        row positions into ``self.inter_feat``.

        Args:
            cutoff (str | int): cutoff date in Unix timestamp format
            group_by (str): field to group by, usually the user_id

        Returns:
            list[Dataset]: list of training/validation/testing dataset, whose interaction features has been split.

        Notes:
            cutoff may be different types: string of Unix timestamp (e.g. '1717923174'), integer of Unix timestamp (e.g. 1717923174)
        """

        self.logger.debug(f"split by cutoff date = '{cutoff}', group_by=[{group_by}]")

        # float() accepts both the string and the numeric form of the cutoff.
        cutoff_raw = float(cutoff)
        cutoff_conv = self._normalize_timestamp(cutoff_raw)
        self.cutoff, self.cutoff_conv = cutoff_raw, cutoff_conv

        # np.asarray is used so the columns become plain arrays regardless of
        # the container type inter_feat returns for a field.
        timestamps = np.asarray(self.inter_feat[self.time_field])
        group_keys = np.asarray(self.inter_feat[group_by])

        # 'next_index' contains the row positions for training/validation/testing.
        # Each split needs its own list: `[[]] * 3` would alias a single list.
        next_index = [[] for _ in range(3)]
        for grouped_index in self._grouped_index(group_keys):
            positions = np.asarray(grouped_index, dtype=np.int64)
            group_times = timestamps[positions]

            # Stable sort keeps the existing order among equal timestamps.
            order = np.argsort(group_times, kind="stable")
            positions, group_times = positions[order], group_times[order]

            before = positions[group_times < cutoff_conv]
            after = positions[group_times >= cutoff_conv]

            if before.size == 0:
                continue

            # NOTE(review): a group with exactly one interaction before the
            # cutoff adds nothing to train or valid (kept from the original
            # logic) - confirm this is intended.
            next_index[0].extend(before[:-1].tolist())
            if before.size >= 2:
                next_index[1].append(int(before[-1]))

            if after.size > 0:
                next_index[2].append(int(after[0]))

        self._drop_unused_col()
        next_df = [self.inter_feat[index] for index in next_index]
        next_ds = [self.copy(df) for df in next_df]
        return next_ds

In [9]:
# Experiment identifiers handed to RecBole's Config.
model = 'NPE'
dataset = 'ml-100k'

# Evaluation overrides: "TO" ordering and a 'cutoff' split whose value is
# presumably a Unix timestamp given as a string, grouped by 'user_id'.
config_dict = dict(
    eval_args=dict(
        order="TO",
        split=dict(cutoff='12731432'),
        group_by='user_id',
    ),
    train_neg_sample_args=None,
)

config = Config(model=model, dataset=dataset, config_dict=config_dict)

# Tag the config with the custom model-type id, then rebind `dataset` (so far
# the dataset *name*) to the constructed TimeCutoffDataset.
config['MODEL_TYPE'] = MODELTYPE_CUTOFF
dataset = TimeCutoffDataset(config)

In [11]:
# Display the interaction table of the TimeCutoffDataset built in the previous cell.
dataset.inter_feat

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,1,0.50,0.351593
1,2,2,0.50,0.915478
2,3,3,0.00,0.224244
3,4,4,0.25,0.316897
4,5,5,0.00,0.628862
...,...,...,...,...
99995,876,174,0.50,0.293651
99996,709,248,1.00,0.273185
99997,38,1005,0.00,0.003830
99998,59,444,0.25,0.413451
