In [1]:
import pandas as pd
import numpy as np

In [5]:
# Display the metrics CSV stored under logs/lightning_logs/version_1.
# The output below has validation-metric rows and train_loss rows interleaved,
# each leaving the other's columns empty.
pd.read_csv('logs/lightning_logs/version_1/metrics.csv')

Unnamed: 0,val_HR@5,val_NDCG@5,val_MRR@5,val_HR@10,val_NDCG@10,val_MRR@10,val_HR@20,val_NDCG@20,val_MRR@20,epoch,step,train_loss
0,0.060105,0.041397,0.042704,0.087249,0.049956,0.042704,0.142564,0.064117,0.042704,0,1091,
1,,,,,,,,,,0,1091,6.54175
2,0.084968,0.056759,0.056998,0.128536,0.07071,0.056998,0.183508,0.084537,0.056998,1,2183,
3,,,,,,,,,,1,2183,5.93877
4,0.088504,0.058479,0.058949,0.133896,0.073,0.058949,0.197194,0.088956,0.058949,2,3275,
5,,,,,,,,,,2,3275,5.712047
6,0.096715,0.064412,0.065046,0.145187,0.079893,0.065046,0.217495,0.09807,0.065046,3,4367,
7,,,,,,,,,,3,4367,5.503188
8,0.106182,0.070787,0.071798,0.160812,0.088318,0.071798,0.24099,0.108425,0.071798,4,5459,
9,,,,,,,,,,4,5459,5.342911


In [3]:
# Column and table names shared by the preprocessing code below.
# Note that item_seq_field and inter_table are the same string ('interactions'):
# one names a column, the other names a table key in MIND_configs.
uid_field = 'user_id'
iid_field = 'item_id'
item_text_field = 'item_text'
item_seq_field = 'interactions'
inter_table = 'interactions'
item_table = 'items'
# Fixed sequence length passed to TextSeqRecDataset further down.
seq_len = 20

# Per-table loading options consumed by DataPreprocessor.load_data.
# Each entry is forwarded to pd.read_csv (filepath, field_separator, header,
# usecols, filed_type -> dtype) and then the columns are renamed via rename_cols.
MIND_configs = {
    inter_table: {
        'filepath': 'MIND-small/behaviors.csv',
        'field_separator': '\t',
        # Separator used to split each cell of the token_seq_fields columns.
        'seq_separator': ' ',
        'header': 0,
        'usecols': ['userid', 'behaviors'],
        'rename_cols': {
            'userid': uid_field,
            'behaviors': item_seq_field
        },
        # NOTE: the key is spelled 'filed_type' (sic); load_data reads exactly
        # this key, so the spelling must stay in sync with it.
        'filed_type': {
            'userid': str,
            'behaviors': str
            },
        # Columns (post-rename names) whose cells are split into token arrays.
        'token_seq_fields': [item_seq_field],
        # Length bounds read in DataPreprocessor.__init__ and handed to
        # filter_item_seq_by_num.
        'max_item_seq_length': None,
        'min_item_seq_length': 5,
        },
    item_table: {
        'filepath': 'MIND-small/news.csv',
        'field_separator': '\t',
        'header': 0,
        'usecols': ['newid', 'title'],
        'filed_type': {
            'newid': str,
            'title': str
            },
        # NOTE(review): 'title' is renamed to item_seq_field ('interactions'),
        # not item_text_field. DataPreprocessor.map_item_ID reads the item text
        # from the column this rename produces, so change the two together.
        'rename_cols': {
            'newid': iid_field,
            'title': item_seq_field
            },
        }
    }

In [None]:
  user_id                                       interactions
0  U65916  [19012, 44697, 30555, 28950, 18562, 19292, 247...
1  U49985  [33251, 34121, 32633, 9813, 23038, 7016, 9440,...
2  U25550  [20620, 25940, 24563, 12976, 11791, 35050, 245...
3  U19710  [30053, 3965, 35238, 29936, 35346, 26608, 1916...
4  U38106                [17314, 18160, 27298, 44397, 39131]
   item_id                                          item_text
0        1  The Brands Queen Elizabeth, Prince Charles, an...
1        2  Dispose of unwanted prescription drugs during ...
2        3  The Cost of Trump's Aid Freeze in the Trenches...
3        4  I Was An NBA Wife. Here's How It Affected My M...
4        5  How to Get Rid of Skin Tags, According to a De...

In [4]:
class DataPreprocessor:
    """Load the interaction and item tables, remap item tokens to ids, and split.

    Construction runs the pipeline in this order:

    1. ``load_data`` reads every table described in ``data_configs``.
    2. ``map_item_ID`` replaces raw item tokens with consecutive integer ids
       (``0`` is reserved for ``'[PAD]'``) in both tables.
    3. ``filter_item_seq_by_num`` drops interaction rows whose sequence length
       is outside the configured bounds.
    4. ``ratio_split`` splits the remaining sequences 0.8 / 0.1 / 0.1.

    Args:
        uid_field: Name of the user-id column in the interaction table.
        iid_field: Name of the item-id column in the item table.
        item_text_field: Name of the item-text column in the item table.
        item_seq_field: Name of the item-sequence column in the interaction table.
        inter_table: Key of the interaction table in ``data_configs``.
        item_table: Key of the item table in ``data_configs``.
        data_configs: Mapping ``table name -> loading options`` (see ``load_data``).
        random_state: Optional seed forwarded to ``ratio_split`` so the split is
            reproducible. ``None`` (default) keeps the split unseeded.

    Attributes:
        lookup_df: dict mapping table name to its ``pandas.DataFrame``.
        item_token_id: dict mapping raw item token to integer id.
        item_id_token: array mapping integer id to raw item token.
        item_id_text: array mapping integer id to item text.
        train_data, valid_data, test_data: arrays of item-id sequences.
    """

    def __init__(
        self,
        uid_field,
        iid_field,
        item_text_field,
        item_seq_field,
        inter_table,
        item_table,
        data_configs,
        random_state=None,
        ) -> None:
        self.uid_field = uid_field
        self.iid_field = iid_field
        self.item_text_field = item_text_field
        self.item_seq_field = item_seq_field
        self.inter_table = inter_table
        self.item_table = item_table
        self.data_configs = data_configs
        self.lookup_df = self.load_data(data_configs)

        # De-duplication stays disabled, as before; call self.drop_duplicates()
        # here (before the id mapping) to enable it.

        item_token_id, item_id_token, item_id_text = self.map_item_ID()
        self.item_token_id: dict = item_token_id
        self.item_id_token: list = item_id_token
        self.item_id_text: list = item_id_text

        inter_cfg = data_configs[self.inter_table]
        _max = inter_cfg.get('max_item_seq_length')
        _min = inter_cfg.get('min_item_seq_length')
        self.filter_item_seq_by_num(_min, _max)

        self.train_data, self.valid_data, self.test_data = \
            self.ratio_split(random_state)

    @property
    def min_item_seq_len(self):
        """Length of the shortest sequence currently in the interaction table."""
        return self.lookup_df[self.inter_table][self.item_seq_field].apply(len).min()

    @property
    def max_item_seq_len(self):
        """Length of the longest sequence currently in the interaction table."""
        return self.lookup_df[self.inter_table][self.item_seq_field].apply(len).max()

    def load_data(self, datatables_config):
        """Read every table described in ``datatables_config``.

        Each table config provides ``filepath``, ``field_separator``, ``header``,
        ``usecols`` and ``filed_type`` (sic, forwarded as ``dtype``) for
        ``pd.read_csv``. Optional ``rename_cols`` renames columns afterwards.
        Optional ``token_seq_fields`` lists (post-rename) columns whose string
        cells are split on ``seq_separator`` into arrays of non-empty tokens.

        Args:
            datatables_config: Mapping ``table name -> table config``.

        Returns:
            dict mapping table name to the loaded ``pandas.DataFrame``.
        """
        lookup_df = {}
        # Use the argument rather than self.data_configs so the method honours
        # whatever configuration the caller passes in.
        for table_name, cfg in datatables_config.items():
            table = pd.read_csv(
                cfg['filepath'],
                delimiter=cfg['field_separator'],
                header=cfg['header'],
                usecols=cfg['usecols'],
                dtype=cfg['filed_type'],
                encoding='utf-8',
                engine='python'
            )
            table = table.rename(columns=cfg.get('rename_cols', {}))

            for field in cfg.get('token_seq_fields', []):
                separator = cfg['seq_separator']
                # Drop empty tokens produced by repeated separators.
                table[field] = [
                    np.array([token for token in seq.split(separator) if token])
                    for seq in table[field].values
                ]
            lookup_df[table_name] = table
        return lookup_df

    def drop_duplicates(self):
        """Drop repeated (user, sequence) rows and repeated item ids.

        The first occurrence of each duplicate is kept. Sequences are stored as
        arrays, which are not hashable, so they are compared as tuples instead of
        going through ``DataFrame.drop_duplicates`` on that column.
        """
        inter_df = self.lookup_df[self.inter_table]
        seen = set()
        keep = []
        for uid, seq in zip(inter_df[self.uid_field], inter_df[self.item_seq_field]):
            key = (uid, tuple(np.atleast_1d(seq).tolist()))
            keep.append(key not in seen)
            seen.add(key)
        self.lookup_df[self.inter_table] = inter_df[np.array(keep, dtype=bool)]

        self.lookup_df[self.item_table] = self.lookup_df[self.item_table].drop_duplicates(
            subset=[self.iid_field]
        )

    def filter_item_seq_by_num(self, _min, _max):
        """Keep only interaction rows with ``_min <= len(sequence) <= _max``.

        Either bound may be ``None``, meaning that side is unbounded. When both
        are ``None`` the table is left untouched.

        Args:
            _min: Minimum sequence length (must be > 0 when given), or ``None``.
            _max: Maximum sequence length, or ``None``.

        Raises:
            AssertionError: If ``_min`` is given and is not greater than 0.
        """
        assert _min is None or _min > 0, 'min_item_seq_length must be greater than 0'
        # Filter when at least one bound is given; requiring both would silently
        # ignore a lone minimum (or maximum).
        if _min is None and _max is None:
            return
        lower = 0 if _min is None else _min
        upper = float('inf') if _max is None else _max

        inter_df = self.lookup_df[self.inter_table]
        lengths = np.array(
            [len(seq) for seq in inter_df[self.item_seq_field]], dtype=np.int64
        )
        self.lookup_df[self.inter_table] = inter_df[(lengths >= lower) & (lengths <= upper)]

    def map_item_ID(self):
        """Replace raw item tokens with integer ids in both tables.

        Ids are assigned by first appearance, item table first and then the
        interaction sequences, starting at 1; id 0 is reserved for ``'[PAD]'``.
        The item-id column of the item table and the sequence column of the
        interaction table are overwritten with the new ids.

        Returns:
            tuple ``(item_token_id, item_id_token, item_id_text)``:
            a dict token -> id, an array id -> token, and an array id -> text.
            Ids that have no row in the item table get an empty-string text.
        """
        item_df = self.lookup_df[self.item_table]
        inter_df = self.lookup_df[self.inter_table]

        raw_item_ids = item_df[self.iid_field].to_numpy()
        sequences = list(inter_df[self.item_seq_field])
        seq_lengths = [len(seq) for seq in sequences]
        if sequences:
            flat_inter_tokens = np.concatenate(sequences)
        else:
            flat_inter_tokens = np.array([], dtype=object)

        n_items = len(raw_item_ids)
        all_tokens = np.concatenate([raw_item_ids, flat_inter_tokens])
        codes, uniques = pd.factorize(all_tokens)
        codes = codes + 1  # shift so that 0 is free for '[PAD]'
        item_tab_new_ids = codes[:n_items]
        inter_tab_new_ids = codes[n_items:]

        item_id_token = np.array(['[PAD]'] + list(uniques))
        item_token_id = {token: idx for idx, token in enumerate(item_id_token)}

        item_df[self.iid_field] = item_tab_new_ids
        if sequences:
            boundaries = np.cumsum(seq_lengths)[:-1]
            inter_df[self.item_seq_field] = np.split(inter_tab_new_ids, boundaries)
        else:
            inter_df[self.item_seq_field] = []

        # Prefer the configured text column. Fall back to item_seq_field for
        # configs that rename the text column to that name in the item table.
        if self.item_text_field in item_df.columns:
            text_column = self.item_text_field
        else:
            text_column = self.item_seq_field

        # Place each text at its id explicitly instead of relying on row order,
        # so duplicate item rows or items seen only in interactions cannot
        # shift the table.
        item_id_text = np.full(len(item_id_token), '', dtype=object)
        item_id_text[item_tab_new_ids] = item_df[text_column].to_numpy()
        item_id_text[0] = '[PAD]'
        return item_token_id, item_id_token, item_id_text

    def ratio_split(self, random_state=None):
        """Fixed ratio split of the sequences (train:0.8, valid:0.1, test:0.1).

        Args:
            random_state: Optional seed forwarded to ``DataFrame.sample`` for both
                sampling steps. ``None`` (default) leaves the split unseeded.

        Returns:
            tuple ``(train_data, valid_data, test_data)`` of arrays holding the
            item sequences of each split.
        """
        inter_df = self.lookup_df[self.inter_table]
        train_df = inter_df.sample(frac=0.8, random_state=random_state)
        rest_df = inter_df[~inter_df.index.isin(train_df.index)]
        valid_df = rest_df.sample(frac=0.5, random_state=random_state)
        test_df = rest_df[~rest_df.index.isin(valid_df.index)]

        train_data = train_df[self.item_seq_field].values
        valid_data = valid_df[self.item_seq_field].values
        test_data = test_df[self.item_seq_field].values

        return train_data, valid_data, test_data

    def convert_id_to_text(self, data):
        """Map each sequence of item ids to the corresponding item texts.

        Args:
            data: Iterable of integer id sequences (valid indices into
                ``self.item_id_text``).

        Returns:
            When all sequences have the same length, a 2-D array of texts
            (as before). Otherwise a 1-D object array with one text array per
            sequence, since ragged rows cannot form a rectangular array.
        """
        item_seq_text = [self.item_id_text[seq] for seq in data]
        if len({len(texts) for texts in item_seq_text}) <= 1:
            return np.array(item_seq_text)

        ragged = np.empty(len(item_seq_text), dtype=object)
        for row, texts in enumerate(item_seq_text):
            ragged[row] = texts
        return ragged
        
    

In [5]:
# Build the preprocessor for the MIND tables; keyword arguments make the
# field/table wiring explicit.
dataprep = DataPreprocessor(
    uid_field=uid_field,
    iid_field=iid_field,
    item_text_field=item_text_field,
    item_seq_field=item_seq_field,
    inter_table=inter_table,
    item_table=item_table,
    data_configs=MIND_configs,
)

In [6]:
# Peek at the first training sequence (an array of remapped integer item ids).
dataprep.train_data[0]

array([14167, 29018, 18810, 49172, 39770, 35654, 34987], dtype=int64)

In [7]:
import numpy as np
from torch.utils.data import Dataset

class TextSeqRecDataset(Dataset):
    """Fixed-length next-item dataset built from variable-length id sequences.

    Every input sequence ``[i1, ..., in]`` becomes an input row ``[i1, ..., i(n-1)]``
    and a target row ``[i2, ..., in]``, both right-padded with ``padding_idx`` to
    ``seq_len``. Sequences with more than ``seq_len`` items keep only their most
    recent ``seq_len`` (input, target) pairs.

    Args:
        data: Sequence of 1-D integer id sequences.
        seq_len: Length of every row in the produced arrays.
        padding_idx: Id used for padded positions (default 0).

    Attributes:
        item_seqs: ``(len(data), seq_len)`` int64 array of input ids.
        targets: ``(len(data), seq_len)`` int64 array of next-item ids.
        seq_masks: boolean array, True where ``item_seqs`` is not padding.
    """

    def __init__(
        self,
        data,
        seq_len,
        padding_idx=0,
        ):
        self._len = len(data)
        self.seq_len = seq_len
        self.padding_idx = padding_idx
        self.item_seqs, self.targets = self._right_padding_left_trancate(data, self.seq_len)
        self.seq_masks = self._get_masks(self.item_seqs)

    def _right_padding_left_trancate(self, data, seq_len):
        """Generate item seqs like [1, 2, 3, 0, 0] and targets like [2, 3, 4, 0, 0].

        Returns:
            tuple ``(item_seqs, targets)`` of int64 arrays shaped
            ``(len(data), seq_len)``.
        """
        shape = (len(data), seq_len)
        # Pad with padding_idx (not a hard-coded 0) so the masks computed from
        # padding_idx stay correct for non-zero padding ids.
        item_seqs = np.full(shape, self.padding_idx, dtype=np.int64)
        targets = np.full(shape, self.padding_idx, dtype=np.int64)
        for row, seq in enumerate(data):
            n_pairs = len(seq) - 1
            if n_pairs < 1:
                # Empty or single-item sequences yield no (input, target) pair;
                # leave the row fully padded instead of failing on the slice.
                continue
            if n_pairs >= seq_len:
                item_seqs[row] = seq[-seq_len - 1:-1]
                targets[row] = seq[-seq_len:]
            else:
                item_seqs[row, :n_pairs] = seq[:-1]
                targets[row, :n_pairs] = seq[1:]

        return item_seqs, targets

    def _get_masks(self, data):
        """Return a boolean array that is True where ``data`` is not padding."""
        return data != self.padding_idx

    def __len__(self):
        return self._len

    def __getitem__(self, idx):
        """Return ``(item_seq, target, seq_mask)`` for row ``idx``."""
        item_seq = self.item_seqs[idx]
        target = self.targets[idx]
        seq_mask = self.seq_masks[idx]
        return item_seq, target, seq_mask

In [8]:
# Wrap each split in a dataset whose rows are padded/truncated to seq_len.
train_dataset, valid_dataset, test_dataset = (
    TextSeqRecDataset(split, seq_len)
    for split in (dataprep.train_data, dataprep.valid_data, dataprep.test_data)
)

In [9]:
from torch.utils.data import DataLoader
# Shuffled loader over the training split, two rows per batch.
dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [10]:
# Dataset items are (item_seq, target, seq_mask): the inputs are discarded,
# k holds the batch of targets and v the batch of masks.
_, k, v = next(iter(dataloader))

In [11]:
# Target positions that differ from 0, the dataset's default padding_idx.
k != 0

tensor([[ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         False, False, False, False, False, False, False, False, False, False],
        [ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
          True,  True,  True, False, False, False, False, False, False, False]])

In [12]:
# Number of True mask entries per row of the batch.
v.sum(1)

tensor([10, 13])

In [13]:
import torch
# torchmetrics provides the metric functions used below
import torchmetrics

# simulate a classification problem: 10 samples, 5 classes,
# each row of preds is a softmax distribution over the classes
preds = torch.randn(10, 5).softmax(dim=-1)
target = torch.randint(5, (10,))

# NOTE(review): newer torchmetrics releases appear to require a `task`
# argument here — confirm against the installed version.
acc = torchmetrics.functional.accuracy(preds, target)

In [14]:
from torchmetrics import RetrievalHitRate
# Toy check of RetrievalHitRate: `indexes` assigns each prediction to a query
# (row 0 -> query 0, row 1 -> query 1).
indexes = torch.tensor([[0, 0, 0, 0], [1, 1, 1, 1]])
preds = torch.tensor([[0.2, 0.3, 0.5, 0.6], [0.1, 0.3, 0.5, 0.2]])
target = torch.tensor([[True, False, False, False], [False, True, False, True]])
# With k=2, query 0's two highest scores (0.6, 0.5) are both non-relevant, while
# query 1's two highest (0.5, 0.3) include a relevant item, which is consistent
# with the 0.5 shown below.
hr2 = RetrievalHitRate(k=2)
hr2(preds, target, indexes=indexes)

tensor(0.5000)

In [19]:
from torchmetrics import RetrievalHitRate

# Evaluate HR@5/10/20 on random scores over the full item vocabulary, with one
# randomly chosen relevant item per row.
batch_size = 16
vocab_size = len(dataprep.item_token_id)

preds = torch.softmax(torch.randn(batch_size, vocab_size), dim=-1)
# One query id per row, repeated along the vocabulary axis -> (batch_size, vocab_size)
indexes = torch.arange(batch_size).unsqueeze(1).expand(batch_size, vocab_size).to(preds.device)
# Relevant item id for every row -> (batch_size, 1)
target_id = torch.randint(0, vocab_size, (batch_size, 1)).to(preds.device)
# One-hot relevance matrix -> (batch_size, vocab_size)
target = torch.zeros(batch_size, vocab_size, dtype=torch.bool).scatter_(1, target_id, 1).to(preds.device)

hr5, hr10, hr20 = (RetrievalHitRate(k=cutoff) for cutoff in (5, 10, 20))

print(*(metric(preds, target, indexes=indexes) for metric in (hr5, hr10, hr20)))

tensor(0.) tensor(0.) tensor(0.)
