In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

# All paths are resolved relative to the current working directory of the kernel.
PATH_PROJ = Path.cwd()
PATH_DATA = PATH_PROJ/'data'
train = 'rsc15_train_full.txt'
test = 'rsc15_test.txt'
PATH_TO_TRAIN = PATH_DATA / train
PATH_TO_TEST = PATH_DATA / test

# Number of leading rows of the training file to load for this exploration.
N_TRAIN_ROWS = 10000

# `nrows` makes the parser stop after N_TRAIN_ROWS rows instead of parsing the whole
# tab-separated file and slicing afterwards; the resulting rows are the same leading rows.
df_train = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId': np.int64}, nrows=N_TRAIN_ROWS)

In [2]:
# Preview the first rows of the (truncated) training frame loaded above.
df_train.head()

Unnamed: 0,SessionId,ItemId,Time
0,1,214536502,1396835000.0
1,1,214536500,1396836000.0
2,1,214536506,1396836000.0
3,1,214577561,1396836000.0
4,2,214662742,1396847000.0


In [3]:
from modules.model import GRU4REC
# Names of the session / timestamp / item columns in the raw click log (see df_train.head()).
session_key = 'SessionId'
time_key = 'Time'
item_key = 'ItemId'

# GRU4REC.init_data returns the prepared frame plus two companion arrays.
# NOTE(review): click_offsets / session_idx_arr presumably hold per-session start offsets and
# the session processing order -- confirm against modules.model.
df, click_offsets, session_idx_arr = GRU4REC.init_data(df_train, session_key, time_key, item_key,
                                                       time_sort=False)

In [4]:
# Show df_train again after the GRU4REC.init_data call, for comparison with the earlier preview.
df_train.head()

Unnamed: 0,SessionId,ItemId,Time
0,1,214536502,1396835000.0
1,1,214536500,1396836000.0
2,1,214536506,1396836000.0
3,1,214577561,1396836000.0
4,2,214662742,1396847000.0


In [5]:
# Preview the frame returned by GRU4REC.init_data.
df.head()

Unnamed: 0,SessionId,ItemId,Time,iidx
0,1,214536502,1396835000.0,0
4,1,214536500,1396836000.0,1
5,1,214536506,1396836000.0,2
6,1,214577561,1396836000.0,3
7,2,214662742,1396847000.0,4


In [6]:
from modules.generator import *

In [7]:
def init_df(df, session_key, time_key, item_key, iids=None):
    '''
    Prepare a click-log dataframe for session-based processing.

    Involves the following 3 steps:
        1) Rename the session / time / item columns to 'sid' / 't' / 'iid'
        2) Add an integer item-index column named 'iidx'
        3) Sort the rows by session id, then by timestamp within each session

    Args:
        df: input dataframe containing the three key columns. It is not modified.
        session_key: name of the session identifier column in `df`
        time_key: name of the timestamp column in `df`
        item_key: name of the item identifier column in `df`
        iids: item ids that define the index space. Should be `None` if the df is a training
              set (the ids are then taken from `df` in order of first appearance), and should
              hold the training-set item ids if the df is a test set. Duplicate ids are
              ignored; the first occurrence determines the index.

    Returns:
        A new dataframe with the renamed columns plus 'iidx', sorted by ('sid', 't').
        Rows whose item id is not in `iids` are dropped (inner merge). The index is not reset
        after sorting.
    '''

    # Rename the session IDs, timestamps, and item IDs
    renamed = df.rename(columns={session_key: 'sid', time_key: 't', item_key: 'iid'})

    # Determine the unique item ids that span the index space.
    if iids is None:
        iids = renamed['iid'].unique()
    else:
        # De-duplicate caller-supplied ids so that every id maps to exactly one index.
        iids = pd.Index(iids).unique()

    # Position of an id within `iids` is its item index; build the lookup table directly.
    lookup = pd.DataFrame({'iid': iids, 'iidx': np.arange(len(iids))})

    # The inner merge attaches 'iidx' and discards clicks on items outside `iids`.
    merged = pd.merge(renamed, lookup, on='iid', how='inner')

    # Sort by session ID first, then by time, so that the clicks of a session are adjacent
    # and time-ordered within that session.
    return merged.sort_values(['sid', 't'])

In [10]:
# NOTE(review): `df_` is assigned in a later cell of this notebook (the `df_ = init_df(...)` cell),
# so on a fresh kernel with Run All this cell raises NameError -- move it below that cell.
df_.head()

Unnamed: 0,sid,iid,t,iidx
0,1,214536502,1396835000.0,0
4,1,214536500,1396836000.0,1
5,1,214536506,1396836000.0,2
6,1,214577561,1396836000.0,3
7,2,214662742,1396847000.0,4


In [12]:
# Rebuild the prepared frame with the notebook-local init_df defined in an earlier cell.
df_ = init_df(df_train, session_key, time_key, item_key)
# get_click_offsets / order_session_idx are not defined in this notebook; presumably they are
# provided by the `from modules.generator import *` cell -- TODO confirm.
co_ = get_click_offsets(df_)
sia_ = order_session_idx(df_)

In [None]:
# from modules.model import GRU4REC
# import torch

# session_key = 'SessionId'
# time_key = 'Time'
# item_key = 'ItemId'

# input_size = df_train[item_key].nunique()
# output_size = input_size
# hidden_size = 100
# num_layers = 1
# batch_size = 10
# dropout_input = 0
# dropout_hidden = .5

# optimizer_type = 'Adagrad'
# lr = .01
# weight_decay = 1e-6
# momentum = 0
# eps = 1e-6

# loss_type = 'TOP1'

# n_epochs = 5
# time_sort = False
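# n_samples = 100  # alternative: train on the first 100 rows of df_train only
# n_samples = -1   # -1 means "use all of df_train" (see train_data below)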
# use_cuda = False

# torch.manual_seed(7)
# torch.cuda.manual_seed(7)

# train_data = df_train[:n_samples] if n_samples != -1 else df_train

# model = GRU4REC(input_size, hidden_size, output_size, num_layers=num_layers, batch_size=batch_size,
#                 dropout_input=dropout_input, dropout_hidden=dropout_hidden, optimizer_type=optimizer_type, lr=lr,
#                 weight_decay=weight_decay, momentum=momentum, eps=eps, loss_type=loss_type,
#                 time_sort=time_sort, use_cuda=use_cuda)

# model.train(train_data, session_key, time_key, item_key, n_epochs=n_epochs)