In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
import tensorflow as tf
import keras
import keras.backend as K
from keras.models import Model,Sequential
from keras.utils import to_categorical
from keras.callbacks import ModelCheckpoint
from keras.losses import categorical_crossentropy
from keras.layers import Input, Dense, Dropout, GRU, Embedding
from tensorflow.keras.optimizers import Adam

ModuleNotFoundError: No module named 'tensorflow'

# Session data

In [None]:
class SessionDataset:   
    def __init__(self, data, session_key='SESSIONID', item_key='NAVIGATIONCODE', time_key='TIMESTAMP', n_samples=-1, itemmap=None, time_sort=False) :
        """
        Args:
            path: path of the csv file
            sep: separator for the csv
            session_key, item_key, time_key: name of the fields corresponding to the sessions, items, time
            n_samples: the number of samples to use. If -1, use the whole dataset.
            itemmap: mapping between item IDs and item indices
            time_sort: whether to sort the sessions by time or not
        """
        self.df = data
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort
        self.add_item_indices(itemmap=itemmap)
        self.df.sort_values([self.session_key, self.time_key], inplace=True)

        # Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        # clicks within a session are next to each other, where the clicks within a session are time-ordered.

        self.click_offsets = self.get_click_offsets()
        self.session_idx_arr = self.order_session_idx()
        
    def get_click_offsets(self):
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        # group & sort the df by session_key and get the offset values
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()

        return offsets

    def order_session_idx(self):
        """ Order the session indices """
        if self.time_sort:
            # starting time for each sessions, sorted by session IDs
            sessions_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.session_key].nunique())

        return session_idx_arr
    
    def add_item_indices(self, itemmap=None):
        """ 
        Add item index column named "item_idx" to the df
        Args:
            itemmap (pd.DataFrame): mapping between the item Ids and indices
        """
        if itemmap is None:
            item_ids = self.df[self.item_key].unique()  # unique item ids
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            itemmap = pd.DataFrame({self.item_key:item_ids,
                                   'item_idx':item2idx[item_ids].values})
        
        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on=self.item_key, how='inner')
        
    @property    
    def items(self):
        return self.itemmap.NAVIGATIONCODE.unique()


In [None]:
class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""    
    def __init__(self, dataset, batch_size=32):
        """
        A class for creating session-parallel mini-batches.
        Args:
            dataset (SessionDataset): the session dataset to generate the batches from
            batch_size (int): size of the batch
        """
        self.dataset = dataset
        self.batch_size = batch_size
        self.done_sessions_counter = 0
        
    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        df = self.dataset.df
        session_key = 'session_id'
        item_key = 'song_id'
        time_key = 'ts'
        self.n_items = df[item_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = [] # indicator for the sessions to be terminated
        finished = False        

        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first sessions start
            #print(df.TIMESTAMP.values[start])
            idx_target = df.item_idx.values[start]
           
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]               
                inp = idx_input
                target = idx_target

                yield inp, target, mask
                
            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            self.done_sessions_counter = len(mask)
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

In [None]:
class GRU4Rec:

    def __init__(self,  epochs = 5,
                        batch_size = 32,
                        dropout = 0.25,
                        learning_rate = 0.001,
                        decay=0.0,
                        beta_1=0.9,
                        beta_2=0.999,
                        session_key = 'session_id',
                        item_key = 'song_id',
                        time_key = 'ts',
                        n_samples = -1,
                        itemmap = None,
                        time_sort = False,
                        emb_size = 50,
                        hidden_units = 100,
                        save_weights = False,
                        train_n_items = 96 ):

        self.epochs = epochs
        self.batch_size = batch_size
        self.dropout = dropout
        self.learning_rate = learning_rate
        self.decay = decay
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.n_samples= n_samples
        self.itemmap= itemmap
        self.time_sort = time_sort
        self.train_n_items = train_n_items
        self.emb_size = emb_size
        self.hidden_units = hidden_units
        self.save_weights = save_weights
        self.current_session_id= -1
        self.Build_Model()


        
    def Build_Model(self):   
        size = self.emb_size
        inputs = Input(batch_shape=(self.batch_size, 1, self.train_n_items))
        gru, gru_states = GRU(self.hidden_units, stateful=True, return_state=True)(inputs)
        drop2 = Dropout(self.dropout)(gru)
        predictions = Dense(self.train_n_items, activation='softmax')(drop2)
        self.model = Model(inputs=inputs, outputs=[predictions])
        opt = keras.optimizers.Adam(lr=self.learning_rate, beta_1=self.beta_1, beta_2=self.beta_2, decay=self.decay, amsgrad=False)
        self.model.compile(loss=categorical_crossentropy, optimizer=opt)
        self.model.summary()

    
    def get_states(self):
        return [K.get_value(s) for s,_ in self.model.state_updates]
    
    def fit(self,train_data=None):

        train_samples_qty = len(train_data[self.session_key].unique() )+1
        print('Fitting the model...')
        self.train_dataset = SessionDataset(train_data)
        model_to_train = self.model
        batch_size = self.batch_size
        for epoch in range(1, self.epochs+1):
            with tqdm(total=train_samples_qty) as pbar:
                loader = SessionDataLoader(self.train_dataset, batch_size=self.batch_size)
                for feat, target, mask in loader:
                    
                    real_mask = np.ones((self.batch_size, 1))
                    for elt in mask:
                        real_mask[elt, :] = 0

                    hidden_states = self.get_states()[0]
                    hidden_states = np.multiply(real_mask, hidden_states)
                    hidden_states = np.array(hidden_states, dtype=np.float32)
                    self.model.layers[1].reset_states(hidden_states)

                    input_oh = to_categorical(feat, num_classes=loader.n_items) 
                    input_oh = np.expand_dims(input_oh, axis=1)
                    target_oh = to_categorical(target, num_classes=loader.n_items)
                   
                    tr_loss = self.model.train_on_batch(input_oh, target_oh)
                    
                    pbar.set_description("Epoch {0}. Loss: {1:.5f}".format(epoch, tr_loss))
                    pbar.update(loader.done_sessions_counter)
                
            if self.save_weights:
                print("Saving weights...")
                self.model.save('./GRU4REC_{}.h5'.format(epoch))
                
    def predict_next(self, session_id, item):

        if session_id != self.current_session_id : 
            self.model.reset_states()
            self.current_session_id = session_id

        feat = np.zeros((self.batch_size,1))
        feat[0] = item

        input_oh  = to_categorical(feat,  num_classes=self.train_n_items) 
        input_oh = np.expand_dims(input_oh, axis=1)
        preds = self.model.predict(input_oh, batch_size=self.batch_size)
        itemid = preds.argsort()[0][::-1]
        return pd.DataFrame(data=preds[0][itemid], index=itemid)
        
    def Evaluate(self,test_data=None ,recall_k=5, mrr_k=5):
        train_generator_map = self.train_dataset.itemmap
        test_dataset = SessionDataset(test_data, itemmap=train_generator_map)
        test_generator = SessionDataLoader(test_dataset, batch_size=self.batch_size)

        n = 0
        rec_sum = 0
        mrr_sum = 0

        print("Evaluating model...")
        for feat, label, mask in test_generator:

            target_oh = to_categorical(label, num_classes=self.train_n_items)
            input_oh  = to_categorical(feat,  num_classes=self.train_n_items) 
            input_oh = np.expand_dims(input_oh, axis=1)
            
            pred = self.model.predict(input_oh, batch_size=self.batch_size)

            for row_idx in range(feat.shape[0]):
                pred_row = pred[row_idx] 
                label_row = target_oh[row_idx]

                rec_idx =  pred_row.argsort()[-recall_k:][::-1]
                mrr_idx =  pred_row.argsort()[-mrr_k:][::-1]
                tru_idx = label_row.argsort()[-1:][::-1]

                n += 1

                if tru_idx[0] in rec_idx:
                    rec_sum += 1

                if tru_idx[0] in mrr_idx:
                    mrr_sum += 1/int((np.where(mrr_idx == tru_idx[0])[0]+1))

        recall = rec_sum/n
        mrr = mrr_sum/n
        return (recall, recall_k), (mrr, mrr_k)

    def save(self,name='GRU4REC_NEW'):
        self.model.save('./'+name+'.h5')


In [95]:
train = pd.read_csv('train.csv')

In [96]:
test = pd.read_csv('test.csv')

In [97]:
def preprocess_dataframe(df):
    df = df.assign(sequence=df.sequence.str.strip('[]').str.split(','))
    df = df.explode('sequence')
    df['sequence'] = df.sequence.apply(lambda st: re.sub(r'\W+', '', st))
    df['sequence'] = df['sequence'].astype(int)
    return df

In [98]:
preprocess_dataframe(test).to_csv('test.csv', index=False)
preprocess_dataframe(train).to_csv('train.csv', index=False)

In [61]:
train = train.assign(sequence=train.sequence.str.strip('[]').str.split(','))
train

Unnamed: 0,session_id,sequence,ts,user_id
0,223,"['3772', '3953']",1419418147,15861
1,226,"['245', '1271', '379']",1419433841,15861
2,243,"['245', '1197', '4307', '3868']",1421674741,15861
3,245,"['409', '234', '2334', '2431', '231', '47...",1421679507,15861
4,353,"['4255', '652', '4256', '4257', '4256', '...",1420927951,4296
...,...,...,...,...
48063,2764044,"['3051', '7182', '310']",1421233642,4503
48064,2764047,"['1871', '1475', '577', '3152', '1075', '...",1421319245,4503
48065,2764159,"['528', '6475']",1421059220,12934
48066,2764161,"['6349', '2803']",1421141469,12934


In [68]:
pip install re

Note: you may need to restart the kernel to use updated packages.


ERROR: Could not find a version that satisfies the requirement re (from versions: none)
ERROR: No matching distribution found for re
You should consider upgrading via the 'c:\users\ilyes\appdata\local\programs\python\python38\python.exe -m pip install --upgrade pip' command.


In [70]:
import re
train.explode('sequence').sequence.apply(lambda st: re.sub(r'\W+', '', st)).astype(int)

0        3772
0        3953
1         245
1        1271
1         379
         ... 
48067    1485
48067    5733
48067    1482
48067    2445
48067     915
Name: sequence, Length: 207214, dtype: int32