In [1]:
import os
import sys
import math
import psutil
import sklearn
import humanize
import warnings
import subprocess
import numpy as np
import pandas as pd
import GPUtil as GPU
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.metrics import recall_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib

In [2]:
import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

In [3]:
import keras
import keras.backend as K
from keras.utils import to_categorical
from keras.models import Model, Sequential
from keras.callbacks import ModelCheckpoint
from keras.initializers import glorot_uniform
from keras.layers.core import Permute, Reshape, RepeatVector
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding, concatenate, Lambda, multiply, merge, Flatten

Using TensorFlow backend.


In [4]:
class SessionDataset:
    """Credit to yhs-968/pyGRU4REC."""    
    def __init__(self, data, sep='\t', user_key='UserId', movie_key='MovieId', time_key='Time', n_samples=-1, movie_map=None, time_sort=False):
        """
        Args:
            data: 讀取好的dataframe
            sep: separator for the csv
            user_key, movie_key, time_key: name of the fields corresponding to the users, movies, time
            n_samples: the number of samples to use. If -1, use the whole dataset.
            movie_map: mapping between movie IDs and movie indices
            time_sort: whether to sort the users by time or not
        """
        self.df = data
        self.user_key = user_key
        self.movie_key = movie_key
        self.time_sort = time_sort #False
        self.add_item_indices(movie_map=movie_map)
        self.df.sort_values([user_key, time_key], inplace=True)

        #Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        #clicks within a session are next to each other, where the clicks within a session are time-ordered.

        self.click_offsets = self.get_click_offsets()
        self.session_idx_arr = self.order_session_idx()
        
    def get_click_offsets(self):
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """
        offsets = np.zeros(self.df[self.user_key].nunique() + 1, dtype=np.int32)
        # group & sort the df by user_key and get the offset values
        offsets[1:] = self.df.groupby(self.user_key).size().cumsum()

        return offsets

    def order_session_idx(self):
        """ Order the session indices """
        if self.time_sort:
            # starting time for each users, sorted by session IDs
            users_start_time = self.df.groupby(self.user_key)[self.time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(users_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.user_key].nunique())

        return session_idx_arr
    
    def add_item_indices(self, movie_map=None):
        """ 
        Add item index column named "item_idx" to the df
        Args:
            movie_map (pd.DataFrame): mapping between the item Ids and indices
        """
        if movie_map is None:
            item_ids = self.df[self.movie_key].unique()  # unique item ids
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            movie_map = pd.DataFrame({self.movie_key:item_ids,
                                   'item_idx':item2idx[item_ids].values})
        
        self.movie_map = movie_map
        self.df = pd.merge(self.df, self.movie_map, on=self.movie_key, how='inner')
        
    @property    
    def items(self):
        return self.movie_map.MovieId.unique()

In [5]:
def create_model():   
    emb_size = 50
    hidden_units = 100
    size = emb_size

    inputs = Input(batch_shape=(batch_size, 1, num_movies))
    gru, gru_states = CuDNNGRU(hidden_units, stateful=True, return_state=True)(inputs)# drop1) #
    drop2 = Dropout(0.25)(gru)
    predictions = Dense(num_movies, activation='softmax')(drop2)
    
    model = Model(input=inputs, output=[predictions])
    
    opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss=categorical_crossentropy, optimizer=opt)
    model.summary()

    filepath='DwellTimeModel_checkpoint.h5'
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
    callbacks_list = []
    return model


def get_states(model):
    return [K.get_value(s) for s,_ in model.state_updates]


def set_states(model, states):
    for (d,_), s in zip(model.state_updates, states):
        K.set_value(d, s)


def get_recall(model, loader, epoch, train_generator_map, recall_k=20):

    test_dataset = SessionDataset(test_data, movie_map=train_generator_map)
    test_generator = SessionDataLoader(test_dataset, batch_size=batch_size)

    n = 0
    suma = 0
    suma_baseline = 0

    for feat, label, mask in test_generator:

        input_oh = to_categorical(feat, num_classes=loader.num_movies) 
        input_oh = np.expand_dims(input_oh, axis=1)
        target_oh = to_categorical(label, num_classes=loader.num_movies)
        pred = model.predict(input_oh, batch_size=batch_size)

        if n%100 == 0:
            try:
                print("{}:{}".format(n, suma/n))
            except:
                pass

        for row_idx in range(feat.shape[0]):
            pred_row = pred[row_idx] 
            label_row = target_oh[row_idx]

            idx1 = pred_row.argsort()[-recall_k:][::-1]
            idx2 = label_row.argsort()[-1:][::-1]

            n += 1
            if idx2[0] in idx1:
                suma += 1

    print("Recall@{} epoch {}: {}".format(recall_k, epoch, suma/n))


def get_mrr(model, loader,epoch,train_generator_map, mrr_k=20):

    test_dataset = SessionDataset(test_data, movie_map = train_generator_map)
    test_generator = SessionDataLoader(test_dataset, batch_size=batch_size)

    n = 0
    suma = 0
    suma_baseline = 0

    for feat, label, mask in test_generator:
        input_oh = to_categorical(feat, num_classes=loader.num_movies) 
        input_oh = np.expand_dims(input_oh, axis=1)
        target_oh = to_categorical(label, num_classes=loader.num_movies)
        pred = model.predict(input_oh, batch_size=batch_size)

        if n%100 == 0:
            try:
                print("{}:{}".format(n, suma/n))
            except:
                pass

        for row_idx in range(feat.shape[0]):
            pred_row = pred[row_idx] 
            label_row = target_oh[row_idx]

            idx1 = pred_row.argsort()[-mrr_k:][::-1]
            idx2 = label_row.argsort()[-1:][::-1]

            n += 1
            if idx2[0] in idx1:
                suma += 1/int((np.where(idx1 == idx2[0])[0]+1))        

    print("MRR@{} epoch {}: {}".format(mrr_k, epoch, suma/n))


def train_model(model, save_weights = False, path_to_weights = True):
    train_dataset = SessionDataset(train_data)
    
    print("train_dataset \n", train_dataset)

    model_to_train = model

    with tqdm(total=train_samples_qty) as pbar: #顯示處理進度條，每一個user處理一次
        for epoch in range(1, 10):  #跑9次
            
            if path_to_weights: # 
                loader = SessionDataLoader(train_dataset, batch_size=batch_size)
                
                
            for feat, target, mask in loader:
                #print("feat", feat)
                #print("target", target)
                #print("mask", mask)

                input_oh = to_categorical(feat, num_classes=loader.num_movies) 
                input_oh = np.expand_dims(input_oh, axis=1)
                #print("input_oh", input_oh)

                target_oh = to_categorical(target, num_classes=loader.num_movies)
                #print("target_oh", target_oh)
                
                tr_loss = model_to_train.train_on_batch(input_oh, target_oh)
                #print("tr_loss", tr_loss)

                real_mask = np.ones((batch_size, 1))
                for elt in mask:
                    real_mask[elt, :] = 0

                hidden_states = get_states(model_to_train)[0]
                hidden_states = np.multiply(real_mask, hidden_states)
                hidden_states = np.array(hidden_states, dtype=np.float32)
                #print("hidden_states", hidden_states)
                
                model_to_train.layers[1].reset_states(hidden_states)

                pbar.set_description("Epoch {0}. Loss: {1:.5f}".format(epoch, tr_loss))
                pbar.update(loader.done_users_counter)

            # get metrics for epoch
            get_recall(model_to_train, loader, epoch, train_dataset.movie_map)
            get_mrr   (model_to_train, loader, epoch, train_dataset.movie_map)

            # save model
            if save_weights:
                model_to_train.save('DwellTimeEpoch{}.h5'.format(epoch))

In [6]:
class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""    
    def __init__(self, dataset, batch_size=50):
        """
        A class for creating session-parallel mini-batches.
        Args:
             dataset (SessionDataset): the session dataset to generate the batches from
             batch_size (int): size of the batch
        """
        self.dataset = dataset
        self.batch_size = batch_size
        self.done_users_counter = 0
        
    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the users to be terminated
        """

        # initializations
        df = self.dataset.df
        user_key='UserId'
        movie_key='MovieId'
        time_key='TimeStamp'
        self.num_movies = df[movie_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = [] # indicator for the users to be terminated
        finished = False        
        
        

        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first users start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = idx_input
                target = idx_target
                yield input, target, mask
                
            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many users should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            self.done_users_counter = len(mask)
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]
        
        #print("num_movies ",self.num_movies)
        #print("click_offsets ",click_offsets)
        #print("session_idx_arr ",session_idx_arr)
        #print("iters ",iters)
        #print("maxiter " ,maxiter)
        #print("start ",start)
        #print("end ",end)
        #print("mask ",mask)
        #print("finished ",finished)

In [7]:
    PATH_TO_TRAIN = 'processed_augmented_train.csv'
    PATH_TO_DEV = 'processed_dev.csv'
    PATH_TO_TEST = 'processed_test.csv'
    train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'MovieId':np.int64})
    dev_data = pd.read_csv(PATH_TO_DEV, sep='\t', dtype={'MovieId':np.int64})
    test_data = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'MovieId': np.int64})
    
    batch_size = 512
    session_max_len = 100
    embeddingp=False

    num_movies = len(train_data['MovieId'].unique())+1
    print("Unique training movies:", num_movies)

    dev_num_movies = len(dev_data['MovieId'].unique())+1
    print("Unique dev movies:", dev_num_movies)

    test_num_movies = len(test_data['MovieId'].unique())+1
    print("Unique testing movies:", test_num_movies)

    train_samples_qty = len(train_data['UserId'].unique()) 
    print("Training users:", train_samples_qty)

    dev_samples_qty = len(dev_data['UserId'].unique()) 
    print("Dev users:",dev_samples_qty)

    test_samples_qty = len(test_data['UserId'].unique())
    print("Testing users:", test_samples_qty)
    
    train_fraction = 1 # (1 / fraction) most recent session quantity to consider
    dev_fraction = 1

    train_offset_step=train_samples_qty//batch_size
    dev_offset_step=dev_samples_qty//batch_size
    test_offset_step=test_samples_qty//batch_size
    aux = [0]
    aux.extend(list(train_data['MovieId'].unique()))
    itemids = np.array(aux)
    itemidmap = pd.Series(data=np.arange(num_movies), index=itemids) 
    
    model = create_model()
    
    train_model(model)

Unique training movies: 11619
Unique dev movies: 10105
Unique testing movies: 10366
Training users: 19853
Dev users: 5749
Testing users: 5271
Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (512, 1, 11619)           0         
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       [(512, 100), (512, 100)]  3516300   
_________________________________________________________________
dropout_1 (Dropout)          (512, 100)                0         
_________________________________________________________________
dense_1 (Dense)              (512, 11619)              1173519   
Total params: 4,689,819
Trainable params: 4,689,819
Non-trainable params: 0
_

  # This is added back by InteractiveShellApp.init_path()
  0%|          | 0/19853 [00:00<?, ?it/s]

train_dataset 
 <__main__.SessionDataset object at 0x7f3418ebe400>
Instructions for updating:
Use tf.cast instead.


Epoch 1. Loss: 5.00708:  98%|█████████▊| 19408/19853 [04:02<00:04, 89.60it/s] 

12800:0.100234375


Epoch 1. Loss: 5.00708:  98%|█████████▊| 19415/19853 [04:20<00:04, 89.60it/s]

25600:0.0952734375
38400:0.09270833333333334
51200:0.09263671875
64000:0.091875
76800:0.0918359375
89600:0.09114955357142857
102400:0.091337890625
115200:0.09184027777777778
128000:0.092140625
140800:0.09235085227272727
153600:0.09272135416666667
Recall@20 epoch 1: 0.09292829692556634
12800:0.019512992638972212
25600:0.018142292174334156
38400:0.017779179198701957
51200:0.017827835164857374
64000:0.01754193255534555
76800:0.017604920432064103
89600:0.01763028926634335
102400:0.017672822962482876
115200:0.017744824815142814
128000:0.01783160922375306
140800:0.0178565254250353
153600:0.01784887960399495


Epoch 2. Loss: 7.34693:  98%|█████████▊| 19415/19853 [08:31<1:24:24, 11.56s/it]

MRR@20 epoch 1: 0.01786657848998304


Epoch 2. Loss: 4.12299: : 38823it [11:15, 117.65it/s]                          

12800:0.118125


Epoch 2. Loss: 4.12299: : 38830it [11:30, 117.65it/s]

25600:0.1140625
38400:0.11125
51200:0.11130859375
64000:0.111015625
76800:0.11092447916666667
89600:0.11065848214285715
102400:0.11140625
115200:0.11206597222222223
128000:0.1123125
140800:0.11232244318181818
153600:0.11225911458333333
Recall@20 epoch 2: 0.11237105582524272
12800:0.023281662895766463
25600:0.023217009787027402
38400:0.022223348354803395
51200:0.02208594757007383
64000:0.021882454465374856
76800:0.021960250547315166
89600:0.02184225831950687
102400:0.022099497669668632
115200:0.022367177314190596
128000:0.022406675552219393
140800:0.022361517727785666
153600:0.02223557297794026


Epoch 3. Loss: 6.45690: : 38830it [15:40,  8.85s/it] 

MRR@20 epoch 2: 0.02226387815024286


Epoch 3. Loss: 3.87167: : 58238it [18:23, 122.65it/s]

12800:0.12765625


Epoch 3. Loss: 3.87167: : 58245it [18:40, 122.65it/s]

25600:0.1233203125
38400:0.12059895833333334
51200:0.12001953125
64000:0.1186875
76800:0.11828125
89600:0.11797991071428572
102400:0.11912109375
115200:0.11971354166666667
128000:0.1195234375
140800:0.11950994318181818
153600:0.11961588541666666
Recall@20 epoch 3: 0.11955147653721683
12800:0.026091440309896977
25600:0.025343144730374384
38400:0.0246395592671111
51200:0.02471899779613925
64000:0.024298768285646213
76800:0.024318445486457452
89600:0.024407743500734057
102400:0.024802098706210116
115200:0.025037391194029132
128000:0.024942575030808885
140800:0.025052647507187893
153600:0.025039210650636554


Epoch 4. Loss: 6.20623: : 58245it [22:48,  8.85s/it] 

MRR@20 epoch 3: 0.025043308302798255


Epoch 4. Loss: 3.65637: : 77660it [25:31, 116.19it/s]

12800:0.130234375


Epoch 4. Loss: 3.65637: : 77660it [25:50, 116.19it/s]

25600:0.126171875
38400:0.12427083333333333
51200:0.12326171875
64000:0.1218125
76800:0.12149739583333333
89600:0.12125
102400:0.123037109375
115200:0.12363715277777777
128000:0.1239375
140800:0.12421875
153600:0.12451822916666666
Recall@20 epoch 4: 0.12463971480582524
12800:0.02714745980820855
25600:0.026846522146825744
38400:0.02652192059625444
51200:0.02623165200386024
64000:0.025705010275832663
76800:0.02566828192329864
89600:0.025735277997473962
102400:0.02610972640769789
115200:0.026352241821791852
128000:0.026284784790265584
140800:0.02623655045261437
153600:0.026216356997001006


Epoch 5. Loss: 6.06589: : 77660it [29:57, 116.19it/s]

MRR@20 epoch 4: 0.02623726071903876


Epoch 5. Loss: 3.59197: : 97068it [32:41, 116.91it/s]

12800:0.130234375


Epoch 5. Loss: 3.59197: : 97075it [33:00, 116.91it/s]

25600:0.1274609375
38400:0.12565104166666666
51200:0.12486328125
64000:0.12371875
76800:0.12388020833333334
89600:0.123828125
102400:0.125419921875
115200:0.12585069444444444
128000:0.1258125
140800:0.12598011363636363
153600:0.12651692708333334
Recall@20 epoch 5: 0.12651066949838188
12800:0.027197609088980824
25600:0.026897133070995984
38400:0.02626856206374714
51200:0.026192876797594026
64000:0.025670199480990735
76800:0.025634520715432638
89600:0.02581076114205678
102400:0.02618882398176889
115200:0.026367160673391943
128000:0.02629716131177047
140800:0.026303151829607397
153600:0.026324741107546835


Epoch 6. Loss: 5.96029: : 97075it [37:08,  8.90s/it] 

MRR@20 epoch 5: 0.026354503477719067


Epoch 6. Loss: 3.56784: : 116483it [39:51, 114.61it/s]

12800:0.137109375


Epoch 6. Loss: 3.56784: : 116490it [40:11, 114.61it/s]

25600:0.1330078125
38400:0.130390625
51200:0.12916015625
64000:0.12775
76800:0.12760416666666666
89600:0.12762276785714285
102400:0.1287890625
115200:0.12935763888888888
128000:0.1293828125
140800:0.12926136363636365
153600:0.12951171875
Recall@20 epoch 6: 0.12959521642394822
12800:0.02761591456949895
25600:0.027548155889739547
38400:0.02649091751952813
51200:0.026284846556696384
64000:0.025697627988048546
76800:0.025822280161031573
89600:0.026070362485536548
102400:0.02646686744073304
115200:0.02664915536111053
128000:0.026579847818193933
140800:0.026586664757651368
153600:0.026620108010148004


Epoch 7. Loss: 5.97729: : 116490it [44:16, 11.38s/it] 

MRR@20 epoch 6: 0.026648131331948127


Epoch 7. Loss: 3.47688: : 135905it [46:59, 116.67it/s]

12800:0.134375


Epoch 7. Loss: 3.47688: : 135905it [47:11, 116.67it/s]

25600:0.13109375
38400:0.12822916666666667
51200:0.12767578125
64000:0.125984375
76800:0.1263671875
89600:0.126640625
102400:0.128017578125
115200:0.12866319444444443
128000:0.128609375
140800:0.12855823863636365
153600:0.12893880208333333
Recall@20 epoch 7: 0.12902002427184467
12800:0.02857523122430048
25600:0.028141013670033865
38400:0.02707428861571551
51200:0.026982382308691308
64000:0.026582802329933614
76800:0.026533352611068536
89600:0.02661944395433467
102400:0.0269332654589624
115200:0.02712180977597928
128000:0.02705554164117572
140800:0.027008375966167095
153600:0.027026098243324435


Epoch 8. Loss: 5.94701: : 135905it [51:23, 116.67it/s]

MRR@20 epoch 7: 0.027039523300886964


Epoch 8. Loss: 3.41789: : 155313it [54:07, 116.59it/s]

12800:0.137578125


Epoch 8. Loss: 3.41789: : 155320it [54:21, 116.59it/s]

25600:0.131640625
38400:0.12924479166666666
51200:0.1289453125
64000:0.127015625
76800:0.12713541666666667
89600:0.127421875
102400:0.1292578125
115200:0.12996527777777778
128000:0.1299375
140800:0.13011363636363638
153600:0.13048177083333334
Recall@20 epoch 8: 0.1306002224919094
12800:0.028618198757753323
25600:0.028266411724764278
38400:0.027433265018265433
51200:0.027373533393430963
64000:0.02681065497419083
76800:0.026654580763855216
89600:0.02689540169717884
102400:0.02736621517813131
115200:0.027548340672139154
128000:0.027438491610567252
140800:0.027364839233804734
153600:0.027422475895229573


Epoch 9. Loss: 5.85908: : 155320it [58:33,  8.87s/it] 

MRR@20 epoch 8: 0.027426659837399447


Epoch 9. Loss: 3.31624: : 174728it [1:01:16, 115.02it/s]

12800:0.138203125


Epoch 9. Loss: 3.31624: : 174735it [1:01:31, 115.02it/s]

25600:0.13390625
38400:0.13192708333333333
51200:0.130625
64000:0.129328125
76800:0.12954427083333334
89600:0.12982142857142856
102400:0.13138671875
115200:0.1321875
128000:0.1322421875
140800:0.1320028409090909
153600:0.13239583333333332
Recall@20 epoch 9: 0.1324585355987055
12800:0.028832712789245023
25600:0.028095617603749044
38400:0.02729182868266846
51200:0.027318726554686236
64000:0.02680505349000359
76800:0.026604746100969072
89600:0.026660255416019237
102400:0.027056620328798235
115200:0.027245063146182247
128000:0.02719429955951851
140800:0.027215120384645134
153600:0.027209141745917154


Epoch 9. Loss: 3.31624: : 174735it [1:05:42, 44.33it/s] 

MRR@20 epoch 9: 0.027240993072724835



