<a href="https://colab.research.google.com/github/wendy60/recommenders/blob/main/SLi_REC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install papermill



In [None]:
pip install scrapbook



In [None]:
pip install recommenders



In [None]:
pip install tensorflow-gpu==1.15.2



In [None]:
import sys
import os
import logging
import papermill as pm
import scrapbook as sb
from tempfile import TemporaryDirectory
import numpy as np
import tensorflow as tf
tf.get_logger().setLevel('ERROR') # only show error messages

from recommenders.utils.timer import Timer
from recommenders.utils.constants import SEED
from recommenders.models.deeprec.deeprec_utils import (
    prepare_hparams
)
from recommenders.datasets.amazon_reviews import download_and_extract, data_preprocessing
from recommenders.datasets.download_utils import maybe_download


from recommenders.models.deeprec.models.sequential.sli_rec import SLI_RECModel as SeqModel
####  to use the other model, use one of the following lines:
# from recommenders.models.deeprec.models.sequential.asvd import A2SVDModel as SeqModel
# from recommenders.models.deeprec.models.sequential.caser import CaserModel as SeqModel
# from recommenders.models.deeprec.models.sequential.gru4rec import GRU4RecModel as SeqModel
# from recommenders.models.deeprec.models.sequential.sum import SUMModel as SeqModel

#from recommenders.models.deeprec.models.sequential.nextitnet import NextItNetModel

from recommenders.models.deeprec.io.sequential_iterator import SequentialIterator
#from recommenders.models.deeprec.io.nextitnet_iterator import NextItNetIterator

# Report the interpreter and TensorFlow versions in use, one per line.
print(
    "System version: {}".format(sys.version),
    "Tensorflow version: {}".format(tf.__version__),
    sep="\n",
)

System version: 3.7.12 (default, Sep 10 2021, 00:21:48) 
[GCC 7.5.0]
Tensorflow version: 1.15.2


In [None]:
##  ATTENTION: change to the corresponding config file, e.g., caser.yaml for CaserModel, sum.yaml for SUMModel
# The repo-relative path only resolves when the notebook runs from inside a checkout of the
# recommenders repository. When the library was pip-installed (e.g. on Colab) that path does
# not exist, so look for the config inside the imported `recommenders` package first and fall
# back to the repo-relative path otherwise.
# NOTE(review): confirm that the installed package ships the deeprec config files; if it does
# not, the fallback keeps the previous behaviour unchanged.
config_name = 'sli_rec.yaml'
packaged_yaml = os.path.join(
    os.path.dirname(sys.modules['recommenders'].__file__),  # package already imported above
    'models', 'deeprec', 'config', config_name,
)
yaml_file = (
    packaged_yaml
    if os.path.exists(packaged_yaml)
    else '../../recommenders/models/deeprec/config/' + config_name
)

In [None]:
# Training configuration forwarded to prepare_hparams (epochs / batch_size).
EPOCHS, BATCH_SIZE = 10, 400
# Seed passed to the model constructor. Set None for non-deterministic result.
RANDOM_SEED = SEED

# Directory under which the data files, vocabularies, model/summary folders
# and prediction outputs used by this notebook are placed.
data_path = os.path.join(
    "..", "..",
    "tests", "resources", "deeprec", "slirec",
)


In [None]:
# for test
# Train / validation / test split files.
train_file, valid_file, test_file = (
    os.path.join(data_path, split_name)
    for split_name in ('train_data', 'valid_data', 'test_data')
)
# Vocabulary files for users, items and item categories.
user_vocab, item_vocab, cate_vocab = (
    os.path.join(data_path, vocab_name)
    for vocab_name in ('user_vocab.pkl', 'item_vocab.pkl', 'category_vocab.pkl')
)
# Destination path handed to the predict calls further down.
output_file = os.path.join(data_path, 'output.txt')

# Raw review and metadata files that are downloaded below.
reviews_name, meta_name = 'reviews_Movies_and_TV_5.json', 'meta_Movies_and_TV.json'
reviews_file = os.path.join(data_path, reviews_name)
meta_file = os.path.join(data_path, meta_name)

# Number of negative instances paired with each positive instance, per split.
train_num_ngs = 4  # training
valid_num_ngs = 4  # validation
test_num_ngs = 9   # testing
# Sample a small item set for training and testing here for a fast example.
sample_rate = 0.01

# Positional arguments for data_preprocessing, in the order they are unpacked below.
input_files = [
    reviews_file, meta_file,
    train_file, valid_file, test_file,
    user_vocab, item_vocab, cate_vocab,
]

# Download and preprocess only when the training split is not on disk yet.
if not os.path.exists(train_file):
    download_and_extract(reviews_name, reviews_file)
    download_and_extract(meta_name, meta_file)
    data_preprocessing(
        *input_files,
        sample_rate=sample_rate,
        valid_num_ngs=valid_num_ngs,
        test_num_ngs=test_num_ngs,
    )
    #### uncomment this for the NextItNet model, because it does not need to unfold the user history
    # data_preprocessing(*input_files, sample_rate=sample_rate, valid_num_ngs=valid_num_ngs, test_num_ngs=test_num_ngs, is_history_expanding=False)

NameError: ignored

In [None]:
# Re-declares the vocabulary paths and the training negative-sample count with the same
# values as the data-preparation cell (presumably so the following cells can run even
# when that cell did not complete).
user_vocab, item_vocab, cate_vocab = (
    os.path.join(data_path, vocab_name)
    for vocab_name in ('user_vocab.pkl', 'item_vocab.pkl', 'category_vocab.pkl')
)
train_num_ngs = 4

In [None]:
### NOTE:
### If you are using your own dataset rather than the demo Amazon dataset, remember to use
### `_create_vocab(train_file, user_vocab, item_vocab, cate_vocab)` to generate the
### user_vocab, item_vocab and cate_vocab files.
hparams = prepare_hparams(
    yaml_file,
    embed_l2=0.,
    layer_l2=0.,
    learning_rate=0.001,  # set to 0.01 if batch normalization is disabled
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    show_step=20,
    MODEL_DIR=os.path.join(data_path, "model/"),
    SUMMARIES_DIR=os.path.join(data_path, "summary/"),
    user_vocab=user_vocab,
    item_vocab=item_vocab,
    cate_vocab=cate_vocab,
    need_sample=True,
    # number of negative instances for each positive instance for loss computation
    train_num_ngs=train_num_ngs,
)

FileNotFoundError: ignored

In [None]:
# Iterator class (not an instance): it is passed to the model constructor and is also
# instantiated directly in the serving cell at the end of the notebook.
input_creator = SequentialIterator
#### uncomment this for the NextItNet model, because it needs a special data iterator for training
#input_creator = NextItNetIterator

In [None]:
# Build the sequential model from the hyper-parameters and the iterator class;
# RANDOM_SEED is passed as the seed.
model = SeqModel(hparams, input_creator, seed=RANDOM_SEED)

## sometimes we don't want to train a model from scratch
## then we can load a pre-trained model like this:
#model.load_model(r'your_model_path')

NameError: ignored

In [None]:
# Evaluate on the test file before the training cell below is run.
# test_num_ngs is the number of negative lines after each positive line in your test_file
print(model.run_eval(test_file, num_ngs=test_num_ngs))


In [None]:
# Train on train_file with valid_file as the validation set, timing the whole call.
# The return value of fit() is rebound to `model`.
with Timer() as train_time:
    model = model.fit(train_file, valid_file, valid_num_ngs=valid_num_ngs) 

# valid_num_ngs is the number of negative lines after each positive line in your valid_file
# we will evaluate the performance of model on valid_file every epoch
# train_time.interval is divided by 60 to report minutes (presumably it is in seconds)
print('Time cost for training is {0:.2f} mins'.format(train_time.interval/60.0))


In [None]:
# Evaluate on the test file after training; the result is kept in res_syn because
# the next cell records it with scrapbook.
res_syn = model.run_eval(test_file, num_ngs=test_num_ngs)
print(res_syn)

In [None]:
# Record the evaluation result in the notebook under the name "res_syn" via scrapbook.
sb.glue("res_syn", res_syn)

In [None]:
# Run prediction on test_file, passing output_file as the destination path.
# NOTE(review): `model` is rebound to predict()'s return value — presumably the model itself; confirm.
model = model.predict(test_file, output_file)

In [None]:
# Path under MODEL_DIR from which the saved model is loaded.
path_best_trained = os.path.join(hparams.MODEL_DIR, "best_model")

# Fresh model instance built with the same hyper-parameters, iterator class and seed,
# then populated from the saved model.
model_best_trained = SeqModel(hparams, input_creator, seed=RANDOM_SEED)
print('loading saved model in {0}'.format(path_best_trained))
model_best_trained.load_model(path_best_trained)

In [None]:
# Evaluate the reloaded model on the test file; as the last expression of the cell,
# the result is shown as the cell output.
model_best_trained.run_eval(test_file, num_ngs=test_num_ngs)

In [None]:
# Run prediction with the reloaded model. output_file is the same path used by the
# earlier predict call, so its contents are presumably overwritten — confirm.
model_best_trained.predict(test_file, output_file)

In [None]:
# Freeze the reloaded model for serving: convert the graph's variables to constants with
# "pred" as the output node, then write the serialized GraphDef to serving_model.pb
# under MODEL_DIR.
# NOTE(review): the model's session is used as a context manager here, which presumably
# closes it on exit, so model_best_trained may not be usable after this cell — confirm.
with model_best_trained.sess as sess:
    graph_def = model_best_trained.graph.as_graph_def()
    output_graph_def = tf.graph_util.convert_variables_to_constants(
        sess,
        graph_def,
        ["pred"]
    )

    # Binary write of the frozen graph; the serving cell reads the same path back.
    outfilepath = os.path.join(hparams.MODEL_DIR, "serving_model.pb")
    with tf.gfile.GFile(outfilepath, 'wb') as f:
        f.write(output_graph_def.SerializeToString())

In [None]:
class LoadFrozedPredModel:
    """Handle on the tensors of an imported frozen prediction graph.

    Every attribute holds the tensor that ``graph.get_tensor_by_name`` returns for the
    correspondingly named tensor under the ``import/`` prefix.
    """

    def __init__(self, graph):
        """Look up each required tensor on ``graph`` and bind it as an attribute.

        Args:
            graph: object providing ``get_tensor_by_name(name)``.
        """
        # (attribute name, tensor name) pairs, resolved in this order.
        tensor_names = (
            ('pred', 'import/pred:0'),
            ('items', 'import/items:0'),
            ('cates', 'import/cates:0'),
            ('item_history', 'import/item_history:0'),
            ('item_cate_history', 'import/item_cate_history:0'),
            ('mask', 'import/mask:0'),
            ('time_from_first_action', 'import/time_from_first_action:0'),
            ('time_to_now', 'import/time_to_now:0'),
            ('layer_keeps', 'import/layer_keeps:0'),
            ('is_training', 'import/is_training:0'),
        )
        for attr_name, tensor_name in tensor_names:
            setattr(self, attr_name, graph.get_tensor_by_name(tensor_name))

In [None]:
def infer_as_serving(model, infile, outfile, hparams, iterator, sess):
    """Score ``infile`` batch by batch and write one prediction per line to ``outfile``.

    Args:
        model: object exposing the graph tensors as attributes (``pred``, ``items``,
            ``cates``, ``item_history``, ``item_cate_history``, ``mask``,
            ``time_from_first_action``, ``time_to_now``, ``layer_keeps``, ``is_training``).
        infile: path passed to ``iterator.load_data_from_file``.
        outfile: path of the text file to write (opened in "w" mode).
        hparams: not used by the body; kept in the signature for callers.
        iterator: data iterator; its same-named attributes are the keys used to read
            each field out of a batch.
        sess: session on which ``model.pred`` is run.
    """
    # Fields whose tensor (on `model`) and batch key (on `iterator`) share one attribute name.
    fed_fields = (
        'items',
        'cates',
        'item_history',
        'item_cate_history',
        'mask',
        'time_from_first_action',
        'time_to_now',
    )

    scores = []
    for batch in iterator.load_data_from_file(infile, batch_num_ngs=0):
        if not batch:
            # Skip falsy (e.g. empty) batches.
            continue
        feed = {
            model.layer_keeps: np.ones(3, dtype=np.float32),
            model.is_training: False,
        }
        for field in fed_fields:
            feed[getattr(model, field)] = batch[getattr(iterator, field)]
        batch_pred = sess.run(model.pred, feed_dict=feed)
        # Flatten the batch predictions so each score becomes its own output line.
        scores.extend(np.reshape(batch_pred, -1))

    with open(outfile, "w") as out:
        out.writelines('{0}\n'.format(score) for score in scores)

In [None]:
# Read the frozen graph written by the export cell back into a GraphDef.
G = tf.Graph()
with tf.gfile.GFile(os.path.join(hparams.MODEL_DIR, "serving_model.pb"), 'rb') as f:
    with G.as_default():
        graph_def_optimized = tf.GraphDef()
        graph_def_optimized.ParseFromString(f.read())

        ####  uncomment this line if you want to check what content is included in the graph
        #print('graph_def_optimized = ' + str(graph_def_optimized))


# Import the GraphDef in a session bound to G and score the test file through it.
with tf.Session(graph=G) as sess:
    tf.import_graph_def(graph_def_optimized)

    # NOTE: this rebinds `model` from the trained model to the frozen-graph tensor wrapper.
    model = LoadFrozedPredModel(sess.graph)

    serving_output_file = os.path.join(data_path, r'output_serving.txt')
    # The data iterator is built on its own, separate graph.
    iterator = input_creator(hparams, tf.Graph())
    infer_as_serving(
        model,
        test_file,
        serving_output_file,
        hparams,
        iterator,
        sess,
    )
