### The purpose of this notebook is to create a recommendation engine (DREAM) for cluster 1 from customer segmentation of instacart dataset

### Data
   * Input: sequence of baskets of cluster 1 customers
   * Negative sample : The food items which is never purchased by customers
   
### Evaluation Metric (Top 15 items)
   * Hit Rate @15
       - Counts the fraction of times that the ground truth next item is among the top 15 items.
       - we only have one test item for each user, Hit@15 is equivalent to Recall@15
       - It is also propotional to Precision@15
   * NDCG@15
       - A position aware metric with assigns larger weights on higher positions.
       
### Model 
   * Model is saved under runs folder with this key 1605813697
   * This key 1605813697 is required to run it on Test data

In [1]:
import os
import math
import random
import time
import logging
import pickle
import torch
import numpy as np
from math import ceil
import data_helper as dh
from configc2 import Config
from rnn_model import DRModel

In [2]:
logging.info("✔︎ DREAM Model Training...")
logger = dh.logger_fn("torch-log", "logs/training-{0}.log".format(time.asctime()))

In [3]:
dilim = '-' * 120
logger.info(dilim)
for attr in sorted(Config().__dict__):
    logger.info('{:>50}|{:<50}'.format(attr.upper(), Config().__dict__[attr]))
logger.info(dilim)

INFO:torch-log:------------------------------------------------------------------------------------------------------------------------
INFO:torch-log:                                         MODEL_DIR|runs/                                             
INFO:torch-log:                                       NEG_SAMPLES|../data/neg_sample_insta_all_sampled_b4.pickle    
INFO:torch-log:                                       TESTSET_DIR|../data/allb4_cluster1_test.json                  
INFO:torch-log:                                   TRAININGSET_DIR|../data/allb4_cluster1_train.json                 
INFO:torch-log:                                 VALIDATIONSET_DIR|../data/allb4_cluster1_val.json                   
INFO:torch-log:                                  BASKET_POOL_TYPE|max                                               
INFO:torch-log:                                        BATCH_SIZE|500                                               
INFO:torch-log:                              

In [4]:
def train():
    # Load data
    logger.info("✔︎ Loading data...")

    logger.info("✔︎ Training data processing...")
    train_data = dh.load_data(Config().TRAININGSET_DIR)

    logger.info("✔︎ Validation data processing...")
    validation_data = dh.load_data(Config().VALIDATIONSET_DIR)

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data(Config().TESTSET_DIR)

    logger.info("✔︎ Load negative sample...")
    with open(Config().NEG_SAMPLES, 'rb') as handle:
        neg_samples = pickle.load(handle)

    # Model config
    model = DRModel(Config())

    # Optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=Config().learning_rate)

    def bpr_loss(uids, baskets, dynamic_user, item_embedding):
        """
        Bayesian personalized ranking loss for implicit feedback.

        Args:
            uids: batch of users' ID
            baskets: batch of users' baskets
            dynamic_user: batch of users' dynamic representations
            item_embedding: item_embedding matrix
        """
        loss = 0
        for uid, bks, du in zip(uids, baskets, dynamic_user):
            du_p_product = torch.mm(du, item_embedding.t())  # shape: [pad_len, num_item]
            loss_u = []  # loss for user
            for t, basket_t in enumerate(bks):
                if basket_t[0] != 0 and t != 0:
                    pos_idx = torch.cuda.LongTensor(basket_t) 

                    # Sample negative products
                    neg = random.sample(list(neg_samples[uid]), len(basket_t))
                    neg_idx = torch.cuda.LongTensor(neg) 

                    # Score p(u, t, v > v')
                    score = du_p_product[t - 1][pos_idx] - du_p_product[t - 1][neg_idx]

                    # Average Negative log likelihood for basket_t
                    loss_u.append(torch.mean(-torch.nn.LogSigmoid()(score)))
            for i in loss_u:
                loss = loss + i / len(loss_u)
        avg_loss = torch.div(loss, len(baskets))
        return avg_loss

    def train_model():
        model.train()  # turn on training mode for dropout
        dr_hidden = model.init_hidden(Config().batch_size)
        train_loss = 0
        start_time = time.clock()
        num_batches = ceil(len(train_data) / Config().batch_size)
        for i, x in enumerate(dh.batch_iter(train_data, Config().batch_size, Config().seq_len, shuffle=True)):
            uids, baskets, lens = x
            model.zero_grad()  
            dynamic_user, _ = model(baskets, lens, dr_hidden)

            loss = bpr_loss(uids, baskets, dynamic_user, model.encode.weight)
            loss.backward()

            # Clip to avoid gradient exploding
            torch.nn.utils.clip_grad_norm_(model.parameters(), Config().clip)

            # Parameter updating
            optimizer.step()
            train_loss += loss.data

            # Logging
            if i % Config().log_interval == 0 and i > 0:
                elapsed = (time.clock() - start_time) / Config().log_interval
                cur_loss = train_loss.item() / Config().log_interval  # turn tensor into float
                train_loss = 0
                start_time = time.clock()
                logger.info('[Training]| Epochs {:3d} | Batch {:5d} / {:5d} | ms/batch {:02.2f} | Loss {:05.4f} |'
                            .format(epoch, i, num_batches, elapsed, cur_loss))

    def validate_model():
        model.eval()
        dr_hidden = model.init_hidden(Config().batch_size)
        val_loss = 0
        start_time = time.clock()
        num_batches = ceil(len(validation_data) / Config().batch_size)
        for i, x in enumerate(dh.batch_iter(validation_data, Config().batch_size, Config().seq_len, shuffle=False)):
            uids, baskets, lens = x
            dynamic_user, _ = model(baskets, lens, dr_hidden)
            loss = bpr_loss(uids, baskets, dynamic_user, model.encode.weight)
            val_loss += loss.data

        # Logging
        elapsed = (time.clock() - start_time) * 1000 / num_batches
        val_loss = val_loss.item() / num_batches
        logger.info('[Validation]| Epochs {:3d} | Elapsed {:02.2f} | Loss {:05.4f} |'
                    .format(epoch, elapsed, val_loss))
        return val_loss

    def test_model():
        model.eval()
        item_embedding = model.encode.weight
        dr_hidden = model.init_hidden(Config().batch_size)

        hitratio_numer = 0
        hitratio_denom = 0
        ndcg = 0.0

        for i, x in enumerate(dh.batch_iter(train_data, Config().batch_size, Config().seq_len, shuffle=False)):
            uids, baskets, lens = x
            dynamic_user, _ = model(baskets, lens, dr_hidden)
            for uid, l, du in zip(uids, lens, dynamic_user):
                scores = []
                du_latest = du[l - 1].unsqueeze(0)

                # calculating <u,p> score for all test items <u,p> pair
                positives = test_data[test_data['userID'] == uid].baskets.values[0]  # list dim 1
                p_length = len(positives)
                positives = torch.cuda.LongTensor(positives)

                # Deal with positives samples
                scores_pos = list(torch.mm(du_latest, item_embedding[positives].t()).data.numpy()[0])
                for s in scores_pos:
                    scores.append(s)

                # Deal with negative samples
                negtives = random.sample(list(neg_samples[uid]), Config().neg_num)
                negtives = torch.cuda.LongTensor(negtives) 
                scores_neg = list(torch.mm(du_latest, item_embedding[negtives].t()).data.numpy()[0])
                for s in scores_neg:
                    scores.append(s)

                # Calculate hit-ratio
                index_k = []
                for k in range(Config().top_k):
                    index = scores.index(max(scores))
                    index_k.append(index)
                    scores[index] = -9999
                hitratio_numer += len((set(np.arange(0, p_length)) & set(index_k)))
                hitratio_denom += p_length

                # Calculate NDCG
                u_dcg = 0
                u_idcg = 0
                for k in range(Config().top_k):
                    if index_k[k] < p_length:  
                        u_dcg += 1 / math.log(k + 1 + 1, 2)
                    u_idcg += 1 / math.log(k + 1 + 1, 2)
                ndcg += u_dcg / u_idcg

        hit_ratio = hitratio_numer / hitratio_denom
        ndcg = ndcg / len(train_data)
        logger.info('[Test]| Epochs {:3d} | Hit ratio {:02.4f} | NDCG {:05.4f} |'
                    .format(epoch, hit_ratio, ndcg))
        return hit_ratio, ndcg

    timestamp = str(int(time.time()))
    out_dir = os.path.abspath(os.path.join(os.path.curdir, "runs", timestamp))
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    logger.info('Save into {0}'.format(out_dir))
    checkpoint_dir = out_dir + '/model-{epoch:02d}-{hitratio:.4f}-{ndcg:.4f}.model'

    best_hit_ratio = None

    try:
        # Training
        for epoch in [99, 102, 104, 106]: #range(100, Config().epochs):
            train_model()
            logger.info('-' * 89)

            val_loss = validate_model()
            logger.info('-' * 89)

            hit_ratio, ndcg = test_model()
            logger.info('-' * 89)

            # Checkpoint
            if not best_hit_ratio or hit_ratio > best_hit_ratio:
                with open(checkpoint_dir.format(epoch=epoch, hitratio=hit_ratio, ndcg=ndcg), 'wb') as f:
                    torch.save(model, f)
                best_hit_ratio = hit_ratio
    except KeyboardInterrupt:
        logger.info('*' * 89)
        logger.info('Early Stopping!')


if __name__ == '__main__':
    train()

INFO:torch-log:✔︎ Loading data...
INFO:torch-log:✔︎ Training data processing...
INFO:torch-log:✔︎ Validation data processing...
INFO:torch-log:✔︎ Test data processing...
INFO:torch-log:✔︎ Load negative sample...
INFO:torch-log:Save into /home/reshmask/Next-Basket-Recommendation-master/DREAM/runs/1606163676
INFO:torch-log:[Training]| Epochs  99 | Batch     1 /    47 | ms/batch 751.75 | Loss 1.4692 |
INFO:torch-log:[Training]| Epochs  99 | Batch     2 /    47 | ms/batch 390.50 | Loss 0.6960 |
INFO:torch-log:[Training]| Epochs  99 | Batch     3 /    47 | ms/batch 388.84 | Loss 0.6902 |
INFO:torch-log:[Training]| Epochs  99 | Batch     4 /    47 | ms/batch 390.76 | Loss 0.6799 |
INFO:torch-log:[Training]| Epochs  99 | Batch     5 /    47 | ms/batch 415.52 | Loss 0.6732 |
INFO:torch-log:[Training]| Epochs  99 | Batch     6 /    47 | ms/batch 324.05 | Loss 0.6665 |
INFO:torch-log:[Training]| Epochs  99 | Batch     7 /    47 | ms/batch 366.36 | Loss 0.6610 |
INFO:torch-log:[Training]| Epochs 

INFO:torch-log:[Training]| Epochs 102 | Batch    21 /    47 | ms/batch 384.54 | Loss 0.1972 |
INFO:torch-log:[Training]| Epochs 102 | Batch    22 /    47 | ms/batch 337.93 | Loss 0.1914 |
INFO:torch-log:[Training]| Epochs 102 | Batch    23 /    47 | ms/batch 336.92 | Loss 0.1899 |
INFO:torch-log:[Training]| Epochs 102 | Batch    24 /    47 | ms/batch 358.81 | Loss 0.1906 |
INFO:torch-log:[Training]| Epochs 102 | Batch    25 /    47 | ms/batch 399.93 | Loss 0.1954 |
INFO:torch-log:[Training]| Epochs 102 | Batch    26 /    47 | ms/batch 312.33 | Loss 0.1936 |
INFO:torch-log:[Training]| Epochs 102 | Batch    27 /    47 | ms/batch 354.10 | Loss 0.1988 |
INFO:torch-log:[Training]| Epochs 102 | Batch    28 /    47 | ms/batch 338.81 | Loss 0.1904 |
INFO:torch-log:[Training]| Epochs 102 | Batch    29 /    47 | ms/batch 426.63 | Loss 0.1823 |
INFO:torch-log:[Training]| Epochs 102 | Batch    30 /    47 | ms/batch 344.27 | Loss 0.1892 |
INFO:torch-log:[Training]| Epochs 102 | Batch    31 /    47 

INFO:torch-log:[Training]| Epochs 106 | Batch     7 /    47 | ms/batch 336.46 | Loss 0.1614 |
INFO:torch-log:[Training]| Epochs 106 | Batch     8 /    47 | ms/batch 391.39 | Loss 0.1631 |
INFO:torch-log:[Training]| Epochs 106 | Batch     9 /    47 | ms/batch 332.78 | Loss 0.1614 |
INFO:torch-log:[Training]| Epochs 106 | Batch    10 /    47 | ms/batch 420.06 | Loss 0.1661 |
INFO:torch-log:[Training]| Epochs 106 | Batch    11 /    47 | ms/batch 397.97 | Loss 0.1649 |
INFO:torch-log:[Training]| Epochs 106 | Batch    12 /    47 | ms/batch 402.46 | Loss 0.1655 |
INFO:torch-log:[Training]| Epochs 106 | Batch    13 /    47 | ms/batch 359.98 | Loss 0.1660 |
INFO:torch-log:[Training]| Epochs 106 | Batch    14 /    47 | ms/batch 387.93 | Loss 0.1707 |
INFO:torch-log:[Training]| Epochs 106 | Batch    15 /    47 | ms/batch 347.17 | Loss 0.1651 |
INFO:torch-log:[Training]| Epochs 106 | Batch    16 /    47 | ms/batch 384.16 | Loss 0.1614 |
INFO:torch-log:[Training]| Epochs 106 | Batch    17 /    47 

In [1]:
import time
import random
import math
import pickle
import torch
import numpy as np
from configc2 import Config
import data_helper as dh


logger = dh.logger_fn("torch-log", "logs/test-{0}.log".format(time.asctime()))

MODEL = input("☛ Please input the model file you want to test: ")

while not (MODEL.isdigit() and len(MODEL) == 10):
    MODEL = input("✘ The format of your input is illegal, it should be like(1490175368), please re-input: ")
logger.info("✔︎ The format of your input is legal, now loading to next step...")

MODEL_DIR = dh.load_model_file(MODEL)


def test():
    # Load data
    logger.info("✔︎ Loading data...")

    logger.info("✔︎ Training data processing...")
    train_data = dh.load_data(Config().TRAININGSET_DIR)

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data(Config().TESTSET_DIR)

    logger.info("✔︎ Load negative sample...")
    with open(Config().NEG_SAMPLES, 'rb') as handle:
        neg_samples = pickle.load(handle)

    # Load model
    dr_model = torch.load(MODEL_DIR)

    dr_model.eval()

    item_embedding = dr_model.encode.weight
    hidden = dr_model.init_hidden(Config().batch_size)

    hitratio_numer = 0
    hitratio_denom = 0
    ndcg = 0.0
    results = []

    for i, x in enumerate(dh.batch_iter(train_data, Config().batch_size, Config().seq_len, shuffle=False)):
        uids, baskets, lens = x
        dynamic_user, _ = dr_model(baskets, lens, hidden)
        for uid, l, du in zip(uids, lens, dynamic_user):
            scores = []
            du_latest = du[l - 1].unsqueeze(0)

            # calculating <u,p> score for all test items <u,p> pair
            positives = test_data[test_data['userID'] == uid].baskets.values[0]  # list dim 1
            p_length = len(positives)
            positives = torch.LongTensor(positives)

            # Deal with positives samples
            scores_pos = list(torch.mm(du_latest, item_embedding[positives].t()).data.numpy()[0])
            for s in scores_pos:
                scores.append(s)

            # Deal with negative samples
            negtives = random.sample(list(neg_samples[uid]), Config().neg_num)
            negtives = torch.LongTensor(negtives)
            scores_neg = list(torch.mm(du_latest, item_embedding[negtives].t()).data.numpy()[0])
            for s in scores_neg:
                scores.append(s)

            # Calculate hit-ratio
            index_k = []
            for k in range(Config().top_k):
                index = scores.index(max(scores))
                index_k.append(index)
                scores[index] = -9999
            single_hit = len((set(np.arange(0, p_length)) & set(index_k)))/p_length
            results.append([uid,index_k, set(np.arange(0, p_length)), single_hit])
            hitratio_numer += len((set(np.arange(0, p_length)) & set(index_k)))
            hitratio_denom += p_length

            # Calculate NDCG
            u_dcg = 0
            u_idcg = 0
            for k in range(Config().top_k):
                if index_k[k] < p_length:  
                    u_dcg += 1 / math.log(k + 1 + 1, 2)
                u_idcg += 1 / math.log(k + 1 + 1, 2)
            ndcg += u_dcg / u_idcg

    hitratio = hitratio_numer / hitratio_denom
    ndcg = ndcg / len(train_data)
    print('Hit ratio[{0}]: {1}'.format(Config().top_k, hitratio))
    print('NDCG[{0}]: {1}'.format(Config().top_k, ndcg))
    return results

if __name__ == '__main__':
    results_ = test()

☛ Please input the model file you want to test: 1605813697
Hit ratio[15]: 0.4014114566000608
NDCG[15]: 0.359027566598392


In [2]:
import pandas as pd
result_df = pd.DataFrame(results_, columns=['UserID', 'Prediction', 'Actual', 'Hit-Ratio'])

In [3]:
def confi_cut(score):
    if score >= 0.0 and score <0.2:
        return '0-0.2'
    elif score >= 0.2 and score <0.4:
        return '0.2-0.4'
    elif score >= 0.4 and score <0.6:
        return '0.4-0.6'
    elif score >= 0.6 and score <0.8:
        return '0.6-0.8'
    else:
        return '0.8-1.0'

In [4]:
result_df['confi'] = result_df['Hit-Ratio'].apply(lambda x: confi_cut(x))

In [11]:
result_df.to_csv('results/final_cluster_1_result.csv', index=False)