### The purpose of this notebook is to train and evaluate recommendation Model 3 (DREAM) on the sequential basket data of the Instacart 2017 dataset

#### Author :  Reshma Patil

### Data
   * Input: sequence of baskets
   * Negative samples: the food items which were never purchased by the customer
   
### Evaluation Metric (Top 15 items)
   * Hit Rate @15
       - Counts the fraction of times that the ground truth next item is among the top 15 items.
       - Since we only have one test item for each user, Hit@15 is equivalent to Recall@15
       - It is also proportional to Precision@15
   * NDCG@15
       - A position-aware metric which assigns larger weights to higher positions.
       
### Model 
    * Model is saved under runs folder with this key 1605474009
    * This key 1605474009 is required to run it on Test data

In [1]:
import logging
import math
import os
import pickle
import random
import time
from math import ceil

import numpy as np
import pandas as pd
import torch

import data_helper as dh
from config import Config
from rnn_model import DRModel
from train import train

In [2]:
# Announce the run on the root logger, then build the notebook-wide "torch-log"
# logger whose log file name carries the current wall-clock time.
logging.info("✔︎ DREAM Model Training...")
training_log_path = "logs/training-{0}.log".format(time.asctime())
logger = dh.logger_fn("torch-log", training_log_path)

In [3]:
# Dump every configuration attribute to the log as a right/left aligned table,
# framed by two separator lines.
dilim = '-' * 120

# Build the configuration once and read all values from that single snapshot,
# instead of constructing a fresh Config() for every attribute that is printed.
config_values = vars(Config())

logger.info(dilim)
for attr in sorted(config_values):
    logger.info('{:>50}|{:<50}'.format(attr.upper(), config_values[attr]))
logger.info(dilim)

INFO:torch-log:------------------------------------------------------------------------------------------------------------------------
INFO:torch-log:                                         MODEL_DIR|runs/                                             
INFO:torch-log:                                       NEG_SAMPLES|../data/neg_sample_insta_all_sampled_b4.pickle    
INFO:torch-log:                                       TESTSET_DIR|../data/test_insta_all_b4.json                    
INFO:torch-log:                                   TRAININGSET_DIR|../data/train_insta_all_b4.json                   
INFO:torch-log:                                 VALIDATIONSET_DIR|../data/validation_insta_all_b4.json              
INFO:torch-log:                                  BASKET_POOL_TYPE|max                                               
INFO:torch-log:                                        BATCH_SIZE|1000                                              
INFO:torch-log:                              

In [None]:
# Start DREAM training through train(), imported from the train module.
# NOTE: inside a notebook kernel __name__ is '__main__', so this guard does not
# prevent training from starting whenever the cell is executed.
if __name__ == '__main__':
    train()

INFO:torch-log:✔︎ Loading data...
INFO:torch-log:✔︎ Training data processing...
INFO:torch-log:✔︎ Validation data processing...
INFO:torch-log:✔︎ Test data processing...
INFO:torch-log:✔︎ Load negative sample...
INFO:torch-log:Save into /home/reshmask/Next-Basket-Recommendation-master/DREAM/runs/1605474009
INFO:torch-log:[Training]| Epochs 102 | Batch     1 /   157 | ms/batch 854.51 | Loss 1.4687 |
INFO:torch-log:[Training]| Epochs 102 | Batch     2 /   157 | ms/batch 524.27 | Loss 0.6940 |
INFO:torch-log:[Training]| Epochs 102 | Batch     3 /   157 | ms/batch 526.58 | Loss 0.6847 |
INFO:torch-log:[Training]| Epochs 102 | Batch     4 /   157 | ms/batch 537.58 | Loss 0.6768 |
INFO:torch-log:[Training]| Epochs 102 | Batch     5 /   157 | ms/batch 484.71 | Loss 0.6679 |
INFO:torch-log:[Training]| Epochs 102 | Batch     6 /   157 | ms/batch 477.83 | Loss 0.6669 |
INFO:torch-log:[Training]| Epochs 102 | Batch     7 /   157 | ms/batch 528.21 | Loss 0.6537 |
INFO:torch-log:[Training]| Epochs 

INFO:torch-log:[Training]| Epochs 102 | Batch    77 /   157 | ms/batch 472.27 | Loss 0.1895 |
INFO:torch-log:[Training]| Epochs 102 | Batch    78 /   157 | ms/batch 460.45 | Loss 0.1941 |
INFO:torch-log:[Training]| Epochs 102 | Batch    79 /   157 | ms/batch 480.62 | Loss 0.1958 |
INFO:torch-log:[Training]| Epochs 102 | Batch    80 /   157 | ms/batch 459.72 | Loss 0.1959 |
INFO:torch-log:[Training]| Epochs 102 | Batch    81 /   157 | ms/batch 461.50 | Loss 0.1906 |
INFO:torch-log:[Training]| Epochs 102 | Batch    82 /   157 | ms/batch 456.97 | Loss 0.1981 |
INFO:torch-log:[Training]| Epochs 102 | Batch    83 /   157 | ms/batch 455.64 | Loss 0.1989 |
INFO:torch-log:[Training]| Epochs 102 | Batch    84 /   157 | ms/batch 466.23 | Loss 0.1983 |
INFO:torch-log:[Training]| Epochs 102 | Batch    85 /   157 | ms/batch 454.86 | Loss 0.1943 |
INFO:torch-log:[Training]| Epochs 102 | Batch    86 /   157 | ms/batch 504.92 | Loss 0.1910 |
INFO:torch-log:[Training]| Epochs 102 | Batch    87 /   157 

INFO:torch-log:-----------------------------------------------------------------------------------------
INFO:torch-log:[Test]| Epochs 102 | Hit ratio 0.4583 | NDCG 0.3552 |
INFO:torch-log:-----------------------------------------------------------------------------------------
INFO:torch-log:[Training]| Epochs 104 | Batch     1 /   157 | ms/batch 823.68 | Loss 0.3504 |
INFO:torch-log:[Training]| Epochs 104 | Batch     2 /   157 | ms/batch 429.21 | Loss 0.1769 |
INFO:torch-log:[Training]| Epochs 104 | Batch     3 /   157 | ms/batch 427.74 | Loss 0.1784 |
INFO:torch-log:[Training]| Epochs 104 | Batch     4 /   157 | ms/batch 447.88 | Loss 0.1780 |
INFO:torch-log:[Training]| Epochs 104 | Batch     5 /   157 | ms/batch 431.19 | Loss 0.1853 |
INFO:torch-log:[Training]| Epochs 104 | Batch     6 /   157 | ms/batch 436.53 | Loss 0.1793 |
INFO:torch-log:[Training]| Epochs 104 | Batch     7 /   157 | ms/batch 446.57 | Loss 0.1745 |
INFO:torch-log:[Training]| Epochs 104 | Batch     8 /   157 | m

INFO:torch-log:[Training]| Epochs 104 | Batch    86 /   157 | ms/batch 428.13 | Loss 0.1716 |
INFO:torch-log:[Training]| Epochs 104 | Batch    87 /   157 | ms/batch 419.90 | Loss 0.1723 |
INFO:torch-log:[Training]| Epochs 104 | Batch    88 /   157 | ms/batch 419.68 | Loss 0.1826 |
INFO:torch-log:[Training]| Epochs 104 | Batch    89 /   157 | ms/batch 417.26 | Loss 0.1776 |
INFO:torch-log:[Training]| Epochs 104 | Batch    90 /   157 | ms/batch 425.96 | Loss 0.1765 |
INFO:torch-log:[Training]| Epochs 104 | Batch    91 /   157 | ms/batch 413.96 | Loss 0.1780 |
INFO:torch-log:[Training]| Epochs 104 | Batch    92 /   157 | ms/batch 434.49 | Loss 0.1775 |
INFO:torch-log:[Training]| Epochs 104 | Batch    93 /   157 | ms/batch 443.13 | Loss 0.1792 |
INFO:torch-log:[Training]| Epochs 104 | Batch    94 /   157 | ms/batch 440.98 | Loss 0.1774 |
INFO:torch-log:[Training]| Epochs 104 | Batch    95 /   157 | ms/batch 436.48 | Loss 0.1771 |
INFO:torch-log:[Training]| Epochs 104 | Batch    96 /   157 

INFO:torch-log:[Training]| Epochs 106 | Batch    13 /   157 | ms/batch 430.27 | Loss 0.1598 |
INFO:torch-log:[Training]| Epochs 106 | Batch    14 /   157 | ms/batch 431.73 | Loss 0.1622 |
INFO:torch-log:[Training]| Epochs 106 | Batch    15 /   157 | ms/batch 425.48 | Loss 0.1632 |
INFO:torch-log:[Training]| Epochs 106 | Batch    16 /   157 | ms/batch 429.05 | Loss 0.1537 |
INFO:torch-log:[Training]| Epochs 106 | Batch    17 /   157 | ms/batch 427.56 | Loss 0.1587 |
INFO:torch-log:[Training]| Epochs 106 | Batch    18 /   157 | ms/batch 435.65 | Loss 0.1617 |
INFO:torch-log:[Training]| Epochs 106 | Batch    19 /   157 | ms/batch 420.58 | Loss 0.1605 |
INFO:torch-log:[Training]| Epochs 106 | Batch    20 /   157 | ms/batch 427.79 | Loss 0.1529 |
INFO:torch-log:[Training]| Epochs 106 | Batch    21 /   157 | ms/batch 409.54 | Loss 0.1590 |
INFO:torch-log:[Training]| Epochs 106 | Batch    22 /   157 | ms/batch 432.42 | Loss 0.1620 |
INFO:torch-log:[Training]| Epochs 106 | Batch    23 /   157 

In [4]:
# Key of the trained run evaluated in this notebook: 1605474009

# Re-create the shared "torch-log" logger so evaluation writes to its own file.
test_log_path = "logs/test-{0}.log".format(time.asctime())
logger = dh.logger_fn("torch-log", test_log_path)

# Keep prompting until the answer is a purely numeric key of exactly 10 characters.
MODEL = input("☛ Please input the model file you want to test: ")
while True:
    if MODEL.isdigit() and len(MODEL) == 10:
        break
    MODEL = input("✘ The format of your input is illegal, it should be like(1490175368), please re-input: ")
logger.info("✔︎ The format of your input is legal, now loading to next step...")

# Resolve the key to the saved model file through the project helper.
MODEL_DIR = dh.load_model_file(MODEL)


def _top_k_indices(scores, k):
    """Return the positions of the ``k`` highest scores, best first.

    Ties are resolved in favour of the lower position, i.e. the same order as
    repeatedly taking the first occurrence of the maximum. When ``scores`` has
    fewer than ``k`` entries, every position is returned exactly once.
    """
    # sorted() stays stable with reverse=True, so equal scores keep input order.
    return sorted(range(len(scores)), key=scores.__getitem__, reverse=True)[:k]


def test(seed=None):
    """Evaluate the saved DREAM model with Hit-ratio@k and NDCG@k.

    For every user yielded by ``dh.batch_iter`` over the training baskets, the
    latest dynamic user representation is scored against that user's test
    items (positives) and ``neg_num`` randomly drawn negative items. The
    ``top_k`` best-scoring candidates are then compared with the positives.

    Args:
        seed: Optional seed for the negative sampling. ``None`` (default) uses
            the global ``random`` module state, as before; pass an int to make
            the sampled negatives, and therefore the metrics, reproducible.

    Returns:
        A list with one entry per evaluated user:
        ``[uid, top_k_indices, positive_indices, hit_fraction]``. Indices refer
        to the candidate list, where positions ``0 .. p_length - 1`` are the
        positives and the remaining positions are the sampled negatives.

    Side effects:
        Logs progress through ``logger`` and prints both aggregate metrics.
    """
    # Read the configuration once instead of rebuilding it for every user.
    config = Config()
    batch_size = config.batch_size
    top_k = config.top_k
    neg_num = config.neg_num
    rng = random.Random(seed) if seed is not None else random

    # Load data
    logger.info("✔︎ Loading data...")

    logger.info("✔︎ Training data processing...")
    train_data = dh.load_data(config.TRAININGSET_DIR)

    logger.info("✔︎ Test data processing...")
    test_data = dh.load_data(config.TESTSET_DIR)

    logger.info("✔︎ Load negative sample...")
    # NOTE(security): pickle.load can execute arbitrary code; only point
    # NEG_SAMPLES at files produced by this project.
    with open(config.NEG_SAMPLES, 'rb') as handle:
        neg_samples = pickle.load(handle)

    # Load model
    dr_model = torch.load(MODEL_DIR)
    dr_model.eval()

    item_embedding = dr_model.encode.weight
    # NOTE(review): the hidden state is sized for a full batch; presumably
    # batch_iter only yields batches of exactly batch_size users -- confirm.
    hidden = dr_model.init_hidden(batch_size)

    # Index the test positives by user once, rather than scanning the whole
    # test frame for every user. setdefault keeps the first row per user,
    # matching the previous ``.values[0]`` lookup.
    positives_by_user = {}
    for user_id, user_baskets in zip(test_data['userID'], test_data['baskets']):
        positives_by_user.setdefault(user_id, user_baskets)

    # Rank discounts 1 / log2(rank + 2) are identical for every user.
    discounts = [1 / math.log(rank + 1 + 1, 2) for rank in range(top_k)]
    # NOTE(review): the ideal DCG treats all top_k slots as relevant, even for
    # users with fewer than top_k positives, so NDCG can never reach 1 for
    # them. Kept unchanged so results stay comparable with earlier runs.
    u_idcg = 0
    for discount in discounts:
        u_idcg += discount

    hitratio_numer = 0
    hitratio_denom = 0
    ndcg = 0.0
    results = []

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        for uids, baskets, lens in dh.batch_iter(train_data, batch_size, config.seq_len, shuffle=False):
            dynamic_user, _ = dr_model(baskets, lens, hidden)
            for uid, l, du in zip(uids, lens, dynamic_user):
                # Representation after the user's last real basket.
                du_latest = du[l - 1].unsqueeze(0)

                # Positives: this user's test items (list dim 1).
                positives = positives_by_user[uid]
                p_length = len(positives)
                positives = torch.LongTensor(positives)
                scores = list(torch.mm(du_latest, item_embedding[positives].t()).detach().numpy()[0])

                # Negatives: items sampled from this user's negative pool.
                negatives = torch.LongTensor(rng.sample(list(neg_samples[uid]), neg_num))
                scores.extend(torch.mm(du_latest, item_embedding[negatives].t()).detach().numpy()[0])

                # Hit ratio: candidate positions below p_length are positives.
                # Ranking by sort avoids the former -9999 sentinel, which
                # mis-ranked scores <= -9999 and repeated indices when fewer
                # than top_k candidates were available.
                index_k = _top_k_indices(scores, top_k)
                positive_slots = set(np.arange(0, p_length))
                hits = len(positive_slots & set(index_k))
                single_hit = hits / p_length if p_length else 0.0
                results.append([uid, index_k, positive_slots, single_hit])
                hitratio_numer += hits
                hitratio_denom += p_length

                # NDCG: discounted gain of the positives found in the top-k.
                u_dcg = 0
                for rank, index in enumerate(index_k):
                    if index < p_length:
                        u_dcg += discounts[rank]
                ndcg += u_dcg / u_idcg

    hitratio = hitratio_numer / hitratio_denom if hitratio_denom else 0.0
    # NOTE(review): averages over len(train_data); assumes batch_iter visits
    # every training row exactly once -- confirm.
    ndcg = ndcg / len(train_data) if len(train_data) else 0.0
    print('Hit ratio[{0}]: {1}'.format(top_k, hitratio))
    print('NDCG[{0}]: {1}'.format(top_k, ndcg))
    return results

# Run the evaluation and keep the per-user rows returned by test() for the
# analysis cells below. In a notebook kernel __name__ is '__main__', so this
# executes whenever the cell is run.
if __name__ == '__main__':
    results_ = test()

☛ Please input the model file you want to test: 1605474009


INFO:torch-log:✔︎ The format of your input is legal, now loading to next step...
INFO:torch-log:✔︎ Loading data...
INFO:torch-log:✔︎ Training data processing...
INFO:torch-log:✔︎ Test data processing...
INFO:torch-log:✔︎ Load negative sample...


Hit ratio[15]: 0.4787466836172534
NDCG[15]: 0.3686306380642009


In [5]:
# One row per evaluated user, built from the lists returned by test():
# user id, top-k candidate indices, positive candidate indices, hit fraction.
result_df = pd.DataFrame(results_, columns=['UserID', 'Prediction', 'Actual', 'Hit-Ratio'])

In [6]:
def confi_cut(score):
    """Map a score to the label of the 0.2-wide bucket that contains it.

    Buckets are half-open, ``[lower, upper)``, for 0.0 up to 0.8. Any score
    that matches none of them (0.8 and above, but also negative values and
    NaN) is labelled ``'0.8-1.0'``.
    """
    buckets = (
        (0.0, 0.2, '0-0.2'),
        (0.2, 0.4, '0.2-0.4'),
        (0.4, 0.6, '0.4-0.6'),
        (0.6, 0.8, '0.6-0.8'),
    )
    for lower, upper, label in buckets:
        if lower <= score < upper:
            return label
    # Catch-all bucket for everything not matched above.
    return '0.8-1.0'

In [7]:
# Label every user with the bucket of their hit fraction.
result_df['confi'] = result_df['Hit-Ratio'].apply(confi_cut)

In [8]:
# Number of rows (evaluated users) that fall into each hit-fraction bucket.
result_df['confi'].value_counts()

0.4-0.6    48859
0.6-0.8    33118
0.2-0.4    32967
0.8-1.0    28707
0-0.2      13349
Name: confi, dtype: int64

In [9]:
# Same bucket breakdown, expressed as a share of all rows.
result_df['confi'].value_counts(normalize=True)

0.4-0.6    0.311204
0.6-0.8    0.210943
0.2-0.4    0.209981
0.8-1.0    0.182847
0-0.2      0.085025
Name: confi, dtype: float64

In [10]:
# Persist the per-user results. to_csv does not create missing parent
# directories, so make sure the output folder exists first.
os.makedirs('results', exist_ok=True)
result_df.to_csv('results/final_all_data_result.csv', index=False)