## Deep Interest Network

https://arxiv.org/pdf/1706.06978.pdf

https://github.com/shenweichen/DeepCTR-Torch

In [1]:
import sys, os, json, time
import numpy as np
import pandas as pd
import torch
from deepctr_torch.inputs import (DenseFeat, SparseFeat, VarLenSparseFeat,
                                  get_feature_names)
from deepctr_torch.models.din import DIN

In [2]:
sys.path.append("src/")
from constants import *

### prepare history movieIds

In [3]:
# read raw ratings data
%time df_train = pd.read_hdf(TRAIN_FN.format(1), key='stage')
%time df_test = pd.read_hdf(TEST_FN.format(1), key='stage')

CPU times: user 6.87 s, sys: 3.19 s, total: 10.1 s
Wall time: 11.1 s
CPU times: user 50.9 ms, sys: 36.6 ms, total: 87.4 ms
Wall time: 89.5 ms


In [10]:
# read user2idx and item2idx
print('userid mapping')
user2idx = json.load(open(USER2IDX_FN))
df_train['mapped_user'] = df_train['User'].apply(
    lambda x: user2idx[str(x)])
df_test['mapped_user'] = df_test['User'].apply(
    lambda x: user2idx[str(x)])
del user2idx

print('movieid mapping')
item2idx = json.load(open(ITEM2IDX_FN))
df_train['mapped_movie'] = df_train['Movie'].apply(
    lambda x: item2idx[str(x)])
df_test['mapped_movie'] = df_test['Movie'].apply(
    lambda x: item2idx[str(x)])
del item2idx

userid mapping
movieid mapping


In [12]:
df_train.sort_values(by=['mapped_user', 'Date'], ascending=[True, True],
                     inplace=True)
df_test.sort_values(by=['mapped_user', 'Date'], ascending=[True, True],
                     inplace=True)

In [13]:
def mark_last_timestamp(df):
    user, movie = 'mapped_user', 'mapped_movie'
    last = df[[user, movie]].groupby(
        by=user, as_index=False).tail(1).copy()
    last['last'] = 1
    df = pd.merge(df, last, how='left', on=[user, movie])
    df.loc[~df['last'].isnull(), 'last'] = 1
    df.loc[df['last'].isnull(), 'last'] = 0
    
    return df

In [14]:
df_train = mark_last_timestamp(df_train)
df_test = mark_last_timestamp(df_test)

In [18]:
item2idx = json.load(open(ITEM2IDX_FN))
candidate_movie_ids = list(item2idx.values())
del item2idx
print(len(candidate_movie_ids))

17770


In [20]:
def neg_sampling(candidates, filters, length):
    max_len = len(candidates)
    
    res = []
    for i in range(length):
        while(1):
            c = candidates[np.random.randint(0, max_len)]
            if c not in filters:
                res.append(str(c))
                filters.add(c)
                break
    return res


def get_hist_movie_ids(df, candidates, max_len=10):
    hist_movie_ids = list()
    neg_hist_movie_ids = list()
    for _, group in df.groupby(by='mapped_user'):
        tmp_hist_movie_ids = list()
        for _, row in group.iterrows():
            # keep high rated movies
            if row['Rating'] >= 4 and row['last'] == 0:
                tmp_hist_movie_ids.append(str(int(row['mapped_movie'])))
        # keep latest high rated movies
        tmp_hist_movie_ids.reverse()
        tmp_hist_movie_ids = tmp_hist_movie_ids[:max_len]
        # revert to timestamp order
        tmp_hist_movie_ids.reverse()
        tmp_neg_hist_movie_ids = neg_sampling(
            candidates, set(hist_movie_ids), len(tmp_hist_movie_ids))
        hist_movie_ids.append('|'.join(tmp_hist_movie_ids))
        neg_hist_movie_ids.append('|'.join(tmp_neg_hist_movie_ids))
    return hist_movie_ids, neg_hist_movie_ids

In [23]:
print('train')
start = time.time()
train_hist_movie_ids, train_neg_hist_movie_ids = get_hist_movie_ids(
    df_train, candidate_movie_ids, 10)
print('time taken: %0.2f' % (time.time() - start))

print('test')
start = time.time()
test_hist_movie_ids, test_neg_hist_movie_ids = get_hist_movie_ids(
    df_test, candidate_movie_ids, 10)
print('time taken: %0.2f' % (time.time() - start))

train
time taken: 14240.11
test
time taken: 295.01


In [35]:
print(len(train_hist_movie_ids), len(train_neg_hist_movie_ids),
      train_hist_movie_ids[0])
print('\n')
print(len(test_hist_movie_ids), len(test_neg_hist_movie_ids),
      test_hist_movie_ids[:20])

469941 469941 4314|4431|405|2685|132|2657|2642|527|2456|413


143001 143001 ['', '', '', '', '', '2464|3712', '2912', '', '4066', '', '', '', '3488|4097|2313', '3714', '', '', '', '787', '', '']


In [57]:
df_train1 = df_train[df_train['last'] == 1]
df_train1['histHighRatedMovieIds'] = train_hist_movie_ids
df_train1['negHistMovieIds'] = train_neg_hist_movie_ids

df_test1 = df_test[df_test['last'] == 1]
df_test1['histHighRatedMovieIds'] = test_hist_movie_ids
df_test1['negHistMovieIds'] = test_neg_hist_movie_ids

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

Se

In [58]:
print(df_train1.shape, df_test1.shape)
print(df_train1['mapped_user'].nunique(),
      df_train['mapped_user'].nunique())
print(df_test1['mapped_user'].nunique(),
      df_test['mapped_user'].nunique())

(469941, 13) (143001, 13)
469941 469941
143001 143001


In [65]:
for col in ['histHighRatedMovieIds', 'negHistMovieIds']:
    df_train1[col] = df_train1[col].apply(lambda x: x.split('|'))
    df_test1[col] = df_test1[col].apply(lambda x: x.split('|'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [68]:
def add_mask_value(inp_lst, zero_val, max_len=10):
    out_lst = []
    for inp in inp_lst:
        if inp == '':
            out = 0
        elif int(inp) == 0:
            out = zero_val
        elif int(inp) != 0:
            out = int(inp)
        out_lst.append(out)
    length = len(out_lst)
    if length == max_len:
        return out_lst
    else:
        return out_lst + [0]*(max_len - length)

In [69]:
# mask value = 0. So, replace movie id 0 with the max(candidate_movie_ids)

print('train - mapping zero in mapped_movie')
mask = df_train1['mapped_movie'] == 0
df_train1.loc[mask, 'mapped_movie'] = max(candidate_movie_ids)+1
print(df_train1['mapped_movie'].min())

print('test - mapping zero in mapped_movie')
mask = df_test1['mapped_movie'] == 0
df_test1.loc[mask, 'mapped_movie'] = max(candidate_movie_ids)+1
print(df_test1['mapped_movie'].min())

print('mapping zero in histHighRatedMovieIds and negHistMovieIds')
for col in ['histHighRatedMovieIds', 'negHistMovieIds']:
    print('Col: ', col)
    print('\n')
    print('Train:\n')
    df_train1[col] = df_train1[col].apply(
        lambda x: add_mask_value(x, max(candidate_movie_ids)+1))
    print('\n')
    print('Test:\n')
    df_test1[col] = df_test1[col].apply(
        lambda x: add_mask_value(x, max(candidate_movie_ids)+1))
    print('\n')

train - mapping zero in mapped_movie
1
test - mapping zero in mapped_movie
1
mapping zero in histHighRatedMovieIds and negHistMovieIds
Col:  histHighRatedMovieIds


Train:



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Test:



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Col:  negHistMovieIds


Train:



Test:





In [73]:
# create and save dictionary for later use
train_histHighRatedMovieIds = dict(zip(df_train1['mapped_user'],
    df_train1['histHighRatedMovieIds']))
test_histHighRatedMovieIds = dict(zip(df_test1['mapped_user'],
    df_test1['histHighRatedMovieIds']))

train_negHistMovieIds = dict(zip(df_train1['mapped_user'],
    df_train1['negHistMovieIds']))
test_negHistMovieIds = dict(zip(df_test1['mapped_user'],
    df_test1['negHistMovieIds']))

In [76]:
# save
print('train_histHighRatedMovieIds\n')
out_fn = os.path.join(MOVIE_METADATA_DIR,
                      'train_histHighRatedMovieIds.json')
json.dump(train_histHighRatedMovieIds, open(out_fn, 'w'))

print('train_negHistMovieIds\n')
out_fn = os.path.join(MOVIE_METADATA_DIR,
                      'train_negHistMovieIds.json')
json.dump(train_negHistMovieIds, open(out_fn, 'w'))

print('test_histHighRatedMovieIds\n')
out_fn = os.path.join(MOVIE_METADATA_DIR,
                      'test_histHighRatedMovieIds.json')
json.dump(test_histHighRatedMovieIds, open(out_fn, 'w'))

print('test_negHistMovieIds\n')
out_fn = os.path.join(MOVIE_METADATA_DIR,
                      'test_negHistMovieIds.json')
json.dump(test_negHistMovieIds, open(out_fn, 'w'))

train_histHighRatedMovieIds

train_negHistMovieIds

test_histHighRatedMovieIds

test_negHistMovieIds



### Data Preparation

In [3]:
# Globals
N_USERS = 480189
N_ITEMS = 17770
HIST_LEN = 10

In [4]:
# read train and test data
train_fn = os.path.join(PREPARED_DATA_DIR, 'user_train_data_1.h5')
df_train = pd.read_hdf(train_fn, key='stage')

test_fn = os.path.join(PREPARED_DATA_DIR, 'user_test_data_1.h5')
df_test = pd.read_hdf(test_fn, key='stage')

In [5]:
print(df_train.shape)
print(df_test.shape)

df_train.head()

(22851074, 20)
(240538, 20)


Unnamed: 0,User,Rating,Date,Movie,Rating_class,days_since_first_user_rating,sqrt_days_since_first_user_rating,rating_age_days_user,rating_age_weeks_user,rating_age_months_user,mean_ratings_user,num_ratings_user,days_since_first_item_rating,sqrt_days_since_first_item_rating,rating_age_days_item,rating_age_weeks_item,rating_age_months_item,mean_ratings_movie,weighted_mean_ratings_movie,num_ratings_movie
0,161459,4.0,2004-07-17,2138,0,23,4.795832,251,35.857143,8.366667,3.396365,28,1611,40.137264,2143,306.142857,71.433333,3.526814,3.527663,21220
1,87375,2.0,2004-03-14,3253,0,13,3.605551,617,88.142857,20.566667,4.3337,163,395,19.874607,1052,150.285714,35.066667,2.977046,2.979649,59554
2,191296,2.0,2005-12-23,1154,0,453,21.283797,455,65.0,15.166667,3.955031,108,507,22.51666,514,73.428571,17.133333,3.818879,3.790705,1695
3,27266,5.0,2004-09-26,1201,1,15,3.872983,429,61.285714,14.3,3.757806,124,1754,41.880783,2215,316.428571,73.833333,3.771652,3.77108,74899
4,175666,3.0,2004-08-03,4377,0,446,21.118712,835,119.285714,27.833333,3.280928,51,565,23.769729,1080,154.285714,36.0,3.48806,3.518392,670


In [6]:
# add histmovieids to the dataframes
inp_fn = os.path.join(MOVIE_METADATA_DIR,
                      'train_histHighRatedMovieIds.json')
train_histHighRatedMovieIds = json.load(open(inp_fn))

inp_fn = os.path.join(MOVIE_METADATA_DIR,
                      'test_histHighRatedMovieIds.json')
test_histHighRatedMovieIds = json.load(open(inp_fn))

df_train['histHighRatedMovieIds'] = df_train['User'].apply(
    lambda x: train_histHighRatedMovieIds[str(x)])
df_test['histHighRatedMovieIds'] = df_test['User'].apply(
    lambda x: test_histHighRatedMovieIds[str(x)])
del train_histHighRatedMovieIds, test_histHighRatedMovieIds

In [7]:
print('train - mapping zero in Movie col')
print(df_train['Movie'].min())
mask = df_train['Movie'] == 0
df_train.loc[mask, 'Movie'] = N_ITEMS
print(df_train['Movie'].min())

print('test - mapping zero in Movie col')
print(df_test['Movie'].min())
mask = df_test['Movie'] == 0
df_test.loc[mask, 'Movie'] = N_ITEMS
print(df_test['Movie'].min())

train - mapping zero in Movie col
0
1
test - mapping zero in Movie col
0
1


In [8]:
# save item baseline dict to create hist features
dense_movie_features = [
    'days_since_first_item_rating',
    'sqrt_days_since_first_item_rating',
    'rating_age_days_item', 'rating_age_weeks_item',
    'rating_age_months_item', 'mean_ratings_movie',
    'weighted_mean_ratings_movie', 'num_ratings_movie']

d = {}
for col in dense_movie_features:
    print(col)
    d[col] = {}
    tmp = df_train[['Movie', col]]
    tmp.drop_duplicates(inplace=True)
    d[col] = dict(zip(tmp['Movie'], tmp[col]))
    del tmp
    print('\n')
    
print(len(d))

days_since_first_item_rating


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  




sqrt_days_since_first_item_rating


rating_age_days_item


rating_age_weeks_item


rating_age_months_item


mean_ratings_movie


weighted_mean_ratings_movie


num_ratings_movie


8


In [9]:
# filter by latest timestamp
tmp = df_train.groupby(['User'])['Date'].max().rename(
    'Date').reset_index()
print(tmp.shape)
df_train = pd.merge(df_train, tmp, on=['User', 'Date'],
                    how='inner')
print(df_train.shape)
del tmp

(469941, 2)
(1419479, 21)


In [10]:
# create hist feature for item baseline features
def create_hist_feats_movie_baseline(baseline_dct, col, inp_lst):
    out_lst = []
    for inp in inp_lst:
        if inp == 0:
            out_lst.append(0)
        else:
            out_lst.append(baseline_dct[col][inp])
    return out_lst


for col in dense_movie_features:
    print(col, '\n')
    print('Train\n')
    df_train['hist_'+col] = df_train['histHighRatedMovieIds'].apply(
        lambda x: create_hist_feats_movie_baseline(d, col, x))
    print('\n')
    print('Test\n')
    df_test['hist_'+col] = df_test['histHighRatedMovieIds'].apply(
        lambda x: create_hist_feats_movie_baseline(d, col, x))
    print('\n')

days_since_first_item_rating 

Train



Test



sqrt_days_since_first_item_rating 

Train



Test



rating_age_days_item 

Train



Test



rating_age_weeks_item 

Train



Test



rating_age_months_item 

Train



Test



mean_ratings_movie 

Train



Test



weighted_mean_ratings_movie 

Train



Test



num_ratings_movie 

Train



Test





In [11]:
#df_train.drop(dense_movie_features, axis=1, inplace=True)
#df_test.drop(dense_movie_features, axis=1, inplace=True)
df_train.shape, df_test.shape

((1419479, 29), (240538, 29))

In [12]:
del d

In [13]:
sparse_features = ['User', 'Movie']
dense_features = [
'days_since_first_user_rating',
'sqrt_days_since_first_user_rating',
'rating_age_days_user', 'rating_age_weeks_user',
'rating_age_months_user', 'mean_ratings_user',
'num_ratings_user', 'days_since_first_item_rating',
'sqrt_days_since_first_item_rating',
'rating_age_days_item', 'rating_age_weeks_item',
'rating_age_months_item', 'mean_ratings_movie',
'weighted_mean_ratings_movie', 'num_ratings_movie']
hist_features = ['hist_Movie']
target = 'Rating'

#### simple Transformation for dense features

In [14]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler(feature_range=(0, 1))
print('Train')
%time df_train[dense_features] = mms.fit_transform(df_train[dense_features])

print('Test')
%time df_test[dense_features] = mms.transform(df_test[dense_features])

Train


  return self.partial_fit(X, y)


CPU times: user 3.71 s, sys: 1.75 s, total: 5.46 s
Wall time: 5.56 s
Test
CPU times: user 541 ms, sys: 275 ms, total: 816 ms
Wall time: 820 ms


In [15]:
df_train.rename(columns={'histHighRatedMovieIds': 'hist_Movie'},
                inplace=True)
df_test.rename(columns={'histHighRatedMovieIds': 'hist_Movie'},
               inplace=True)

In [50]:
sparse_features_count = [N_USERS, N_ITEMS+1]
feature_columns = [SparseFeat(
    name=feat, vocabulary_size=sparse_features_count[i],
    embedding_dim=70) for i, feat in enumerate(sparse_features)]

feature_columns += [DenseFeat(feat, 1) for feat in dense_features]

feature_columns += [VarLenSparseFeat(
    SparseFeat(name='hist_Movie', vocabulary_size=N_ITEMS+1,
               embedding_dim=70), HIST_LEN)]

In [51]:
feature_columns

[SparseFeat(name='User', vocabulary_size=480189, embedding_dim=70, use_hash=False, dtype='int32', embedding_name='User', group_name='default_group'),
 SparseFeat(name='Movie', vocabulary_size=17771, embedding_dim=70, use_hash=False, dtype='int32', embedding_name='Movie', group_name='default_group'),
 DenseFeat(name='days_since_first_user_rating', dimension=1, dtype='float32'),
 DenseFeat(name='sqrt_days_since_first_user_rating', dimension=1, dtype='float32'),
 DenseFeat(name='rating_age_days_user', dimension=1, dtype='float32'),
 DenseFeat(name='rating_age_weeks_user', dimension=1, dtype='float32'),
 DenseFeat(name='rating_age_months_user', dimension=1, dtype='float32'),
 DenseFeat(name='mean_ratings_user', dimension=1, dtype='float32'),
 DenseFeat(name='num_ratings_user', dimension=1, dtype='float32'),
 DenseFeat(name='days_since_first_item_rating', dimension=1, dtype='float32'),
 DenseFeat(name='sqrt_days_since_first_item_rating', dimension=1, dtype='float32'),
 DenseFeat(name='ratin

In [52]:
behavior_feature_list = ["Movie"]

In [53]:
feature_names = get_feature_names(feature_columns)
train_model_input = {name: np.array(df_train[name].values.tolist())
                     for name in feature_names}
test_model_input = {name: np.array(df_test[name].values.tolist())
                    for name in feature_names}

In [54]:
print(len(train_model_input), len(test_model_input),
      train_model_input['hist_Movie'].shape)

18 18 (1419479, 10)


In [55]:
train_y = df_train[target].values
test_y = df_test[target].values

In [56]:
train_model_input['hist_Movie'], test_y

(array([[2782, 3105, 2199, ...,    0,    0,    0],
        [2782, 3105, 2199, ...,    0,    0,    0],
        [2782, 3105, 2199, ...,    0,    0,    0],
        ...,
        [1541, 4305,    0, ...,    0,    0,    0],
        [  32,  667, 3522, ..., 4379, 3934, 2289],
        [3924, 1328, 3289, ...,  196, 2371, 3281]]),
 array([3., 3., 4., ..., 4., 2., 5.]))

In [57]:
device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'
    
device

'cpu'

In [58]:
model = DIN(feature_columns, behavior_feature_list, device=device,
            task='regression', l2_reg_embedding=1e-3,
            att_activation='Dice', dnn_use_bn=True, l2_reg_dnn=1e-3,
            dnn_dropout=0.4)

In [59]:
model

DIN(
  (embedding_dict): ModuleDict(
    (Movie): Embedding(17771, 70)
    (User): Embedding(480189, 70)
    (hist_Movie): Embedding(17771, 70)
  )
  (linear_model): Linear(
    (embedding_dict): ModuleDict()
  )
  (out): PredictionLayer()
  (attention): AttentionSequencePoolingLayer(
    (local_att): LocalActivationUnit(
      (dnn): DNN(
        (dropout): Dropout(p=0, inplace=False)
        (linears): ModuleList(
          (0): Linear(in_features=280, out_features=64, bias=True)
          (1): Linear(in_features=64, out_features=16, bias=True)
        )
        (activation_layers): ModuleList(
          (0): Sigmoid()
          (1): Sigmoid()
        )
      )
      (dense): Linear(in_features=16, out_features=1, bias=True)
    )
  )
  (dnn): DNN(
    (dropout): Dropout(p=0.4, inplace=False)
    (linears): ModuleList(
      (0): Linear(in_features=225, out_features=256, bias=True)
      (1): Linear(in_features=256, out_features=128, bias=True)
    )
    (bn): ModuleList(
      (0): 

In [60]:
model.compile("adam", "mse", metrics=['mse'])

In [61]:
start = time.time()
model.fit(train_model_input, train_y,
          batch_size = 3000, epochs = 3,
          validation_data = (test_model_input, test_y), verbose=2)
print('time taken: %0.2f' % (time.time() - start))

cpu
Train on 1419479 samples, validate on 240538 samples, 474 steps per epoch
Epoch 1/3
597s - loss:  1.7025 - mse:  1.7016 - val_mse:  0.8657
Epoch 2/3
572s - loss:  0.7874 - mse:  0.7875 - val_mse:  1.0144


KeyboardInterrupt: 

In [29]:
model.metrics.items()

dict_items([('mse', <function mean_squared_error at 0x1361df950>)])

In [62]:
np.sqrt(0.8657)

0.9304300081145277