In [22]:
import re
import ast
import itertools
from collections import Counter
import pandas as pd
import numpy as np
import scipy.sparse as sparse

from keras.preprocessing.text import one_hot
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import plot_model

from IPython.display import Image
import matplotlib.pyplot as plt
from tqdm import tqdm

In [23]:
# Column names of the review table loaded from dataPath + dataName.
itemIndex = 'ItemIndex'    # item index column
userIndex = 'UserIndex'    # user index column
rating = 'review/overall'  # raw overall review score column
binary = 'Binary'          # 0/1 label column; getArrays() uses it as the rating target
keyPhrase = 'keyVector'    # column holding a stringified list of key-phrase ids
# Input file locations (relative to the notebook's working directory).
dataPath = 'data/beer/advocate/'
dataName = 'Data.csv'
KeyPhraseName = 'KeyPhrases.csv'
# Model hyper-parameters.
embedding_dims = 100  # width of the user and item embedding vectors
num_keyPhrases = 100  # number of key-phrase columns / key-phrase outputs

In [24]:
# Load the review table and preview its first rows.
# NOTE(review): the 'Unnamed: 0' / 'Unnamed: 0.1' columns in the preview look
# like index columns written by earlier to_csv calls — confirm and drop upstream.
df = pd.read_csv(dataPath+dataName)
df.head()

Unnamed: 0.1,Unnamed: 0,beer/ABV,beer/style,review/overall,review/timeUnix,keyVector,UserIndex,ItemIndex,Binary
0,0,5.0,Hefeweizen,1.5,1234817823,[],10907.0,2809,0
1,1,6.2,English Strong Ale,3.0,1235915097,[],10907.0,2650,0
2,2,6.5,Foreign / Export Stout,3.0,1235916604,[],10907.0,463,0
3,3,5.0,German Pilsener,3.0,1234725145,[40],10907.0,2808,0
4,4,7.7,American Double / Imperial IPA,4.0,1293735206,"[21, 41, 72]",8278.0,709,0


In [25]:
# Distinct user / item counts; build_model() uses them as the embedding table sizes.
# NOTE(review): that is only safe if UserIndex / ItemIndex are contiguous
# 0..n-1 over the whole table — confirm against the preprocessing step.
num_user = df[userIndex].nunique()
num_item = df[itemIndex].nunique()
print("Number of User: {0}".format(num_user))
print("Number of Item: {0}".format(num_item))
print("Number of key Phrase: {0}".format(num_keyPhrases))

Number of User: 11950
Number of Item: 3688
Number of key Phrase: 100


In [26]:
def getSparseMatrixFromLists(keyvector, num_cols=None):
    """Turn per-row lists of column ids into a sparse indicator matrix.

    Args:
        keyvector: sequence of lists; ``keyvector[i]`` holds the column ids
            that are set in row ``i``. A row may be an empty list.
        num_cols: number of columns of the result. Defaults to the
            notebook-level ``num_keyPhrases`` when omitted.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape ``(len(keyvector), num_cols)``
        with float entries. An id that is repeated inside one row is summed
        by the CSR constructor, so that cell ends up greater than 1.
    """
    if num_cols is None:
        num_cols = num_keyPhrases

    row_ids = [i for i, phrase_ids in enumerate(keyvector) for _ in phrase_ids]
    col_ids = [j for phrase_ids in keyvector for j in phrase_ids]

    # Explicit 1-D integer arrays keep the constructor valid even when no row
    # has any id; the previous 2-D slicing (`indecs[:, 0]`) raised IndexError
    # in that case because np.array([]) is one-dimensional.
    rows = np.asarray(row_ids, dtype=np.int64)
    cols = np.asarray(col_ids, dtype=np.int64)
    return sparse.csr_matrix((np.ones(len(rows)), (rows, cols)),
                             shape=(len(keyvector), num_cols))

In [27]:
# Random row-level split: each row goes to train with probability 0.8.
# A dedicated, seeded generator makes the split identical on every
# Restart & Run All; the previous unseeded draw changed the split (and every
# downstream number) on each run.
split_rng = np.random.RandomState(42)
msk = split_rng.rand(len(df)) < 0.8
train_df = df[msk]
test_df = df[~msk]

In [42]:
def getArrays(df):
    """Convert a review frame into the arrays consumed by the model.

    Args:
        df: DataFrame containing the ``userIndex``, ``itemIndex``, ``binary``
            and ``keyPhrase`` columns. The ``keyPhrase`` column stores each
            list of key-phrase ids as a string literal (e.g. ``"[21, 41]"``).

    Returns:
        ``[users, items, ratings, keys]`` where the first three are 1-D
        arrays taken straight from the columns and ``keys`` is the dense
        (``numpy.matrix``) key-phrase indicator matrix, one row per review.
    """
    # `.values` replaces `Series.as_matrix()`, which was deprecated in
    # pandas 0.23 and removed in pandas 1.0.
    users = df[userIndex].values
    items = df[itemIndex].values
    ratings = df[binary].values
    # The id lists come back from the CSV as strings; literal_eval parses them
    # without executing arbitrary code.
    key_lists = df[keyPhrase].apply(ast.literal_eval).values.tolist()
    keys = getSparseMatrixFromLists(key_lists).todense()
    return [users, items, ratings, keys]

# Materialise the model-ready arrays for both splits.
train_array, test_array = (getArrays(split) for split in (train_df, test_df))

In [44]:
# Inspect the test split: [users, items, binary ratings, dense key-phrase matrix].
test_array

[array([10907.,  2936.,  3702., ...,  2074.,   108.,  3566.]),
 array([ 463,  669,  669, ..., 1943, 1943,  873]),
 array([0, 1, 0, ..., 0, 0, 0]),
 matrix([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]])]

## Negative Sampling
Use negative sampling only when the data is implicit feedback!

In [60]:
def negative_data(df, size_per_user):
    """Sample unobserved (user, item) pairs to serve as negative examples.

    For every user id present in ``df``, up to ``size_per_user`` distinct
    items are drawn from the items present in ``df``; draws the user has
    already interacted with are discarded, so a user may end up with fewer
    than ``size_per_user`` negatives.

    Users and candidate items are taken from the ids that actually occur in
    ``df``. The previous ``range(nunique())`` loops silently assumed both id
    sets were contiguous ``0..n-1`` inside this (sub-sampled) frame.

    Args:
        df: DataFrame with the ``userIndex`` and ``itemIndex`` columns.
        size_per_user: number of items drawn per user before filtering.
            Capped at the number of distinct items so the draw without
            replacement cannot fail.

    Returns:
        ``[users, items, ratings, keys]``: parallel 1-D arrays of user ids,
        item ids and all-zero ratings, plus an all-zero
        ``(n_samples, num_keyPhrases)`` key-phrase matrix.
    """
    item_pool = df[itemIndex].unique()
    draw_size = min(size_per_user, len(item_pool))

    users = []
    items = []

    # One groupby pass yields every user's observed items; filtering the whole
    # frame once per user made the loop quadratic in practice.
    for user, observed_items in df.groupby(userIndex)[itemIndex]:
        sampled_items = np.random.choice(item_pool, draw_size, replace=False)
        sampled_items = sampled_items[~np.isin(sampled_items, observed_items.values)]
        # The id column may be stored as float; emit plain ints as before.
        users += [int(user)] * len(sampled_items)
        items += sampled_items.tolist()

    ratings = [0] * len(users)
    keys = np.zeros([len(users), num_keyPhrases])

    return [np.array(users), np.array(items), np.array(ratings), keys]
# Draw 100 candidate negatives per user from the training split.
# NOTE: this rebinds the name `negative_data` from the function to its result,
# so the sampler cannot be called again until the `def` above is re-run.
negative_data = negative_data(train_df, 100)

In [61]:
def concateData(positive, negative):
    """Stack the positive and negative example arrays field by field.

    Args:
        positive: ``[users, items, ratings, keys]`` for the observed examples.
        negative: the same four fields for the sampled negatives.

    Returns:
        A four-element list in which each field is the positive rows followed
        by the negative rows.
    """
    user_col, item_col, rating_col, key_col = (
        np.concatenate([positive[field], negative[field]]) for field in range(4)
    )
    return [np.array(user_col), np.array(item_col), np.array(rating_col), key_col]

# Append the sampled negatives to the training arrays.
# NOTE: not idempotent — `train_array` is both input and output, so re-running
# this cell appends the negatives a second time.
train_array = concateData(train_array, negative_data)

## Model

In [62]:
from keras.models import Model
from keras.layers import *   
from keras.backend import reshape

In [63]:
def build_model():
    """Build and compile the two-headed recommender network.

    A user index and an item index are each embedded into ``embedding_dims``
    dimensions, concatenated, flattened and passed through one hidden Dense
    layer of 128 units (SELU activations, 10% dropout). Two linear Dense
    heads share that hidden representation: ``rating`` (1 unit) and
    ``keyPhrase`` (``num_keyPhrases`` units). Both heads use mean squared
    error; the rating loss is weighted 1.1 against 1.

    Returns:
        The compiled keras ``Model`` taking ``[userIndex, itemIndex]`` and
        producing ``[rating, keyPhrase]``.
    """
    # Local names differ from the module-level column-name constants on
    # purpose, so those constants are not shadowed inside this function.
    user_input = Input(shape=(1,), name='userIndex')
    item_input = Input(shape=(1,), name='itemIndex')

    user_vector = Embedding(num_user, embedding_dims, name='userEmbedding')(user_input)
    item_vector = Embedding(num_item, embedding_dims, name='itemEmbedding')(item_input)

    # Joint (user, item) representation.
    joint = concatenate([user_vector, item_vector])
    features = Flatten()(joint)
    #features = BatchNormalization()(features)
    features = Activation('selu')(features)
    features = Dropout(0.1)(features)

    # Single shared hidden layer.
    features = Dense(128)(features)
    #features = BatchNormalization()(features)
    features = Activation('selu')(features)
    shared = Dropout(0.1)(features)

    # Two output heads on top of the shared representation.
    rating_head = Dense(1, name='rating')(shared)
    keyphrase_head = Dense(num_keyPhrases, name='keyPhrase')(shared)

    net = Model(inputs=[user_input, item_input],
                outputs=[rating_head, keyphrase_head])
    net.compile(optimizer='rmsprop',
                loss=["mean_squared_error", "mean_squared_error"],
                loss_weights=[1.1, 1])

    return net

# Instantiate the compiled network.
model = build_model()

In [64]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
userIndex (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
itemIndex (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
userEmbedding (Embedding)       (None, 1, 100)       1195000     userIndex[0][0]                  
__________________________________________________________________________________________________
itemEmbedding (Embedding)       (None, 1, 100)       368800      itemIndex[0][0]                  
__________________________________________________________________________________________________
concatenat

In [65]:
# plot_model(model, to_file='multiple_outputs.png')
# Image(filename='multiple_outputs.png') 

In [None]:
# Train on (user, item) inputs against (rating, key-phrase) targets and
# evaluate on the held-out split after every epoch.
history = model.fit(
    x=train_array[:2],
    y=train_array[2:],
    validation_data=(test_array[:2], test_array[2:]),
    epochs=10,
    batch_size=32,
)

Train on 1172874 samples, validate on 20075 samples
Epoch 1/10
Epoch 2/10

In [None]:
# # Plot training & validation accuracy values
# plt.plot(history.history['rating_acc'])
# plt.plot(history.history['val_rating_acc'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()
# Rating head: training vs. validation loss, one point per epoch.
for curve in ('rating_loss', 'val_rating_loss'):
    plt.plot(history.history[curve])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [None]:
# # Plot training & validation accuracy values
# plt.plot(history.history['keyPhrase_acc'])
# plt.plot(history.history['val_keyPhrase_acc'])
# plt.title('Model accuracy')
# plt.ylabel('Accuracy')
# plt.xlabel('Epoch')
# plt.legend(['Train', 'Test'], loc='upper left')
# plt.show()
# Key-phrase head: training vs. validation loss, one point per epoch.
for curve in ('keyPhrase_loss', 'val_keyPhrase_loss'):
    plt.plot(history.history[curve])
plt.title('Model loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(['Train', 'Test'], loc='upper left')
plt.show()

In [54]:
# Score every (user, item) pair of the test split with both output heads.
prediction = model.predict(test_array[:2])

In [55]:
# Inspect the raw outputs: [rating predictions, key-phrase score matrix].
prediction

[array([[0.1657393 ],
        [0.3058729 ],
        [0.44386387],
        ...,
        [0.21540907],
        [0.24298677],
        [0.19516508]], dtype=float32),
 array([[ 6.02425961e-03, -8.38455977e-04,  3.53189148e-02, ...,
          5.35007529e-02,  6.96169771e-03,  3.71172912e-02],
        [ 6.08524028e-03,  2.59315479e-04,  1.55964149e-02, ...,
          1.06756799e-02,  1.15493685e-02,  9.66539979e-03],
        [ 5.34568680e-03, -4.37212060e-04,  4.64895321e-03, ...,
         -4.08636406e-03,  1.06381234e-02,  4.31131572e-04],
        ...,
        [ 1.70879066e-05, -1.39816012e-03,  2.68331133e-02, ...,
          5.40661253e-02,  1.30385561e-02,  2.38017552e-03],
        [-1.39652845e-03, -1.07891532e-03,  2.76409313e-02, ...,
          3.31832021e-02,  2.04184018e-02,  1.42875127e-03],
        [-7.47113954e-03,  4.96100774e-03,  4.92741019e-02, ...,
          7.98110515e-02,  1.48157943e-02,  5.00184409e-02]], dtype=float32)]

In [56]:
def PredictTopK(model, topK_item, topK_keyPhrase, keyPhrase, num_users=5):
    """Recommend items for the first users and explain each with key phrases.

    For each user index ``0 .. num_users-1`` the model scores every item
    index ``0 .. num_item-1``. The ``topK_item`` highest-rated items are kept,
    and each is paired with the ``topK_keyPhrase`` key phrases that score
    highest on the model's key-phrase output for that (user, item) pair.

    Args:
        model: trained model whose ``predict`` returns
            ``[rating scores, key-phrase scores]`` for ``[users, items]``.
        topK_item: number of items to keep per user.
        topK_keyPhrase: number of key phrases to keep per recommended item.
        keyPhrase: array of phrase strings, indexed by key-phrase position.
        num_users: how many users (starting from index 0) to process.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        DataFrame with one row per recommendation and the columns
        ``UserIndex``, ``ItemIndex`` and ``Explanation``.
    """
    explain_dicts = []
    # The candidate item list is the same for every user; build it once.
    all_items = np.arange(num_item)
    for i in tqdm(range(num_users)):
        inputs = [np.full(num_item, i), all_items]
        predictions = model.predict(inputs)
        # argsort is ascending, so reverse before taking the first K.
        index_topK = np.argsort(predictions[0].flatten())[::-1][:topK_item]
        for j in index_topK:
            phrase_rank = np.argsort(predictions[1][j].flatten())[::-1][:topK_keyPhrase]
            explain = keyPhrase[phrase_rank]
            explain_dicts.append({'UserIndex': i, 'ItemIndex': j, 'Explanation': explain})

    return pd.DataFrame(explain_dicts)

In [57]:
# Load the key-phrase vocabulary; the 'Phrases' column becomes an array that
# PredictTopK indexes by key-phrase position.
# NOTE(review): this rebinds `df` (previously the review table) and
# `keyPhrase` (previously the column name 'keyVector'), so the earlier cells
# that use them cannot be re-run after this one without restarting from the top.
df = pd.read_csv(dataPath+KeyPhraseName)
keyPhrase = np.array(df['Phrases'].tolist())

In [58]:
# Top-10 items per user, each explained by its 3 highest-scoring key phrases.
df_explain = PredictTopK(model, 10, 3, keyPhrase)

100%|██████████| 5/5 [00:00<00:00,  5.65it/s]


In [59]:
# Show the recommendations together with their key-phrase explanations.
df_explain

Unnamed: 0,Explanation,ItemIndex,UserIndex
0,"[oz bottl, brown head, light brown]",1173,0
1,"[brown head, light brown, oz bottl]",759,0
2,"[oz bottl, brown head, light brown]",861,0
3,"[golden color, gold color, light brown]",1706,0
4,"[brown head, light brown, oz bottl]",2419,0
5,"[brown head, oz bottl, light brown]",1226,0
6,"[brown head, oz bottl, golden color]",1175,0
7,"[brown head, oz bottl, light brown]",1172,0
8,"[oz bottl, golden color, gold color]",2908,0
9,"[brown head, oz bottl, golden color]",1160,0


# Problem
It seems the key phrases selected during data processing are problematic: they are too sparse and not very informative. We should come up with a better way to extract key phrases.