# Matrix Factorisation - Implicit

In [1]:
import sys
sys.path.append("../")

import warnings
warnings.filterwarnings("ignore")

In [None]:
import pandas as pd
from reco.preprocess import encode_user_item, random_split, neg_feedback_samples

In [4]:
%matplotlib inline

## Prepare the data

In [5]:
df_ratings = pd.read_csv("data/ratings.csv")
df_items = pd.read_csv("data/items.csv")
df_implicit = neg_feedback_samples(df_ratings, rating_threshold=3, ratio_neg_per_user=1)
df_implicit["unix_timestamp"] = 1


In [6]:
df_ratings.shape, df_implicit.shape

((100000, 4), (165040, 4))

In [7]:
DATA, user_encoder, item_encoder = encode_user_item(df_implicit, "user_id", "movie_id", "rating", "unix_timestamp")

Number of users:  943
Number of items:  1682


In [8]:
n_users = DATA.USER.nunique()
n_items = DATA.ITEM.nunique()
n_users, n_items

(943, 1682)

In [9]:
max_rating = DATA.RATING.max()
min_rating = DATA.RATING.min()
min_rating, max_rating

(np.int64(0), np.int64(1))

In [10]:
train, test = random_split(DATA, [0.8, 0.2])

In [11]:
train.shape, test.shape

((132032, 7), (33008, 7))

# Implicit Matrix Factorisation

In [27]:
from keras.models import Model
from keras.layers import Input, Embedding, Flatten, Dot
from keras.regularizers import l2
from sklearn.neighbors import NearestNeighbors

In [13]:
def ImplicitMF (n_users, n_items, n_factors):
    
    # Item Layer
    item_input = Input(shape=[1], name='Item')
    item_embedding = Embedding(n_items, n_factors, 
                               embeddings_regularizer=l2(1e-6), 
                               name='ItemEmbedding')(item_input)
    item_vec = Flatten(name='FlattenItemsE')(item_embedding)

    # User Layer
    user_input = Input(shape=[1], name='User')
    user_embedding = Embedding(n_users, n_factors, 
                               embeddings_regularizer=l2(1e-6), 
                               name='UserEmbedding')(user_input)
    user_vec = Flatten(name='FlattenUsersE')(user_embedding)

    # Dot Product of Item and User
    rating = Dot(axes=1, name='DotProduct')([item_vec, user_vec])
    
    # Model Creation
    model = Model([user_input, item_input], rating)
    
    # Compile Model
    model.compile(loss='binary_crossentropy', optimizer="sgd")
    
    return model

In [14]:
n_factors = 40
model = ImplicitMF(n_users, n_items, n_factors)

In [15]:
%%time
output = model.fit([train.USER, train.ITEM], train.RATING, 
                                  batch_size=128, epochs=5, verbose=1, validation_split=0.2)

Epoch 1/5
[1m826/826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 5.3389 - val_loss: 5.4325
Epoch 2/5
[1m826/826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 4.1589 - val_loss: 5.3641
Epoch 3/5
[1m826/826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 3.0583 - val_loss: 5.3318
Epoch 4/5
[1m826/826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 2.4312 - val_loss: 5.3087
Epoch 5/5
[1m826/826[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - loss: 2.0991 - val_loss: 5.2884
CPU times: total: 13 s
Wall time: 5 s


In [None]:
def get_embedding(model, name):
    embedding = model.get_layer(name=name).get_weights()[0]
    return embedding

In [17]:
item_embedding = get_embedding(model, "ItemEmbedding")
user_embedding = get_embedding(model, "UserEmbedding")

In [28]:
def get_similar(embedding, k):
    model_similar_items = NearestNeighbors(n_neighbors=k, algorithm="ball_tree").fit(
        embedding
    )
    distances, indices = model_similar_items.kneighbors(embedding)

    return distances, indices

In [29]:
%%time
item_distances, item_similar_indices = get_similar(item_embedding, 5)

CPU times: total: 78.1 ms
Wall time: 84 ms


In [30]:
item_similar_indices

array([[   0,  143,  237,  635,  624],
       [   1,  465,  211,  245,  692],
       [   2,  716,  211,   45,  549],
       ...,
       [1679,  919, 1339,  137, 1542],
       [1680, 1560, 1076, 1669,  869],
       [1681, 1570, 1649, 1450, 1061]])

In [31]:
def show_similar(item_index, item_similar_indices, item_encoder, df_items):
    s = item_similar_indices[item_index]
    movie_ids = item_encoder.inverse_transform(s)

    titles = []
    for movie_id in movie_ids:
        movie = df_items[df_items["movie_id"] == movie_id]
        if not movie.empty:
            # Extract just the string value from the Series
            title = movie['title'].values[0]
            titles.append(title)

    if not titles:
        print("No images found to display")
        return

    for title in titles:
        print(title)

In [32]:
show_similar(0, item_similar_indices, item_encoder, df_items=df_items)

Toy Story (1995)
Die Hard (1988)
Raising Arizona (1987)
Escape from New York (1981)
Sword in the Stone, The (1963)
