In [36]:
import pandas as pd
import numpy as np
import os
import datetime

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Dataset - movielens-latest-small 

We use movielens-latest-small here as it's more convenient than the 100k one.
But we still use only users/movies that were available in the original.

In [4]:
# Directory holding the dataset files, relative to the current working directory.
data_path = os.path.join('.', 'data')
# Read the ratings table from ratings.csv inside that directory.
ratings_df = pd.read_csv(os.path.join(data_path, 'ratings.csv'))
# Bare last expression: show the loaded frame as the cell output.
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [7]:
# One encoder per id column; each is fitted on the raw ids and kept around so the
# original ids can be recovered later through its `classes_` attribute.
movie_encoder = LabelEncoder()
user_encoder = LabelEncoder()

# Add integer-encoded copies of the id columns next to the raw ones.
ratings_df['userEncoded'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['movieEncoded'] = movie_encoder.fit_transform(ratings_df['movieId'])

In [24]:
# Number of distinct encoded ids; these are later passed to the model builders.
n_users = ratings_df['userEncoded'].nunique()
n_movies = ratings_df['movieEncoded'].nunique()
print(f"Number of unique users: {n_users}")
print(f"Number of unique movies: {n_movies}")

# Features are (user index, movie index) pairs, the target is the rating.
X = ratings_df[['userEncoded', 'movieEncoded']].to_numpy()
y = ratings_df['rating'].to_numpy()

# Hold out 15% of the rows; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=2137)
[split.shape for split in (X_train, X_test, y_train, y_test)]

Number of unique users: 610
Number of unique movies: 9724


[(85710, 2), (15126, 2), (85710,), (15126,)]

In [34]:
# The models take two separate inputs, so split the two feature columns
# (column 0 = encoded user, column 1 = encoded movie) into a list of 1-D arrays.
X_train_inputs = [X_train[:, col] for col in (0, 1)]
X_test_inputs = [X_test[:, col] for col in (0, 1)]

## Baseline Collaborative ANN

Uses only the user–item relationship (the ratings); no content features.

In [83]:
import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import Input, Concatenate, Dense, Dropout, Embedding, Flatten, Lambda, Dot, Multiply
from tensorflow.keras.regularizers import l2

def make_model(n_users, n_movies, embedding_size=50, dense_size=10, min_rating=0.0, max_rating=5.0, join_method='concat'):
    """Build and compile the baseline user/movie embedding network.

    Each of the two inputs is a single index. The user index and the movie
    index are looked up in their own trainable embedding tables, the two
    vectors are joined, passed through one hidden dense layer, and the final
    sigmoid output is rescaled to the rating range.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the movie embedding table.
        embedding_size: Width of both embedding vectors.
        dense_size: Number of units in the hidden dense layer.
        min_rating: Lower end of the predicted rating range.
        max_rating: Upper end of the predicted rating range.
        join_method: How the two embeddings are combined: 'concat'
            (concatenation) or 'product' (element-wise multiplication).

    Returns:
        A Keras ``Model`` taking ``[user, movie]`` inputs, compiled with
        mean-squared-error loss and the Adam optimizer.

    Raises:
        ValueError: If ``join_method`` is not 'concat' or 'product'.
    """
    # Fail fast: reject a bad option before any layer is created.
    if join_method not in ('concat', 'product'):
        raise ValueError(f"Unsupported join method: {join_method}")

    user = Input(shape=(1, ))
    u_emb = Embedding(
        n_users, embedding_size,
        embeddings_initializer='he_normal',
        embeddings_regularizer=l2(1e-6)
    )(user)
    # The embedding lookup yields (batch, 1, embedding_size); drop the middle axis.
    u_emb = Flatten()(u_emb)

    movie = Input(shape=(1, ))
    m_emb = Embedding(
        n_movies, embedding_size,
        embeddings_initializer='he_normal',
        embeddings_regularizer=l2(1e-6)
    )(movie)
    m_emb = Flatten()(m_emb)

    if join_method == 'concat':
        x = Concatenate()([u_emb, m_emb])
    else:  # 'product' (validated above)
        x = Multiply()([u_emb, m_emb])
    x = Dropout(0.05)(x)

    x = Dense(dense_size, activation='relu')(x)
    x = Dropout(0.5)(x)

    x = Dense(1, activation='sigmoid')(x)
    # sigmoid output is 0...1, so it must be denormalized to min...max of rating
    x = Lambda(lambda t: t * (max_rating - min_rating) + min_rating)(x)

    model = Model(inputs=[user, movie], outputs=x)
    model.compile(loss='mse', optimizer='adam')

    return model

In [41]:
# Baseline model with the default 'concat' join of user and movie embeddings.
model = make_model(n_users=n_users, n_movies=n_movies)

In [42]:
# tensorboard
log_dir = "logs/" + datetime.datetime.now().strftime("concat-%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [44]:
# Train for 15 epochs in mini-batches of 32.
# Note: the held-out split is passed as validation data, so the validation loss
# reported per epoch is measured on the same rows as the test set.
model.fit(
    x=X_train_inputs,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_data=(X_test_inputs, y_test),
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1633f3860>

In [84]:
# Same baseline architecture, but the embeddings are joined by element-wise product.
model = make_model(n_users=n_users, n_movies=n_movies, join_method='product')

In [85]:
# tensorboard
log_dir = "logs/" + datetime.datetime.now().strftime("product-%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [86]:
# Train for 15 epochs in mini-batches of 32.
# Note: the held-out split is passed as validation data, so the validation loss
# reported per epoch is measured on the same rows as the test set.
model.fit(
    x=X_train_inputs,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_data=(X_test_inputs, y_test),
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x164264ac8>

## Hybrid model with DNN

Using movie/user features

Interesting approach: https://arxiv.org/pdf/2009.09748.pdf

In [70]:
# Path of the .npz archive under ./data (presumably pre-computed text embeddings
# per movie, judging by the names — TODO confirm where this file comes from).
nlp_weights_fp = os.path.join('.', 'data', 'embeddings.npz')
# Open the archive; the following cell looks arrays up in it by string key.
nlp_weights_archive = np.load(nlp_weights_fp)

In [72]:
# Build one row per encoded movie, in encoder order, so that row i belongs to
# encoded movie index i. The archive is keyed by the stringified raw movie id;
# movies without an entry get an all-zero 300-dimensional float32 vector.
nlp_weights = np.vstack([
    nlp_weights_archive[key] if key in nlp_weights_archive else np.zeros((300,), dtype=np.float32)
    for key in map(str, movie_encoder.classes_)
])
nlp_weights.shape

In [107]:
def make_model_2(n_users, n_movies, nlp_weights, embedding_size=50, dense_size=10, min_rating=0.0, max_rating=5.0, join_method='concat'):
    """Build and compile the hybrid network: learned embeddings plus fixed content vectors.

    In addition to trainable user and movie embeddings, the movie index is
    also looked up in a frozen embedding table initialised from
    ``nlp_weights``. The joined user/movie vector is concatenated with that
    content vector, passed through two hidden dense layers (32 and 16 units)
    and a sigmoid that is rescaled to the rating range.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the trainable movie embedding table.
        nlp_weights: 2-D array used as the weights of the frozen content
            embedding; row ``i`` is the vector for movie index ``i``. It needs
            at least ``n_movies`` rows because the same movie index is used
            for both movie tables.
        embedding_size: Width of the trainable user and movie embeddings.
        dense_size: Accepted for signature compatibility with ``make_model``
            but currently unused; the hidden layer sizes are fixed at 32 and 16.
        min_rating: Lower end of the predicted rating range.
        max_rating: Upper end of the predicted rating range.
        join_method: How the trainable user and movie embeddings are
            combined: 'concat' or 'product' (element-wise multiplication).

    Returns:
        A Keras ``Model`` taking ``[user, movie]`` inputs, compiled with
        mean-squared-error loss and the Adam optimizer.

    Raises:
        ValueError: If ``join_method`` is not 'concat' or 'product', or if
            ``nlp_weights`` has fewer rows than ``n_movies``.
    """
    # Fail fast: reject bad arguments before any layer is created.
    if join_method not in ('concat', 'product'):
        raise ValueError(f"Unsupported join method: {join_method}")

    content_rows = nlp_weights.shape[0]
    content_dim = nlp_weights.shape[1]
    if content_rows < n_movies:
        # A movie index that is valid for the trainable table would fall
        # outside the content table.
        raise ValueError(
            f"nlp_weights has {content_rows} rows but n_movies is {n_movies}; "
            "every movie index needs a content vector"
        )

    user = Input(shape=(1, ))
    u_emb = Embedding(
        n_users, embedding_size,
        embeddings_initializer='he_normal',
        embeddings_regularizer=l2(1e-6)
    )(user)
    u_emb = Flatten()(u_emb)

    movie = Input(shape=(1, ))
    m_emb = Embedding(
        n_movies, embedding_size,
        embeddings_initializer='he_normal',
        embeddings_regularizer=l2(1e-6)
    )(movie)
    m_emb = Flatten()(m_emb)

    # Frozen content table: build the layer so its weight exists, load the
    # given vectors into it, then exclude it from training.
    content_layer = Embedding(content_rows, content_dim)
    content_layer.build((1,))
    content_layer.set_weights([nlp_weights])
    content_layer.trainable = False

    m_content_emb = Flatten()(content_layer(movie))

    if join_method == 'concat':
        um = Concatenate()([u_emb, m_emb])
    else:  # 'product' (validated above)
        um = Multiply()([u_emb, m_emb])
    x = Concatenate()([um, m_content_emb])
    x = Dropout(0.05)(x)

    x = Dense(32, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(16, activation='relu')(x)
    x = Dropout(0.5)(x)

    x = Dense(1, activation='sigmoid')(x)
    # sigmoid output is 0...1, so it must be denormalized to min...max of rating
    x = Lambda(lambda t: t * (max_rating - min_rating) + min_rating)(x)

    model = Model(inputs=[user, movie], outputs=x)
    model.compile(loss='mse', optimizer='adam')

    return model

In [103]:
# Hybrid model with the default 'concat' join of user and movie embeddings.
model = make_model_2(n_users=n_users, n_movies=n_movies, nlp_weights=nlp_weights)

In [104]:
# tensorboard
log_dir = "logs/" + datetime.datetime.now().strftime("content-%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [105]:
# Train for 15 epochs in mini-batches of 32.
# Note: the held-out split is passed as validation data, so the validation loss
# reported per epoch is measured on the same rows as the test set.
model.fit(
    x=X_train_inputs,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_data=(X_test_inputs, y_test),
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x16488a828>

In [108]:
# Hybrid model with the user and movie embeddings joined by element-wise product.
model = make_model_2(
    n_users=n_users,
    n_movies=n_movies,
    nlp_weights=nlp_weights,
    join_method='product',
)

In [110]:
# tensorboard
log_dir = "logs/" + datetime.datetime.now().strftime("content-prod-%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

In [111]:
# Train for 15 epochs in mini-batches of 32.
# Note: the held-out split is passed as validation data, so the validation loss
# reported per epoch is measured on the same rows as the test set.
model.fit(
    x=X_train_inputs,
    y=y_train,
    batch_size=32,
    epochs=15,
    validation_data=(X_test_inputs, y_test),
    callbacks=[tensorboard_callback],
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x1638960b8>

## Embeddings exploration

TODO