In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Input, Dense
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
import copy
warnings.simplefilter(action='ignore')

In [4]:
# Load the MovieLens 100K dataset
# Assuming the file is 'u.data' in the same directory
# Columns: user_id, movie_id, rating, timestamp
data = pd.read_csv('u.data', sep='\t', names=['user_id', 'movie_id', 'rating', 'timestamp'])

# Step 0: only select top 50 movies
valid_movies = np.array(data['movie_id'].value_counts().index[:50])
data = data[data['movie_id'].isin(valid_movies)]

# Step 1: Remove users s.t. count >= 2 * unique timestamp
temp = data.groupby('user_id')['timestamp'].agg(['count', 'nunique']).reset_index()
valid_users = temp[temp['count']/temp['nunique'] < 2]['user_id']
data = data[data['user_id'].isin(valid_users)]

# Step 2: Deduplicate and recode
data = data.groupby(['user_id', 'timestamp'], group_keys=False).apply(lambda group: group.sample(n=1, random_state=42))
data = data.sort_values(['user_id', 'timestamp']).reset_index(drop = True)
data['movie_id'] = pd.factorize(data['movie_id'])[0] + 1

num_users = data['user_id'].nunique()
num_movies = data['movie_id'].nunique()
num_ratings = len(data)

sparsity = 1 - (num_ratings / (num_users * num_movies))

print(f"Filtered Dataset: {num_users} users, {num_movies} movies")
print(f"Sparsity: {sparsity:.2%}")

Filtered Dataset: 902 users, 50 movies
Sparsity: 69.50%


In [5]:
data

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,1,5,874965478
1,1,2,5,874965706
2,1,3,5,874965739
3,1,4,5,874965758
4,1,5,5,874965954
...,...,...,...,...
13752,943,23,4,888639407
13753,943,13,5,888639427
13754,943,8,4,888692413
13755,943,41,4,888692699


In [6]:
data[data['user_id'] == 943]

Unnamed: 0,user_id,movie_id,rating,timestamp
13728,943,14,5,875409939
13729,943,12,4,875409978
13730,943,19,4,875410099
13731,943,40,5,875501725
13732,943,2,5,875501774
13733,943,5,4,875501835
13734,943,46,4,875501937
13735,943,27,3,875501960
13736,943,39,4,875502042
13737,943,10,3,875502096


In [7]:
np.random.seed(42)

all_users = data['user_id'].unique()
shuffled_indices = np.random.permutation(len(all_users))

midpoint = (3 * len(all_users)) // 4
indices_1 = shuffled_indices[:midpoint]
indices_2 = shuffled_indices[midpoint:]

train_users = shuffled_indices[indices_1]
test_users = shuffled_indices[indices_2]

In [8]:
data_train = data[data['user_id'].isin(train_users)].reset_index(drop = True)
data_test = data[data['user_id'].isin(test_users)].reset_index(drop = True)

In [9]:
# Prepare baskets
def prepare_baskets(data):
    return data.groupby("user_id")["movie_id"].apply(list).tolist()

train_baskets = prepare_baskets(data_train)
test_baskets = prepare_baskets(data_test)

train_baskets, val_baskets = train_test_split(train_baskets, test_size=0.25, random_state=42)

max_len = max(len(basket) for basket in train_baskets + val_baskets + test_baskets)

In [10]:
D = 32
batch_size = 256
max_epochs = 2000
lr = 1e-4
max_items = len(set(data_test['movie_id']))

In [11]:
len(set(data_test['movie_id']))

50

In [12]:
def preprocess_baskets(baskets):
    context_inputs = []
    target_inputs = []
    
    for basket in baskets:
        for idx, elt in enumerate(basket):
            target_inputs.append(elt)
            context_inputs.append(basket[:idx] + basket[(idx+1):])
    
    context_inputs = pad_sequences(context_inputs, padding='post', maxlen = max_len - 1, value=0)
    return np.array(context_inputs), np.array(target_inputs) - 1

train_context_input, train_target_input = preprocess_baskets(train_baskets)
val_context_input, val_target_input = preprocess_baskets(val_baskets)
test_context_input, test_target_input = preprocess_baskets(test_baskets)

In [13]:
test_context_input[0]

array([37,  2,  5, 49, 21, 18,  6, 28,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=int32)

In [14]:
input_context = layers.Input(shape=(max_len - 1,), dtype=tf.int32, name="context_input")

alpha_embedding = layers.Embedding(input_dim=max_items + 1, output_dim=D, name="alpha_embedding")
context_embedding = alpha_embedding(input_context)  # Shape: (batch_size, max_len, embedding_dim)

class ZeroMaskEmbedding(layers.Layer):
    def call(self, embeddings, input_tokens):
        mask = tf.cast(tf.not_equal(input_tokens, 0), tf.float32) 
        mask = tf.expand_dims(mask, axis=-1) 
        return embeddings * mask 

context_embedding = ZeroMaskEmbedding()(context_embedding, input_context)

class SumLayer(layers.Layer):
    def call(self, inputs):
        return tf.reduce_sum(inputs, axis=1) 

summed_context = SumLayer()(context_embedding)

output = layers.Dense(max_items, activation="softmax", name="output_layer", use_bias = False)(summed_context)

model = Model(inputs=input_context, outputs=output)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=lr), loss="sparse_categorical_crossentropy")

early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    train_context_input, train_target_input,        
    validation_data=(val_context_input, val_target_input),
    batch_size=batch_size,
    epochs=max_epochs,
    callbacks=[early_stopping]
)

Epoch 1/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - loss: 3.9154 - val_loss: 3.9131
Epoch 2/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.9097 - val_loss: 3.9085
Epoch 3/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.9059 - val_loss: 3.9045
Epoch 4/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8990 - val_loss: 3.9008
Epoch 5/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8959 - val_loss: 3.8972
Epoch 6/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8907 - val_loss: 3.8938
Epoch 7/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8876 - val_loss: 3.8904
Epoch 8/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.8850 - val_loss: 3.8873
Epoch 9/2000
[1m29/29[0m [32m━━━━━━━━

Epoch 69/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.6805 - val_loss: 3.7188
Epoch 70/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.6807 - val_loss: 3.7166
Epoch 71/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.6759 - val_loss: 3.7143
Epoch 72/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.6731 - val_loss: 3.7119
Epoch 73/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.6725 - val_loss: 3.7097
Epoch 74/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.6694 - val_loss: 3.7072
Epoch 75/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.6696 - val_loss: 3.7051
Epoch 76/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.6587 - val_loss: 3.7026
Epoch 77/2000
[1m29/29[0m [32

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.4893 - val_loss: 3.5888
Epoch 137/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.4647 - val_loss: 3.5874
Epoch 138/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.4575 - val_loss: 3.5856
Epoch 139/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.4721 - val_loss: 3.5843
Epoch 140/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.4684 - val_loss: 3.5833
Epoch 141/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.4623 - val_loss: 3.5819
Epoch 142/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.4625 - val_loss: 3.5806
Epoch 143/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.4554 - val_loss: 3.5787
Epoch 144/2000
[1m29/29[0m [32m━━━━━

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.3022 - val_loss: 3.5224
Epoch 204/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.2891 - val_loss: 3.5223
Epoch 205/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.2916 - val_loss: 3.5218
Epoch 206/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.2904 - val_loss: 3.5214
Epoch 207/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.2722 - val_loss: 3.5212
Epoch 208/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.2896 - val_loss: 3.5207
Epoch 209/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.2787 - val_loss: 3.5205
Epoch 210/2000
[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 3.2864 - val_loss: 3.5202
Epoch 211/2000
[1m29/29[0m [32m━━━━━

In [15]:
model.summary()

In [16]:
# Evaluate on Test Data
test_loss = model.evaluate(test_context_input, test_target_input, batch_size=batch_size)
print(f"Test Loss: {test_loss}")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 3.5181 
Test Loss: 3.5656931400299072
