In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dot, Embedding, Flatten, Dense, Input
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [2]:
df = pd.read_csv('/kaggle/input/amazon-ratings/ratings_Beauty.csv')

In [3]:
df.drop('Timestamp', axis=1, inplace=True)
df.shape

(2023070, 3)

In [4]:
df.head()

Unnamed: 0,UserId,ProductId,Rating
0,A39HTATAQ9V7YF,205616461,5.0
1,A3JM6GV9MNOF9X,558925278,3.0
2,A1Z513UWSAAO0F,558925278,5.0
3,A1WMRR494NWEWV,733001998,4.0
4,A3IAAVS479H7M7,737104473,1.0


In [5]:
# change the id of users with incremental numbers which is easy to deal with
user_ids = df['UserId'].unique()
user_id_map = {old_id: new_id for new_id, old_id in enumerate(user_ids)}
df['UserId'] = df['UserId'].map(user_id_map)

# change the id of products with incremental numbers which is easy to deal with
product_ids = df['ProductId'].unique()
product_id_map = {old_id: new_id for new_id, old_id in enumerate(product_ids)}
df['ProductId'] = df['ProductId'].map(product_id_map)

In [6]:
df.head()

Unnamed: 0,UserId,ProductId,Rating
0,0,0,5.0
1,1,1,3.0
2,2,1,5.0
3,3,2,4.0
4,4,3,1.0


# Some variables to be used around the notebook

In [7]:
# get all unique products
allProducts = df['ProductId'].unique()

# Start developing the model

In [8]:
def build_model(num_users, num_products, embedding_size):
    user_input = Input(shape=(), name='user_input')
    product_input = Input(shape=(), name='product_input')
    
    user_embedding = Embedding(input_dim=num_users, output_dim=embedding_size, name='user_embeddings')(user_input)
    product_embedding = Embedding(input_dim=num_products, output_dim=embedding_size, name='product_embeddings')(product_input)
    
    dot_product = Dot(axes=1, name='dot_product')([user_embedding, product_embedding])
    flatten = Flatten(name='flatten_layer')(dot_product)
    
    model = Model(inputs=[user_input, product_input], outputs=flatten)
    
    return model

In [9]:
num_users = len(df['UserId'].unique())
num_products = len(df['ProductId'].unique())
embedding_size = 50
# print(num_users, num_products)

model = build_model(num_users, num_products, embedding_size)

In [10]:
model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['accuracy'])

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 user_input (InputLayer)        [(None,)]            0           []                               
                                                                                                  
 product_input (InputLayer)     [(None,)]            0           []                               
                                                                                                  
 user_embeddings (Embedding)    (None, 50)           60513550    ['user_input[0][0]']             
                                                                                                  
 product_embeddings (Embedding)  (None, 50)          12463700    ['product_input[0][0]']          
                                                                                              

In [11]:
x_train, x_test, y_train, y_test = train_test_split(df[['UserId', 'ProductId']], df[['Rating']], test_size=0.1)

In [13]:
model.fit(
    x=[x_train['UserId'], x_train['ProductId']],
    y=y_train['Rating'],
    batch_size=64,
    epochs=10,
    validation_split=0.1
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7a0f2c212f50>

In [14]:
test_loss = model.evaluate(
    x=[x_test['UserId'], x_test['ProductId']],
    y=y_test,
    batch_size=64
)
print(f'Test Loss: {test_loss}')

Test Loss: [17.936323165893555, 0.011749469675123692]


In [15]:
print("Number of Users:      " + str(num_users))
print("Number of Products:    " + str(num_products))

Number of Users:      1210271
Number of Products:    249274


### Save the model

In [17]:
# Save the model after training
model.save('collaborative_filtering_model.h5')

In [18]:
# Load the model if you shutted down the notebook
model = tf.keras.models.load_model('collaborative_filtering_model.h5')

## Predicting the best products for a specific user

In [59]:
def get_all_prodcuts_per_user(idOfUser):
    # get the data for a specifc user
    data = df[df['UserId'] == idOfUser].reset_index().drop(['index', 'Rating'], axis=1)
    
    # make a DataFrame with all products for a specific user
    allProdcutsPerUser = pd.DataFrame({'UserId': idOfUser, 'ProductId': allProducts.copy()})

    return allProdcutsPerUser

In [60]:
def get_ratings(idOfUser):
    # Get the ratings for a specific user
    allProductsPerUser = get_all_prodcuts_per_user(idOfUser)
    
    # seprate the columns
    userColumn = allProductsPerUser['UserId']
    productColumn = allProductsPerUser['ProductId']
    
    # predict the rating of each product
    ratings = model.predict([userColumn, productColumn])
    
    # add the id of each product to it's rating
    ratings = np.concatenate([np.arange(num_products)[:, np.newaxis], ratings.reshape(-1)[:, np.newaxis]], axis=1)
    
    # Convert the predictions from Numpy array to Pandas Data Frame
    ratings = pd.DataFrame(ratings, columns=['ProductId', 'Rating'])
    
    # change the dtype of column ProductId
    ratings['ProductId'] = ratings['ProductId'].astype(int)
    
    # Sort the products from higher rating to the lower rating
    ratingsSorted = ratings.sort_values(by='Rating', ascending=False)
    
    return ratingsSorted

In [None]:
ratings = get_ratings(2)

ratings.head()

