In [11]:
# %pip install lightfm
from lightfm import LightFM
from lightfm.evaluation import precision_at_k
from lightfm.evaluation import auc_score
import pandas as pd
from sklearn.model_selection import train_test_split
from scipy import sparse as spr
from scipy.sparse import csr_matrix
from sklearn import preprocessing
import tqdm
import os
import pickle
# %matplotlib inline

Note: you may need to restart the kernel to use updated packages.


In [20]:
path2Keep= r"data2Keep\\" # path to load data
# load the data
onDrive= False
if onDrive == True:
    path= "/content/drive/MyDrive/School/IDC/RS/Project/data/"
    path2Keep= path.replace("/data", "") +  r"data2Keep/" # path to save the data to
    from google.colab import drive
    drive.mount('/content/drive')
else:
    path= r"data\\"
    path2Keep= r"data2Keep"

# load the beforelastXRatings.pkl into a dataframe
b4Ratings_df= pd.read_pickle(path2Keep+ r"\beforelastXRatings.pkl")
# load the lastXRatings.pkl into a dataframe
lastRatings_df= pd.read_pickle(path2Keep+ r"\lastXRatings.pkl")
# combine the two dataframes
ratings_df= pd.concat([b4Ratings_df, lastRatings_df], axis=0)
print(f'shape of ratings_df: {ratings_df.shape}')

shape of ratings_df: (20251813, 4)


In [21]:
# ratings4andOver= ratings_df[ratings_df['rating']>=4]
# ratings4andOver.rating= ratings4andOver.rating.astype
(train, test) = train_test_split(ratings_df, test_size=0.2, random_state=42)
print(f'shape of train: {train.shape}')
print(f'shape of test: {test.shape}')


shape of train: (16201450, 4)
shape of test: (4050363, 4)


In [22]:
#  create a sparse matrix from the dataframe using the csr_matrix function
csrMatrixTrain = spr.csr_matrix((train["rating"] ,(train["userId"], train["movieId"])), shape=(train["userId"].max()+1, train["movieId"].max()+1))
csrMatrixTest = spr.csr_matrix((test["rating"] ,(test["userId"], test["movieId"])), shape=(test["userId"].max()+1, test["movieId"].max()+1))
print(f' shape of csrMatrixTrain.shape {csrMatrixTrain.shape}')
print(f' shape of csrMatrixTest.shape {csrMatrixTest.shape}')

 shape of csrMatrixTrain.shape (270897, 168253)
 shape of csrMatrixTest.shape (270897, 168253)


In [8]:
# The BPR models (for both movies and apps) were trained for 100 epochs with a target dimension n = 40
#  check if the model is already at path2Keep+"bprModel.pkl"
if os.path.exists(path2Keep+"bprModel.pkl"):
    print("model already exists")
    with open(path2Keep+"bprModel.pkl", "rb") as f:
        model = pickle.load(f)
else:
    model = LightFM(learning_rate=0.05, loss='bpr')
    model.fit(csrMatrixTrain, epochs=5, verbose=True, num_threads=4)
    #  drop the model from memory into file
    with open(path2Keep+"bprModel.pkl", "wb") as f:
        pickle.dump(model, f, protocol=pickle.HIGHEST_PROTOCOL)

train_precision = precision_at_k(model, csrMatrixTrain, k=5, num_threads=4, ).mean()
test_precision = precision_at_k(model, csrMatrixTest, k=5, num_threads= 4).mean()

train_auc = auc_score(model, csrMatrixTrain).mean()
test_auc = auc_score(model, csrMatrixTest).mean()

print('Precision: train %.2f, test %.2f.' % (train_precision, test_precision))
print('AUC: train %.2f, test %.2f.' % (train_auc, test_auc))

Epoch 0
Epoch 1
Epoch 2
Epoch 3
Epoch 4


In [None]:
#  get model's embeddings
item_embeddings = model.get_item_representations()[1]
user_embeddings = model.get_user_representations()[1]
print(f'item_embeddings.shape: {item_embeddings.shape}')
print(f'user_embeddings.shape: {user_embeddings.shape}')
# item_embeddings.shape

item_embeddings.shape: (168253, 10)
user_embeddings.shape: (270897, 10)


array([[-0.33071688,  1.4424202 , -1.9702144 ,  0.35360897,  0.14243674,
        -0.25969762,  1.1330045 ,  0.40967655, -1.8347144 ,  1.3147914 ],
       [ 1.2617768 , -0.22005434, -1.0865386 ,  0.6351669 ,  0.8004641 ,
        -0.8241127 ,  0.00281958,  0.16676608, -1.5117749 ,  0.80185324]],
      dtype=float32)

In [None]:
#  load the movies dataframe from Project\data2Keep\movies_df.pkl
movies_df= pd.read_pickle(path2Keep+"movies_df.pkl")
#  get the star wars movieIds
movies_df.head()
starWarsMovies= movies_df[movies_df['original_title'].str.contains('Star Wars')]
print(starWarsMovies.head(5))
#  get the star wars movie embeddings
starWarsMoviesEmbeddings= item_embeddings[starWarsMovies[:4]]
# starWarsMoviesEmbeddings


                                                  genres     id    imdb_id  \
255                   Adventure, Action, Science Fiction     11  tt0076759   
2502                  Adventure, Action, Science Fiction   1893  tt0120915   
5239                  Adventure, Action, Science Fiction   1894  tt0121765   
10069                 Science Fiction, Adventure, Action   1895  tt0121766   
12889  Thriller, Animation, Action, Science Fiction, ...  12180  tt1185834   

      original_language                                original_title  \
255                  en                                     Star Wars   
2502                 en     Star Wars: Episode I - The Phantom Menace   
5239                 en  Star Wars: Episode II - Attack of the Clones   
10069                en  Star Wars: Episode III - Revenge of the Sith   
12889                en                     Star Wars: The Clone Wars   

                                                overview  release_date  \
255    Princess Le

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices