In [9]:
import pandas as pd
import torch
import tqdm
import numpy as np
from tqdm import tqdm


In [10]:
# Load train dataset and produce train embeddings
model_name = "cb2cf_multi_modal_encoder_model.pt"
train_data = torch.load("cb2cf_train_dataset.pt")
train_dataloader = torch.utils.data.DataLoader(train_data, batch_size=1, shuffle=False, num_workers=0)
model = torch.load(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

with torch.no_grad():
    train_embeddings = []
    train_titles = []
    train_movie_ids = []
    for data in tqdm(train_dataloader):
        output = model(
            genres=data["genres"].to(device),
            actors=data["actors"].to(device),
            directors=data["directors"].to(device),
            unix_release_time=data["unix_release_time"].to(device),
            description=data["description"],
            language=data["language"].to(device),
        )
        train_embeddings.append(output.cpu().squeeze().numpy())
        train_titles.append(data["title"])
        train_movie_ids.append(data["movie_ids"])

100%|██████████| 8911/8911 [01:26<00:00, 102.72it/s]


In [11]:
test_data = torch.load("cb2cf_test_dataset.pt")
test_dataloader = torch.utils.data.DataLoader(test_data, batch_size=1, shuffle=False, num_workers=0)

with torch.no_grad():
    test_embeddings = []
    test_titles = []
    test_movie_ids = []
    for i, data in enumerate(tqdm(test_dataloader)):
        output = model(
            genres=data["genres"].to(device),
            actors=data["actors"].to(device),
            directors=data["directors"].to(device),
            unix_release_time=data["unix_release_time"].to(device),
            description=data["description"],
            language=data["language"].to(device),
        )
        test_embeddings.append(output.cpu().squeeze().numpy())
        test_titles.append(data["title"])
        test_movie_ids.append(data["movie_ids"])

100%|██████████| 991/991 [00:09<00:00, 103.40it/s]


In [12]:
oracle_item_embeddings = pd.read_pickle(r"BPR1_item_embeddings.pkl")

# convert train_movie ids and test movie ids to list of integers

train_movie_ids = [int(movie_id) for movie_id in train_movie_ids]
test_movie_ids = [int(movie_id) for movie_id in test_movie_ids]

oracle_train_item_embeddings = oracle_item_embeddings[train_movie_ids]
oracle_test_item_embeddings = oracle_item_embeddings[test_movie_ids]
# all ids 
all_movie_ids = train_movie_ids + test_movie_ids
# all titles
all_titles = train_titles + test_titles
# all oracle embeddings
all_oracle_embeddings = np.concatenate((oracle_train_item_embeddings, oracle_test_item_embeddings), axis=0)

In [13]:
# Create a dataframe with item id, title, oracle embedding, cb2cf embedding

import pandas as pd
import pickle

df = pd.DataFrame()

oracle_item_embeddings = pd.read_pickle(r"BPR1_item_embeddings.pkl")


df["item_id"] = all_movie_ids
df["title"] = all_titles

df["oracle_embedding"] = list(all_oracle_embeddings)

df["cb2cf_embedding"] = list(train_embeddings + test_embeddings)
# print the first 5 rows of the dataframe

df.head()

# save the dataframe to a pickle file with highest protocol

df.to_pickle("cb2cf_embeddings_and_oracle_embeddings.pkl", protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# load cb2cf_embeddings_and_oracle_embeddings.pkl into a dataframe
cb2cf_embeddings_and_oracle_embeddings_df= pd.read_pickle(r"cb2cf_embeddings_and_oracle_embeddings.pkl")

# print the first 5 rows of the dataframe

cb2cf_embeddings_and_oracle_embeddings_df.head()

Unnamed: 0,item_id,title,oracle_embedding,cb2cf_embedding
0,1,[Toy Story],"[0.07483365, -0.80597216, -2.5122287, 0.172594...","[0.7114382, 0.20333348, 0.5329094, 0.94893163,..."
1,2,[Jumanji],"[-0.7013331, -0.358848, -0.35755172, 0.3665048...","[-0.19589086, -0.75600845, -0.20841327, 0.9568..."
2,4,[Waiting to Exhale],"[-1.4799722, 0.0777297, -0.37370113, 1.1802115...","[-0.65252167, -1.2592509, -0.67503864, 1.04062..."
3,5,[Father of the Bride Part II],"[-1.7130021, 0.64792114, -0.45365041, 1.244741...","[-0.5138638, -1.3205254, -0.35981533, 1.049852..."
4,6,[Heat],"[-1.9372973, 0.86165136, -0.5711593, 1.8253926...","[-1.9371253, 0.18546836, -0.46241716, 0.865678..."


In [38]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def print_most_similar_movie_titles(relevant_title, relevant_embedding, all_embeddings, all_titles, index_to_ignore=0, n=4):
    all_titles = np.array(all_titles)
    # remove index to ignore from all embeddings and all titles
    all_embeddings = np.delete(all_embeddings, index_to_ignore, axis=0)
    all_titles = np.delete(all_titles, index_to_ignore, axis=0)
    # compute cosine similarity between relevant embedding and all embeddings
    similarities = cosine_similarity(relevant_embedding.reshape(1, -1), all_embeddings)
    # get the indices of the most similar embeddings
    most_similar_indices = np.argsort(similarities[0])[-(n):]
    # remove the first index as it is the relevant embedding itself
    most_similar_indices = most_similar_indices
    # convert to list of integers
    most_similar_indices = most_similar_indices.tolist()
    # convert all titles to numpy array
    titles= all_titles[most_similar_indices]
    print(f"Most similar movies to {relevant_title} are:\n {titles}")
    

In [43]:
# pick a random index from test set
random_index = np.random.randint(0, len(test_embeddings))
# get the relevant title
relevant_title = test_titles[random_index]
# get the relevant embedding from the test set
relevant_embedding = test_embeddings[random_index]
# get the relevant embedding from the oracle set
relevant_oracle_embedding = oracle_test_item_embeddings[random_index]

index_to_ignore = random_index + len(train_embeddings)

# get the most similar movies from the test set
print("Most similar movies from the test set using CB2CF embeddings")
print_most_similar_movie_titles(relevant_title, relevant_embedding, train_embeddings + test_embeddings, train_titles + test_titles, index_to_ignore=index_to_ignore)
# get the most similar movies from the oracle set
print("\n")
print("Most similar movies from the test set using Oracle embeddings")
print_most_similar_movie_titles(relevant_title, relevant_oracle_embedding, oracle_item_embeddings, train_titles + test_titles, index_to_ignore=random_index)




Most similar movies from the test set using CB2CF embeddings
Most similar movies to ['Seeing Other People'] are:
 [['Deliver Us from Eva']
 ['The Story of Us']
 ['View from the Top']
 ['Life or Something Like It']]
Most similar movies from the test set using Oracle embeddings
Most similar movies to ['Seeing Other People'] are:
 [['Beautiful Girls']
 ['¿Qué he hecho yo para merecer esto!']
 ['Christmas with the Kranks']
 ['Anthropoid']]


In [None]:

train_ratings_df= pd.read_pickle(r"beforelastXRatings.pkl",  compression= 'gzip')

# print the first 5 rows of the dataframe

train_ratings_df.head()