In [1]:
# Load the movie-id pickle for the selected split and expose it as a DataFrame.
import pandas as pd

# Split selector: 'train' reads train_movies.pkl, any other value reads test_movies.pkl.
MODE = 'train'

movies_pickle_name = r"train_movies.pkl" if MODE == 'train' else r"test_movies.pkl"

# .to_frame() is called on the unpickled object, so it is expected to be a
# pandas Series (or Index). reset_index() moves its index into a column, which
# is then renamed to 'movieId' (assumes the index is unnamed, so the column
# comes out as 'index' - TODO confirm).
# NOTE: the variable keeps the name train_items_df even when the test file is read.
train_items_df = (
    pd.read_pickle(movies_pickle_name)
    .to_frame()
    .reset_index()
    .rename(columns={'index': 'movieId'})
)

# Preview the first rows.
train_items_df.head()


Unnamed: 0,movieId,0
0,5000,5000
1,7881,7881
2,3931,3931
3,6223,6223
4,4256,4256


In [2]:

# Choose the item-embedding pickle that matches MODE, then load it.
# NOTE(review): 'train' reads BPR2_item_embeddings.pkl while any other MODE
# reads BPR1_item_embeddings.pkl - confirm this numbering is intentional.
embeddings_pickle_name = (
    r"BPR2_item_embeddings.pkl" if MODE == 'train' else r"BPR1_item_embeddings.pkl"
)
item_embeddings = pd.read_pickle(embeddings_pickle_name)

# Report the dimensions of the loaded embeddings.
print(item_embeddings.shape)

(174056, 40)


In [3]:
# Read movies_df.pkl, which is stored as a gzip-compressed pickle.
movies_df = pd.read_pickle(
    r"movies_df.pkl",
    compression='gzip',
)

# Preview the first rows.
movies_df.head()

Unnamed: 0,id,imdb_id,original_title,overview,release_date,runtime,title,movieId,imdbId,tmdbId,...,tl,tr,uk,ur,uz,vi,wo,xx,zh,zu
0,862.0,tt0114709,Toy Story,"Led by Woody, Andy's toys live happily in his ...",815011200.0,81.0,Toy Story,1.0,114709.0,862.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,8844.0,tt0113497,Jumanji,When siblings Judy and Peter discover an encha...,818985600.0,104.0,Jumanji,2.0,113497.0,8844.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,15602.0,tt0113228,Grumpier Old Men,A family wedding reignites the ancient feud be...,819590400.0,101.0,Grumpier Old Men,3.0,113228.0,15602.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,31357.0,tt0114885,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",819590400.0,127.0,Waiting to Exhale,4.0,114885.0,31357.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11862.0,tt0113041,Father of the Bride Part II,Just when George Banks has recovered from his ...,792374400.0,106.0,Father of the Bride Part II,5.0,113041.0,11862.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:

# De-duplicate both tables: train_items_df on 'movieId', movies_df on 'id'.
train_items_df = train_items_df.drop_duplicates(subset='movieId')
movies_df = movies_df.drop_duplicates(subset='id')

# Keep only the movies whose 'movieId' appears in train_items_df.
# Note the keys differ: de-duplication above uses 'id', membership uses 'movieId'.
in_split_mask = movies_df['movieId'].isin(train_items_df['movieId'])
movies_df = movies_df[in_split_mask]

# Dimensions of the filtered movie table.
print(movies_df.shape)

# movieId values of the remaining rows, cast to int so they can be used as indices.
movie_ids = movies_df['movieId'].values.astype(int)

# Select one embedding per remaining movie, in movies_df row order, by indexing
# item_embeddings with the movie ids (assumes embeddings are positioned by
# movieId - TODO confirm).
# NOTE: item_embeddings is rebound to the selected subset here, so running this
# cell a second time without reloading the embeddings would index the
# already-reduced object.
item_embeddings = item_embeddings[movie_ids]

# Dimensions of the selected embeddings.
print(item_embeddings.shape)


(8911, 4907)
(8911, 40)


In [5]:
# Load the feature-group -> column mapping and extract the model inputs from
# movies_df as NumPy arrays.
import json

# The encoding is passed explicitly: JSON text is UTF-8, whereas open() without
# an encoding falls back to the platform locale and could mis-decode non-ASCII
# column names.
with open(r"columnMappings.json", "r", encoding="utf-8") as read_file:
    columnMappings = json.load(read_file)

# Features selected through columnMappings (each entry is used directly as a
# column selector on movies_df).
genres = movies_df[columnMappings['genres']].values
actors = movies_df[columnMappings['cast']].values
directors = movies_df[columnMappings['director']].values
language = movies_df[columnMappings['original_language']].values

# Features read from a single, fixed column.
unix_release_dates = movies_df['release_date'].values
title = movies_df['original_title'].values
description = movies_df['overview'].values

# Sanity check: one single-column feature and one mapped feature group.
print(unix_release_dates.shape)
print(genres.shape)



(8911,)
(8911, 20)


In [6]:
# Build a CB2CFMultiModalEncoderDataset from the previously extracted arrays.
from CB2CFMultiModalEncoderDataset import CB2CFMultiModalEncoderDataset

# NOTE: the object is named cb2cf_train_dataset regardless of MODE.
cb2cf_train_dataset = CB2CFMultiModalEncoderDataset(
    # ids and the embeddings selected for them
    movie_ids=movie_ids,
    embedding=item_embeddings,
    # text fields
    title=title,
    description=description,
    # remaining content features
    genres=genres,
    actors=actors,
    directors=directors,
    language=language,
    unix_release_time=unix_release_dates,
)

# Number of samples in the dataset.
print(len(cb2cf_train_dataset))

  from .autonotebook import tqdm as notebook_tqdm


8911


In [7]:
# Serialize the dataset object to disk with torch.save.
import torch

# 'train' writes cb2cf_train_dataset.pt; any other MODE writes cb2cf_test_dataset.pt.
dataset_out_path = r"cb2cf_train_dataset.pt" if MODE == 'train' else r"cb2cf_test_dataset.pt"
torch.save(cb2cf_train_dataset, dataset_out_path)