In [1]:
from aprec.api.action import Action
from aprec.api.catalog import Catalog
from aprec.datasets.download_file import download_file

# URL template for the raw dataset files; "{}" is replaced with the dataset name.
BERT4REC_DATASET_URL = "https://raw.githubusercontent.com/asash/BERT4rec_py3_tf2/master/BERT4rec/data/{}.txt"
# Passed to download_file as its third argument (presumably the local download
# directory -- confirm against download_file).
BERT4REC_DIR = "data/bert4rec"
# Dataset names accepted by get_bert4rec_dataset; any other name raises ValueError.
VALID_DATASETS = {"ml-1m"}

def get_bert4rec_dataset(dataset):
    """Load a BERT4Rec-format dataset file as a list of ``Action`` objects.

    Each non-blank line of the downloaded file must hold exactly two
    whitespace-separated fields, ``user item``. The file carries no
    timestamps, so a synthetic one is generated: a counter that restarts at 1
    every time the user id differs from the one on the previous line.

    Args:
        dataset: Dataset name; must be a member of ``VALID_DATASETS``.

    Returns:
        A list of ``Action(user, item, timestamp, data)`` in file order, where
        ``data`` is ``{"genres": ..., "title": ...}``. Items missing from the
        genre/title lookups get the string ``"Unknown"`` for that field.

    Raises:
        ValueError: If ``dataset`` is not in ``VALID_DATASETS``, or if a
            non-blank line does not contain exactly two fields.
    """
    if dataset not in VALID_DATASETS:
        raise ValueError(f"unknown bert4rec dataset {dataset}")

    dataset_filename = download_file(BERT4REC_DATASET_URL.format(dataset), dataset + ".txt", BERT4REC_DIR)

    # Only the genre/title lookups are needed to build actions. The catalog
    # that used to be constructed here was never read, so it is no longer built.
    genres_dict, title_dict = get_movielens1m_genres()

    actions = []
    prev_user = None
    current_timestamp = 0

    with open(dataset_filename) as dataset_file:
        for line in dataset_file:
            fields = line.split()
            if not fields:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            user, item = fields
            if user != prev_user:
                # A new user's block starts: restart the synthetic clock.
                current_timestamp = 0
            prev_user = user
            current_timestamp += 1
            genre = genres_dict.get(item, "Unknown")
            title = title_dict.get(item, "Unknown")
            actions.append(Action(user, item, current_timestamp, {"genres": genre, "title": title}))
    return actions
            


# URL of the item-id mapping file; ml1m_mapping_to_original parses each line as
# a whitespace-separated "<sas_item> <original_item>" pair.
MAPPING_URL = "https://raw.githubusercontent.com/asash/ml1m-sas-mapping/main/sas_to_original_items.txt"

def ml1m_mapping_to_original():
    """Download the id mapping file and return it as a dict.

    Each non-blank line of the file must hold exactly two whitespace-separated
    fields: the SAS item id followed by the original item id.

    Returns:
        dict mapping SAS item id (str) to original item id (str).

    Raises:
        ValueError: If a non-blank line does not contain exactly two fields.
    """
    mapping_filename = download_file(MAPPING_URL, "sas_to_original_items.txt", BERT4REC_DIR)
    result = {}
    # The context manager closes the handle deterministically; a bare open()
    # in the for-statement left it to the garbage collector.
    with open(mapping_filename) as mapping_file:
        for line in mapping_file:
            fields = line.split()
            if not fields:
                # Skip blank lines instead of failing on the unpack below.
                continue
            sas_item, original_item = fields
            result[sas_item] = original_item
    return result

def get_movielens1m_genres():
    """Return genre and title lookups keyed by SAS item id.

    The lookups returned by ``get_genre_title_dict`` are keyed by original
    item id; they are re-keyed here through ``ml1m_mapping_to_original``.

    Returns:
        A ``(genres_by_sas_id, titles_by_sas_id)`` tuple of dicts.

    Raises:
        KeyError: If a mapped original id is missing from either lookup.
    """
    from aprec.datasets.movielens1m import get_genre_title_dict as get_ml1m_genre_dict

    genres_by_original, titles_by_original = get_ml1m_genre_dict()
    sas_to_original = ml1m_mapping_to_original()

    genres_by_sas = {}
    titles_by_sas = {}
    for sas_id, original_id in sas_to_original.items():
        genres_by_sas[sas_id] = genres_by_original[original_id]
        titles_by_sas[sas_id] = titles_by_original[original_id]
    return genres_by_sas, titles_by_sas

def get_movielens1m_catalog():
    """Build a ``Catalog`` whose items carry SAS item ids.

    Every ``(sas_id, original_id)`` pair from ``ml1m_mapping_to_original`` is
    resolved against the catalog returned by ``get_movies_catalog``; the item
    found there is re-labelled with the SAS id and added to a new ``Catalog``.

    Returns:
        The new ``Catalog`` containing one item per mapping entry.
    """
    from aprec.datasets.movielens1m import get_movies_catalog as get_ml1m_catalog

    # get_movies_catalog returns a 3-tuple; only its first element is used here.
    source_catalog, _movie_genres, _movie_titles = get_ml1m_catalog()
    sas_to_original = ml1m_mapping_to_original()
    sas_catalog = Catalog()
    for sas_id, original_id in sas_to_original.items():
        entry = source_catalog.get_item(original_id)
        # NOTE(review): this rewrites item_id on the object handed out by the
        # source catalog rather than on a copy -- confirm nothing else relies
        # on the source catalog keeping the original ids.
        entry.item_id = sas_id
        sas_catalog.add_item(entry)
    return sas_catalog


In [35]:
# dataset = [
#     {"item": 1, "data": {"genres": ["Animation"], "title": "Toy Story (1995)"}},
#     {"item": 2, "data": {"genres": ["Children"], "title": "Jumanji (1995)"}},
#     {"item": 3, "data": {"genres": ["Comedy"], "title": "Grumpier Old Men (1995)"}},
#     {"item": 4, "data": {"genres": ["Drama"], "title": "Waiting to Exhale (1995)"}},
#     {"item": 5, "data": {"genres": ["Romance"], "title": "Father of the Bride Part II (1995)"}},
#     {"item": 6, "data": {"genres": ["Action"], "title": "Heat (1995)"}},
#     {"item": 7, "data": {"genres": ["Fantasy"], "title": "Sabrina (1995)"}},
#     {"item": 8, "data": {"genres": ["Adventure"], "title": "Tom and Huck (1995)"}},
#     {"item": 9, "data": {"genres": ["Thriller"], "title": "Sudden Death (1995)"}},
#     {"item": 10, "data": {"genres": ["Crime"], "title": "GoldenEye (1995)"}},
#     {"item": 11, "data": {"genres": ["Comedy", "Romance"], "title": "American President, The (1995)"}},
#     {"item": 12, "data": {"genres": ["Horror"], "title": "Dracula: Dead and Loving It (1995)"}},
#     {"item": 13, "data": {"genres": ["Animation", "Children"], "title": "Balto (1995)"}},
#     {"item": 14, "data": {"genres": ["Drama"], "title": "Nixon (1995)"}},
#     {"item": 15, "data": {"genres": ["Action", "Adventure"], "title": "Cutthroat Island (1995)"}},
#     {"item": 16, "data": {"genres": ["Crime", "Drama"], "title": "Casino (1995)"}},
#     {"item": 17, "data": {"genres": ["Drama", "Romance"], "title": "Sense and Sensibility (1995)"}},
#     {"item": 18, "data": {"genres": ["Comedy"], "title": "Four Rooms (1995)"}}
# ]

# Hand-written toy dataset: three groups of six movies whose genre lists start
# with "Animation", "Action" and "Comedy" respectively. Each row below is
# (item id, genre list, title) and is expanded into the record shape
# {"item": ..., "data": {"genres": [...], "title": ...}}.
dataset = [
    {"item": item_id, "data": {"genres": genre_list, "title": title}}
    for item_id, genre_list, title in (
        # Animation-led titles
        (1, ["Animation"], "Toy Story (1995)"),
        (13, ["Animation", "Children"], "Balto (1995)"),
        (20, ["Animation", "Children", "Comedy"], "Aladdin (1992)"),
        (21, ["Animation", "Children", "Musical"], "Lion King, The (1994)"),
        (22, ["Animation", "Children", "Musical"], "Beauty and the Beast (1991)"),
        (23, ["Animation", "Children", "Musical"], "Hunchback of Notre Dame, The (1996)"),
        # Action-led titles
        (6, ["Action"], "Heat (1995)"),
        (10, ["Action", "Adventure", "Thriller"], "GoldenEye (1995)"),
        (15, ["Action", "Adventure"], "Cutthroat Island (1995)"),
        (30, ["Action", "Thriller"], "Die Hard (1988)"),
        (31, ["Action", "Sci-Fi", "Thriller"], "Terminator 2: Judgment Day (1991)"),
        (32, ["Action", "Comedy", "Crime"], "Lethal Weapon (1987)"),
        # Comedy-led titles
        (3, ["Comedy", "Drama"], "Grumpier Old Men (1995)"),
        (18, ["Comedy", "Anthology", "Farce"], "Four Rooms (1995)"),
        (40, ["Comedy", "Animation", "Adventure"], "Dumb & Dumber (1994)"),
        (41, ["Comedy", "Action", "Adventure"], "Ace Ventura: Pet Detective (1994)"),
        (42, ["Comedy", "Drama"], "Billy Madison (1995)"),
        (43, ["Comedy", "Romance", "Drama"], "Happy Gilmore (1996)"),
    )
]



In [3]:
dataset[1]

Action(uid=1, item=2, ts=2, data={'genres': ['Comedy'], 'title': 'Caddyshack (1980)'})

In [2]:
import requests

import os

# The TMDB API key is a credential, so it is read from the TMDB_API_KEY
# environment variable instead of being committed with the notebook. Set the
# variable before starting the kernel.
# NOTE(review): a literal key used to be hard-coded on this line and remains in
# the file history; it should be revoked and replaced.
# If the variable is unset the key is empty; the lookups are then expected to
# be rejected by TMDB and fall back to None / "Unknown" -- confirm.
TMDB_API_KEY = os.environ.get('TMDB_API_KEY', '')
# "{}" is filled with the (URL-safe) search text.
TMDB_SEARCH_URL = 'https://api.themoviedb.org/3/search/movie?query={}&api_key=' + TMDB_API_KEY
# "{}" is filled with the TMDB movie id.
TMDB_MOVIE_URL = 'https://api.themoviedb.org/3/movie/{}?api_key=' + TMDB_API_KEY

def search_movie_tmdb(movie_name):
    """Return the TMDB id of the first search result for ``movie_name``.

    Args:
        movie_name: Title to search for. Empty or ``None`` short-circuits to
            ``None`` without contacting the API.

    Returns:
        The ``id`` of the first entry in the response's ``results`` list, or
        ``None`` when the name is empty, the HTTP status is not 200, or the
        result list is empty.

    Raises:
        requests.RequestException: Network failures, including the timeout,
            are not caught here and propagate to the caller.
    """
    from urllib.parse import quote_plus

    if not movie_name:
        return None

    # The title is spliced into a query string, so it must be URL-encoded:
    # an unescaped "&" (as in "Dumb & Dumber") would end the query parameter
    # early and only part of the title would be searched.
    search_url = TMDB_SEARCH_URL.format(quote_plus(str(movie_name)))
    # Without a timeout a stalled connection would block the cell forever.
    response = requests.get(search_url, timeout=10)
    if response.status_code == 200:
        search_results = response.json()
        if search_results['results']:
            return search_results['results'][0]['id']
    return None

def get_tmdb_genres(movie_names):
    """Look up the TMDB genre names for each title in ``movie_names``.

    For every title the movie id is resolved with ``search_movie_tmdb`` and
    the movie's detail record is then fetched from ``TMDB_MOVIE_URL``.

    Args:
        movie_names: Iterable of movie titles.

    Returns:
        dict mapping each title to a list of genre names. The value is
        ``["Unknown"]`` when the search finds nothing, the detail request does
        not return HTTP 200, or the record lists no genres.

    Raises:
        requests.RequestException: Network failures, including the timeout,
            are not caught here and propagate to the caller.
    """
    genres_dict = {}
    for movie_name in movie_names:
        # Start from the fallback; it is replaced only on a fully successful lookup.
        movie_genres = ["Unknown"]
        movie_id = search_movie_tmdb(movie_name)
        # Compare against None explicitly so the check means "no search hit"
        # rather than "falsy id".
        if movie_id is not None:
            # Without a timeout a stalled connection would block the cell forever.
            response = requests.get(TMDB_MOVIE_URL.format(movie_id), timeout=10)
            if response.status_code == 200:
                movie_data = response.json()
                # "or []" also covers a record whose "genres" field is null.
                found = [genre['name'] for genre in (movie_data.get('genres') or [])]
                if found:
                    movie_genres = found
        genres_dict[movie_name] = movie_genres
    return genres_dict

In [14]:
# Titles to look up on TMDB, written without the release year.
movies = [
    'Grumpier Old Men',
    'Four Rooms',
    'Dumb & Dumber',
    'Ace Ventura: Pet Detective',
    'Billy Madison',
    'Happy Gilmore',
]
# Maps each title to the list of genre names returned by get_tmdb_genres.
genres = get_tmdb_genres(movies)

In [15]:
print(genres)

{'Grumpier Old Men': ['Romance', 'Comedy'], 'Four Rooms': ['Comedy'], 'Dumb & Dumber': ['Comedy'], 'Ace Ventura: Pet Detective': ['Comedy', 'Mystery'], 'Billy Madison': ['Comedy'], 'Happy Gilmore': ['Comedy']}


In [30]:
# !pip install sentence_transformers
#!pip install numpy
#!pip install plotly

In [38]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sentence_transformers import SentenceTransformer
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'

# Sentence encoder used to embed each movie's genre list.
model = SentenceTransformer('sentence-transformers/sentence-t5-base')


def get_embeddings(genres):
    """Encode a list of genre names as a single embedding.

    The names are joined with single spaces and the resulting string is passed
    to ``model.encode``; whatever that call returns is returned unchanged.
    """
    return model.encode(' '.join(genres))


# Index genres and titles by item id. Every per-item sequence below is derived
# from `unique_items`, so embedding rows and titles stay aligned even if an
# item id appears more than once in `dataset` (the last occurrence wins).
item_to_genre = {entry["item"]: entry["data"]["genres"] for entry in dataset}
item_to_title = {entry["item"]: entry["data"]["title"] for entry in dataset}
unique_items = list(item_to_genre.keys())

# Stack the per-item vectors instead of filling a pre-allocated (n, 768)
# buffer, so the matrix width follows the encoder output rather than a
# hard-coded constant. float64 is the dtype the np.zeros buffer used to have.
genre_embeddings = np.asarray(
    [get_embeddings(item_to_genre[item_id]) for item_id in unique_items],
    dtype=np.float64,
)

# Project the embeddings onto their first two principal components.
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(genre_embeddings)

# Cluster in the reduced 2-D space; random_state pins the initialisation.
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(reduced_embeddings)
cluster_labels = kmeans.labels_

# One row per unique item, in the same order as the embedding rows.
titles = [item_to_title[item_id] for item_id in unique_items]
df = pd.DataFrame({
    'PCA1': reduced_embeddings[:, 0],
    'PCA2': reduced_embeddings[:, 1],
    'Title': titles,
    'Cluster': cluster_labels
})

# Interactive scatter plot; hovering a marker shows the movie title.
fig = px.scatter(df, x='PCA1', y='PCA2', color='Cluster', hover_data=['Title'], title='PCA of Genre Embeddings with Clustering')
fig.show()






# Items with exactly the same genre list have the same position on the graph, hence you see fewer points than items.

In [37]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
import plotly.express as px
import plotly.io as pio
pio.renderers.default = 'iframe'  

# Collect every genre that occurs in the dataset.
unique_genres = set(genre for movie in dataset for genre in movie['data']['genres'])
# Number the genres in sorted order. Iterating the set directly would give a
# column order that depends on string hashing, which is randomised per Python
# process, so the one-hot layout could differ after every kernel restart.
genre_to_index = {genre: idx for idx, genre in enumerate(sorted(unique_genres))}

# One-hot encode the genres: one row per dataset entry, one column per genre.
num_genres = len(unique_genres)
genre_embeddings = np.zeros((len(dataset), num_genres))

for idx, movie in enumerate(dataset):
    for genre in movie['data']['genres']:
        genre_embeddings[idx, genre_to_index[genre]] = 1

# Project the one-hot vectors onto their first two principal components.
pca = PCA(n_components=2)
reduced_embeddings = pca.fit_transform(genre_embeddings)

# Cluster in the reduced 2-D space; random_state pins the initialisation.
num_clusters = 3
kmeans = KMeans(n_clusters=num_clusters, random_state=0).fit(reduced_embeddings)
cluster_labels = kmeans.labels_

# One row per dataset entry, in the same order as the one-hot rows.
titles = [movie["data"]["title"] for movie in dataset]
df = pd.DataFrame({
    'PCA1': reduced_embeddings[:, 0],
    'PCA2': reduced_embeddings[:, 1],
    'Title': titles,
    'Cluster': cluster_labels
})

# Interactive scatter plot; hovering a marker shows the movie title.
fig = px.scatter(df, x='PCA1', y='PCA2', color='Cluster', hover_data=['Title'], title='PCA of Genre Embeddings with Clustering')
fig.show()




