In [4]:
import onnxruntime as ort
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity

In [5]:
net = ort.InferenceSession("../models/ncf/model-q.onnx")

In [6]:
movies = pd.read_csv("../models/ncf/movies.csv")
ratings = pd.read_parquet("../models/ncf/ratings.parquet", engine="pyarrow")
embeddings = np.load('../models/ncf/embeddings.npy')

In [495]:
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

# ratings['user'] = user_encoder.fit_transform(ratings['userId'])
# ratings['movie'] = movie_encoder.fit_transform(ratings['movieId'])
# ratings.drop(columns=['user', 'movie'], inplace=True)

user_encoder.fit_transform(ratings['userId'])
movie_encoder.fit_transform(ratings['movieId'])

array([   16,    24,    28, ..., 15201, 15229, 16531], dtype=int64)

In [497]:
all_genres = ['Action', 'Adventure', 'Animation', 'Children', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'IMAX', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

In [498]:
def recommend_movies(user_id, net, movies, user_encoder, movie_encoder, ratings, top_n=10):
    try:
        user_encoded = user_encoder.transform([user_id])[0]
    except:
        print("User ID Invalid")
        return pd.DataFrame()
    
    all_movies = np.arange(84432)

    user_data = ratings[ratings['userId'] == user_id]
    rated_movie_ids = user_data['movieId'].unique()
    rated_movies = movie_encoder.transform(rated_movie_ids)
    
    movies_to_predict = np.setdiff1d(all_movies, rated_movies).astype(np.int64)
    
    predicted_ratings = []
    movie_ids = []
    
    batch_size = 1024
    for i in range(0, len(movies_to_predict), batch_size):
        batch_movies = movies_to_predict[i:i+batch_size]
        batch_users = np.array([user_encoded] * len(batch_movies), dtype=np.int64)
        inputs = {
            'user_input': batch_users,
            'movie_input': batch_movies
        }
        outputs = net.run(['rating_output'], inputs)[0]
        predicted_ratings.extend(outputs.flatten())
        batch_movie_ids = movie_encoder.inverse_transform(batch_movies)
        movie_ids.extend(batch_movie_ids)
    
    predictions_df = pd.DataFrame({
        'movieId': movie_ids,
        'predicted_rating': predicted_ratings
    })
    
    recommendations = predictions_df.merge(movies[['movieId', 'title']], on='movieId')
    recommendations = recommendations.sort_values(by='predicted_rating', ascending=False).head(top_n)
    recommendations["predicted_rating"] = np.floor(recommendations["predicted_rating"] * 100) / 100
    
    recommendations = recommendations.reset_index(drop=True)
    recommendations.index += 1
    
    recommendations.rename(columns={
    "movieId": "Movie ID",
    "title": "Title",
    "predicted_rating": "Predicted Rating"
    }, inplace=True)
    recommendations["Year"] = recommendations["Title"].str.extract(r"\((\d{4})\)")
    recommendations["Title"] = recommendations["Title"].str.replace(r" \(\d{4}\)", "", regex=True)
    recommendations = recommendations[["Movie ID", "Title", "Year", "Predicted Rating"]]
    return recommendations

In [499]:
user_id=2
print(f"Recommended Movies for: User {user_id}")
user_recommendations = recommend_movies(
    user_id=user_id,
    net=net,
    movies=movies,
    user_encoder=user_encoder,
    movie_encoder=movie_encoder,
    ratings=ratings,
    top_n=10
)
display(user_recommendations)

Recommended Movies for: User 2


Unnamed: 0,Movie ID,Title,Year,Predicted Rating
1,129788,Raanjhanaa,2013,5.0
2,927,"Women, The",1939,4.99
3,918,Meet Me in St. Louis,1944,4.97
4,121097,To Grandmother's House We Go,1992,4.96
5,7085,Send Me No Flowers,1964,4.96
6,93988,North & South,2004,4.95
7,129032,Sense & Sensibility,2008,4.95
8,1035,"Sound of Music, The",1965,4.95
9,142929,Sissi,1955,4.95
10,256991,Adventure Time: Elements,2017,4.94


In [500]:
def recommend_similar_movies_id(movie_id, movies, movie_encoder, embeddings, top_n=10):
    
    if movie_id not in movies['movieId'].values:
        print("Movie ID Invalid")
        return pd.DataFrame()
    # print(f"Recommended Movies Similar to: {movies[movies['movieId'] == movie_id]['title'].values[0]}")
    
    movie_encoded = movie_encoder.transform([movie_id])[0]
    target_embedding = embeddings[movie_encoded].reshape(1, -1)
    
    similarities = cosine_similarity(target_embedding, embeddings).flatten()
    similar_indices = similarities.argsort()[-(top_n + 1):-1][::-1]
    similar_movie_ids = movie_encoder.inverse_transform(similar_indices)
    
    recommendations = movies[movies['movieId'].isin(similar_movie_ids)][['movieId', 'title']]
    recommendations['similarity'] = similarities[similar_indices]
    recommendations["similarity"] = np.ceil(recommendations["similarity"] * 1000) / 100
    
    recommendations = recommendations.reset_index(drop=True)
    recommendations.index += 1
    
    recommendations.rename(columns={
    "movieId": "Movie ID",
    "title": "Title",
    "similarity": "Cosine Similarity"
    }, inplace=True)
    recommendations["Year"] = recommendations["Title"].str.extract(r"\((\d{4})\)")
    recommendations["Title"] = recommendations["Title"].str.replace(r" \(\d{4}\)", "", regex=True)
    recommendations = recommendations[["Movie ID", "Title", "Year", "Cosine Similarity"]]
    return recommendations

In [501]:
movie_id=1
similar_movies = recommend_similar_movies_id(
    movie_id=movie_id,
    movies=movies,
    movie_encoder=movie_encoder,
    embeddings=embeddings,
    top_n=10
)
display(similar_movies)

Unnamed: 0,Movie ID,Title,Year,Cosine Similarity
1,588,Aladdin,1992,9.5
2,2355,"Bug's Life, A",1998,8.94
3,3114,Toy Story 2,1999,8.74
4,4886,"Monsters, Inc.",2001,8.69
5,6377,Finding Nemo,2003,8.4
6,8961,"Incredibles, The",2004,8.01
7,50872,Ratatouille,2007,7.9
8,60069,WALL·E,2008,7.78
9,68954,Up,2009,7.61
10,78499,Toy Story 3,2010,7.38


In [502]:
def recommend_similar_movies_name(movie_name, movies, movie_encoder, embeddings, top_n=10):
    
    movie_name = movie_name.strip()
    matching_movies = movies[movies['title'].str.contains(movie_name, case=False, regex=True)]
    if matching_movies.empty:
        print("Movie Name Invalid")
        return pd.DataFrame()
    movie_id = matching_movies.iloc[0]['movieId']
    # print(f"Recommended Movies Similar to: {movies[movies['movieId'] == movie_id]['title'].values[0]}")
    
    movie_encoded = movie_encoder.transform([movie_id])[0]
    target_embedding = embeddings[movie_encoded].reshape(1, -1)
    
    similarities = cosine_similarity(target_embedding, embeddings).flatten()
    similar_indices = similarities.argsort()[-(top_n + 1):-1][::-1]
    similar_movie_ids = movie_encoder.inverse_transform(similar_indices)
    
    recommendations = movies[movies['movieId'].isin(similar_movie_ids)][['movieId', 'title']]
    recommendations['similarity'] = similarities[similar_indices]
    recommendations["similarity"] = np.ceil(recommendations["similarity"] * 1000) / 100
    
    recommendations = recommendations.reset_index(drop=True)
    recommendations.index += 1
    
    recommendations.rename(columns={
    "movieId": "Movie ID",
    "title": "Title",
    "similarity": "Cosine Similarity"
    }, inplace=True)
    recommendations["Year"] = recommendations["Title"].str.extract(r"\((\d{4})\)")
    recommendations["Title"] = recommendations["Title"].str.replace(r" \(\d{4}\)", "", regex=True)
    recommendations = recommendations[["Movie ID", "Title", "Year", "Cosine Similarity"]]
    return recommendations

In [503]:
movie_name="Godfather"
similar_movies = recommend_similar_movies_name(
    movie_name=movie_name,
    movies=movies,
    movie_encoder=movie_encoder,
    embeddings=embeddings,
    top_n=10
)
display(similar_movies)

Unnamed: 0,Movie ID,Title,Year,Cosine Similarity
1,111,Taxi Driver,1976.0,9.73
2,912,Casablanca,1942.0,8.1
3,1204,Lawrence of Arabia,1962.0,7.37
4,1208,Apocalypse Now,1979.0,7.28
5,1213,Goodfellas,1990.0,7.23
6,1221,"Godfather: Part II, The",1974.0,7.17
7,1228,Raging Bull,1980.0,7.11
8,1250,"Bridge on the River Kwai, The",1957.0,7.1
9,55820,No Country for Old Men,2007.0,7.05
10,214500,Horn from the Heart: The Paul Butterfield Story,,6.98


In [504]:
def recommend_by_genre_pop(genres, movies, ratings, top_n=10):
    valid_genres = sorted(all_genres)
    for genre in genres:
        if genre not in valid_genres:
            print(f"Genre '{genre}' Invalid. Valid Genres: {valid_genres}")
            return pd.DataFrame()
    
    filtered_movies = movies
    for genre in genres:
        filtered_movies = filtered_movies[filtered_movies[genre] == 1]
    
    if filtered_movies.empty:
        print("No Movies Found w/ Specified Genre(s)")
        return pd.DataFrame()
    
    popularity = ratings.groupby('movieId').size().reset_index(name='rating_count')
    
    recommendations = filtered_movies.merge(popularity, on='movieId', how='left').fillna({'rating_count': 0})
    recommendations = recommendations.sort_values(by='rating_count', ascending=False)
    recommendations = recommendations[['movieId', 'title', 'rating_count']].head(top_n)
    recommendations["rating_count"] = recommendations["rating_count"].astype(int)
    
    recommendations = recommendations.reset_index(drop=True)
    recommendations.index += 1
    
    recommendations.rename(columns={
    "movieId": "Movie ID",
    "title": "Title",
    "rating_count": "Total Rating"
    }, inplace=True)
    recommendations["Year"] = recommendations["Title"].str.extract(r"\((\d{4})\)")
    recommendations["Title"] = recommendations["Title"].str.replace(r" \(\d{4}\)", "", regex=True)
    recommendations = recommendations[["Movie ID", "Title", "Year", "Total Rating"]]
    
    return recommendations

In [505]:
selected_genres = ['Action', 'Adventure']
print(f"Recommended Movies w/ Genres: {', '.join(selected_genres)}")
genre_recommendations = recommend_by_genre_pop(
    genres=selected_genres,
    movies=movies,
    ratings=ratings,
    top_n=10
)
display(genre_recommendations)

Recommended Movies w/ Genres: Action, Adventure


Unnamed: 0,Movie ID,Title,Year,Total Rating
1,260,Star Wars: Episode IV - A New Hope,1977,85010
2,480,Jurassic Park,1993,75233
3,1196,Star Wars: Episode V - The Empire Strikes Back,1980,72151
4,1210,Star Wars: Episode VI - Return of the Jedi,1983,67496
5,7153,"Lord of the Rings: The Return of the King, The",2003,67449
6,1198,Raiders of the Lost Ark (Indiana Jones and the...,1981,67408
7,3578,Gladiator,2000,57449
8,780,Independence Day (a.k.a. ID4),1996,57224
9,6539,Pirates of the Caribbean: The Curse of the Bla...,2003,48722
10,380,True Lies,1994,47148


In [506]:
def recommend_by_genre_avg(genres, movies, ratings, top_n=10):
    valid_genres = sorted(all_genres)
    for genre in genres:
        if genre not in valid_genres:
            print(f"Genre '{genre}' is not recognized. Valid genres are: {valid_genres}")
            return pd.DataFrame()
    
    filtered_movies = movies
    for genre in genres:
        filtered_movies = filtered_movies[filtered_movies[genre] == 1]
    
    if filtered_movies.empty:
        print("No movies found with the specified genres.")
        return pd.DataFrame()
    
    rating_stats = ratings.groupby('movieId').agg(
        average_rating=('rating', 'mean'),
        rating_count=('rating', 'size')
    ).reset_index()
    
    recommendations = filtered_movies.merge(rating_stats, on='movieId', how='left').fillna({'average_rating': 0, 'rating_count': 0})
    recommendations = recommendations[recommendations['rating_count'] >= 10]
    recommendations = recommendations.sort_values(by='average_rating', ascending=False)
    recommendations = recommendations[['movieId', 'title', 'average_rating', 'rating_count']].head(top_n)
    recommendations["average_rating"] = np.ceil(recommendations["average_rating"] * 1000) / 1000
    recommendations["rating_count"] = recommendations["rating_count"].astype(int)
    
    recommendations = recommendations.reset_index(drop=True)
    recommendations.index += 1
    
    recommendations.rename(columns={
    "movieId": "Movie ID",
    "title": "Title",
    "average_rating": "Average Rating",
    "rating_count": "Total Rating"
    }, inplace=True)
    recommendations["Year"] = recommendations["Title"].str.extract(r"\((\d{4})\)")
    recommendations["Title"] = recommendations["Title"].str.replace(r" \(\d{4}\)", "", regex=True)
    recommendations = recommendations[["Movie ID", "Title", "Year", "Average Rating", "Total Rating"]]
    
    return recommendations

In [507]:
selected_genres = ['Horror', 'Adventure']
print(f"Recommended Movies w/ Genres: {', '.join(selected_genres)}")
genre_recommendations = recommend_by_genre_avg(
    genres=selected_genres,
    movies=movies,
    ratings=ratings,
    top_n=10
)
display(genre_recommendations)

Recommended Movies w/ Genres: Horror, Adventure


Unnamed: 0,Movie ID,Title,Year,Average Rating,Total Rating
1,103912,Giorgino,1994,4.034,15
2,1200,Aliens,1986,4.008,38846
3,57502,Cat Soup (Nekojiru-so),2001,3.796,384
4,1215,Army of Darkness,1993,3.732,14379
5,159403,Neo Tokyo,1987,3.699,53
6,130508,Berserk: The Golden Age Arc - The Egg of the King,2012,3.692,251
7,53519,Death Proof,2007,3.546,6248
8,2366,King Kong,1933,3.541,7284
9,40732,"Descent, The",2005,3.531,4347
10,122029,The Hound of the Baskervilles,2002,3.5,15


In [508]:
# def recommend_combined(movie_id, genres, movies, movie_encoder, embeddings, ratings, top_n=10):
#     if movie_id not in movies['movieId'].values:
#         print("Movie ID Invalid")
#         return pd.DataFrame()
    
#     movie_encoded = movie_encoder.transform([movie_id])[0]
#     target_embedding = embeddings[movie_encoded].reshape(1, -1)
    
#     similarities = cosine_similarity(target_embedding, embeddings).flatten()
    
#     similar_indices = similarities.argsort()[-(top_n * 2 + 1):-1][::-1]
#     similar_movie_ids = movie_encoder.inverse_transform(similar_indices)
#     similar_movies = movies[movies['movieId'].isin(similar_movie_ids)].copy()
#     similar_movies['similarity'] = similarities[similar_indices]
    
#     for genre in genres:
#         if genre in all_genres:
#             similar_movies = similar_movies[similar_movies[genre] == 1]
#         else:
#             print(f"Genre '{genre}' Invalid.")
#             return pd.DataFrame()
    
#     if similar_movies.empty:
#         print("No Movies Found w/ Specified Genre(s)")
#         return pd.DataFrame()
    
#     popularity = ratings.groupby('movieId').size().reset_index(name='rating_count')
#     similar_movies = similar_movies.merge(popularity, on='movieId', how='left').fillna({'rating_count': 0})
    
#     similar_movies = similar_movies.sort_values(by=['similarity', 'rating_count'], ascending=[False, False])
#     recommendations = similar_movies[['movieId', 'title', 'similarity', 'rating_count']].head(top_n)
    
#     return recommendations.reset_index(drop=True)

In [509]:
# movie_id = 100
# selected_genres = ['Action']
# print(f"Recommended Movies Similar to: {movies[movies['movieId'] == movie_id]['title'].values[0]} w/ Genres: {', '.join(selected_genres)}")
# combined_recommendations = recommend_combined(
#     movie_id=movie_id,
#     genres=selected_genres,
#     movies=movies,
#     movie_encoder=movie_encoder,
#     embeddings=embeddings,
#     ratings=ratings,
#     top_n=10
# )
# display(combined_recommendations)

In [510]:
def recommend_combined_mix(movie_name, genres, movies, movie_encoder, embeddings, ratings, top_n=10):
    similar_movies = recommend_similar_movies_name(movie_name, movies, movie_encoder, embeddings, top_n=top_n*2)
    if similar_movies.empty:
        return pd.DataFrame()
    
    similar_movie_ids = similar_movies["Movie ID"].astype(int).values
    
    filtered_movies = movies[movies['movieId'].isin(similar_movie_ids)].copy()
    for genre in genres:
        if genre in all_genres:
            filtered_movies = filtered_movies[filtered_movies[genre] == 1]
        else:
            print(f"Genre '{genre}' Invalid.")
            return pd.DataFrame() 
    
    if filtered_movies.empty:
        print("No Movies Found w/ Specified Genre(s)")
        return pd.DataFrame()
    
    popularity = ratings.groupby('movieId').size().reset_index(name='rating_count')
    filtered_movies = filtered_movies.merge(popularity, on='movieId', how='left').fillna({'rating_count': 0})
    
    filtered_movies = filtered_movies.merge(similar_movies[['Movie ID', 'Cosine Similarity']],
                                            left_on='movieId', right_on='Movie ID', how='left')
    
    filtered_movies = filtered_movies.sort_values(by=['Cosine Similarity', 'rating_count'], ascending=[False, False])
    
    recommendations = filtered_movies[['movieId', 'title', 'Cosine Similarity', 'rating_count']].head(top_n)
    
    recommendations.rename(columns={
        'movieId': 'Movie ID',
        'title': 'Title',
        'rating_count': 'Total Rating'
    }, inplace=True)
    recommendations["Year"] = recommendations["Title"].str.extract(r"\((\d{4})\)")
    recommendations["Title"] = recommendations["Title"].str.replace(r" \(\d{4}\)", "", regex=True)
    recommendations = recommendations[["Movie ID", "Title", "Year", "Cosine Similarity", "Total Rating"]]
    recommendations = recommendations.reset_index(drop=True)
    recommendations.index += 1
    return recommendations

In [511]:
movie_name = "Godfather"
selected_genres = ['Action']
combined_recommendations = recommend_combined_mix(
    movie_name=movie_name,
    genres=selected_genres,
    movies=movies,
    movie_encoder=movie_encoder,
    embeddings=embeddings,
    ratings=ratings,
    top_n=10
)
display(combined_recommendations)

Unnamed: 0,Movie ID,Title,Year,Cosine Similarity,Total Rating
1,1208,Apocalypse Now,1979,7.17,32518
2,1254,"Treasure of the Sierra Madre, The",1948,6.91,6377
3,170705,Band of Brothers,2001,6.53,2811
