In [1]:
import os
from datetime import datetime, timedelta

import pandas as pd
import requests
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MultiLabelBinarizer

In [2]:
# TMDB API key, sent as the `api_key` query parameter by the request cells below.
# SECURITY: supply the key through the TMDB_API_KEY environment variable.
# The literal fallback is kept only so existing runs keep working; it is a
# credential committed in plain text and should be rotated and removed before
# this notebook is shared.
api_key = os.environ.get('TMDB_API_KEY') or 'cd63a32f7aac7d22fbc21b29f6bd8bd1'

In [3]:
# Genre endpoint: download the movie-genre list and build an {id: name} lookup.
url = f'https://api.themoviedb.org/3/genre/movie/list?api_key={api_key}&language=en-US'
# A timeout keeps the cell from hanging indefinitely if the server never answers.
response = requests.get(url, timeout=10)
# Raise on an HTTP error status (4xx/5xx) here, rather than failing later with
# a KeyError on 'genres' when the body is an error payload.
response.raise_for_status()
data = response.json()
genre_dict = {genre['id']: genre['name'] for genre in data['genres']}
print(genre_dict)

{28: 'Action', 12: 'Adventure', 16: 'Animation', 35: 'Comedy', 80: 'Crime', 99: 'Documentary', 18: 'Drama', 10751: 'Family', 14: 'Fantasy', 36: 'History', 27: 'Horror', 10402: 'Music', 9648: 'Mystery', 10749: 'Romance', 878: 'Science Fiction', 10770: 'TV Movie', 53: 'Thriller', 10752: 'War', 37: 'Western'}


In [4]:
# Movie discovery endpoint: collect the 'results' entries of pages 1-5 into one list.
all_movies = []

for page in range(1, 6):  # Pages 1 to 5
    movie_url = f'https://api.themoviedb.org/3/discover/movie?api_key={api_key}&language=en-US&page={page}'
    # A timeout keeps one stalled request from blocking the whole loop.
    response = requests.get(movie_url, timeout=10)
    # Stop on an HTTP error status (4xx/5xx) instead of hitting a KeyError on
    # 'results' when the body is an error payload.
    response.raise_for_status()
    data = response.json()
    all_movies.extend(data['results'])

In [5]:
# Output column name -> key in each raw movie record from all_movies.
column_to_field = {
    'Title': 'title',
    'Release Date': 'release_date',
    'Popularity': 'popularity',
    'Vote Average': 'vote_average',
    'Vote Count': 'vote_count',
    'Genre IDs': 'genre_ids',
}

# One flat record per movie, keeping only the fields listed above.
records = [
    {column: movie[field] for column, field in column_to_field.items()}
    for movie in all_movies
]
df = pd.DataFrame(records)

print(df.head())

                      Title Release Date  Popularity  Vote Average  \
0  How to Train Your Dragon   2025-06-06   1173.2922         8.121   
1                 M3GAN 2.0   2025-06-25    897.1130         7.738   
2          Man with No Past   2025-01-13    549.8021         6.600   
3       Karate Kid: Legends   2025-05-08    451.4609         7.283   
4                  Superman   2025-07-09    434.5607         7.511   

   Vote Count        Genre IDs  
0         983  [14, 10751, 28]  
1         361    [28, 878, 53]  
2          19         [28, 18]  
3         469     [28, 12, 18]  
4         942    [878, 12, 28]  


In [6]:
def _genre_names(ids):
    """Map a list of genre ids to names via genre_dict; unmapped ids become 'Unknown'."""
    return [genre_dict.get(genre_id, 'Unknown') for genre_id in ids]


# Readable genre names alongside the raw id lists
df['Genres'] = df['Genre IDs'].apply(_genre_names)

# Parse release dates; values that cannot be parsed are coerced to NaT
df['Release Date'] = pd.to_datetime(df['Release Date'], errors='coerce')

# Anything released on or after this moment (90 days back from now) is excluded
cutoff_date = datetime.today() - timedelta(days=90)

# Hidden gem = rated at least 7.0, fewer than 1000 votes, released before the cutoff
well_rated = df['Vote Average'] >= 7.0
few_votes = df['Vote Count'] < 1000
old_enough = df['Release Date'] < cutoff_date

hidden_gems = df[well_rated & few_votes & old_enough]
print(hidden_gems)

                                         Title Release Date  Popularity  \
17                  Kaiju No. 8: Mission Recon   2025-03-28    174.7107   
22                                  The Cellar   2024-09-08    118.8486   
24                Jokōsei torio: seikan shiken   1977-02-23    114.1674   
26                             Muromachi Burai   2025-01-17    106.9167   
40                                    In Vitro   2025-03-27     81.8295   
45                            Stepmom's Desire   2020-05-29     70.7375   
49                                      Belyas   2025-02-07     68.0899   
59                                      Ne Zha   2019-07-26     55.9741   
61                                       Night   2024-09-07     54.1185   
68                    Playboy Playmate Workout   1983-01-01     49.8804   
77                 Squid Game: Making Season 2   2025-01-02     46.8564   
79                                     Warfare   2025-04-09     45.1839   
84                       

In [7]:
# One-hot encode the 'Genres' lists: one binary column per distinct genre label.
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(hidden_gems['Genres'])

# Reuse hidden_gems' index so the indicator rows line up with their movies.
genre_df = pd.DataFrame(
    genre_encoded,
    index=hidden_gems.index,
    columns=mlb.classes_,
)

# Place the indicator columns next to the original hidden_gems columns.
hidden_gems_encoded = pd.concat((hidden_gems, genre_df), axis=1)

In [8]:
def recommend_by_genre(title, top_n=5):
    """Recommend hidden gems whose genre profile is closest to ``title``.

    Steps:
      1. Look up the one-hot genre vector of ``title`` in ``hidden_gems_encoded``.
      2. Compute its cosine similarity against the genre vector of every row.
      3. Return the ``top_n`` highest-scoring rows, excluding ``title`` itself.

    The module-level ``hidden_gems_encoded`` frame is only read: scores are
    attached to a copy, so repeated calls do not leave a stale ``Similarity``
    column behind on the shared frame.

    Parameters
    ----------
    title : str
        Exact value to match against the ``'Title'`` column.
    top_n : int, default 5
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame or str
        A frame with the columns ``'Title'``, ``'Genres'``, ``'Vote Average'``
        and ``'Similarity'``, sorted by descending similarity; or a message
        string when ``title`` is not present in ``hidden_gems_encoded``.
    """
    if title not in hidden_gems_encoded['Title'].values:
        return f"'{title}' not found in hidden gem list."

    # Genre indicator columns, in the order the binarizer produced them
    genre_columns = list(mlb.classes_)
    genre_matrix = hidden_gems_encoded[genre_columns].values

    # Genre vector of the target movie. If several rows share the title, only
    # the first is used (the original took row 0 of the similarity matrix).
    is_target = (hidden_gems_encoded['Title'] == title).values
    target_vector = genre_matrix[is_target][:1]

    # One similarity score per row of hidden_gems_encoded
    similarities = cosine_similarity(target_vector, genre_matrix)[0]

    # Attach the scores to a copy rather than mutating the shared frame
    scored = hidden_gems_encoded.assign(Similarity=similarities)

    # Drop the queried movie itself, then keep the top N most similar
    results = scored[scored['Title'] != title]
    results = results.sort_values(by='Similarity', ascending=False).head(top_n)

    return results[['Title', 'Genres', 'Vote Average', 'Similarity']]

In [9]:
# Example: the five hidden gems closest in genre to 'Muromachi Burai'
recommend_by_genre(title='Muromachi Burai', top_n=5)

Unnamed: 0,Title,Genres,Vote Average,Similarity
49,Belyas,[Drama],7.3,0.707107
24,Jokōsei torio: seikan shiken,"[Romance, Drama]",7.3,0.5
45,Stepmom's Desire,"[Drama, Romance]",7.36,0.5
79,Warfare,"[War, Action]",7.2,0.5
84,Dovbush,"[History, Drama]",7.3,0.5
