# Determine Movie Genre by Neighboring Movies

With the k-nearest neighbors method, we find the top-k movies most similar to a target film and use them to predict the target movie's genre (and, in the final section, its rating).

Similarity is the Jaccard similarity between the sets of actors in two movies, $J(A, B) = |A \cap B| \,/\, |A \cup B|$. We rank all movies by this score and select the top-k most similar ones.

In [19]:
import json

import pandas as pd
import numpy as np

from scipy.sparse import lil_matrix

import matplotlib.pyplot as plt


In [20]:
# Lookup tables filled in a single pass over the data file:
#   actor_name_map:  actor id -> actor name
#   movie_actor_map: movie "imdb_id" -> {"movie": title, "actors": set of actor ids,
#                                         "genres": list of genres, "rating": average rating}
#   actor_genre_map: actor id -> {genre: number of kept movies of that genre}
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}

# The file holds one JSON document per line. Decode it explicitly as UTF-8
# instead of relying on the platform's default text encoding, which may
# mis-decode (or fail on) non-ASCII names and titles.
with open("imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Read the movie on this line and parse its json
        this_movie = json.loads(line)

        # Skip movies with no ratings (an unrated movie carries an empty "rating")
        if len(this_movie["rating"]) == 0:
            continue

        for actor_id, actor_name in this_movie["actors"]:
            # Add the actor to the id->name map
            actor_name_map[actor_id] = actor_name

            # Increment this actor's per-genre counts with this movie's genres
            genre_counts = actor_genre_map.setdefault(actor_id, {})
            for g in this_movie["genres"]:
                genre_counts[g] = genre_counts.get(g, 0) + 1

        # Finished with this film: keep only the fields used downstream
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {actor_id for actor_id, _ in this_movie["actors"]},
            "genres": this_movie["genres"],
            "rating": this_movie["rating"]["avg"],
        }

In [21]:
# Sanity check: report how many actors and movies were loaded.
for label, table in (("Known Actors:", actor_name_map), ("Known Movies:", movie_actor_map)):
    print(label, len(table))

Known Actors: 29499
Known Movies: 18841


In [22]:
# Give every known actor a dense integer index (0, 1, 2, ...) following the
# insertion order of actor_name_map.
actor_id_to_index = dict(zip(actor_name_map, range(len(actor_name_map))))


In [23]:
# Id of the movie to predict for; it is used as a key into movie_actor_map.
# Exactly one assignment should be active: comment/uncomment to switch targets.
target_movie_id = "tt0317705" # The Incredibles
# target_movie_id = "tt0816692" # Interstellar
# target_movie_id = "tt0332280" # The Notebook

In [24]:
# Look up the target's record (title, actor-id set, genres, average rating).
# Raises KeyError if the id is not in movie_actor_map, e.g. because the movie
# had no ratings and was skipped while loading.
target_movie_object = movie_actor_map[target_movie_id]

In [25]:
# Display the target record; its genres and rating are the ground truth the
# predictions below are compared against.
target_movie_object

{'movie': 'The Incredibles',
 'actors': {'nm0000168', 'nm0000456', 'nm0005134', 'nm0005266'},
 'genres': ['Action', 'Adventure', 'Animation'],
 'rating': 8.0}

## Find the Most Similar Movies by Jaccard Similarity of Actors

In [26]:
# Score every other movie by the Jaccard similarity between its actor set and
# the target's actor set: |intersection| / |union|.
# The result is a list of {"movie": movie id, "jaccard": score} records.
target_actors = target_movie_object["actors"]  # loop-invariant, look it up once

movie_similarities = []

for this_movie_id, this_movie_obj in movie_actor_map.items():
    # Skip the target movie
    if this_movie_id == target_movie_id:
        continue

    this_actors = this_movie_obj["actors"]
    this_intersect = this_actors & target_actors
    this_union = this_actors | target_actors

    # If neither movie lists any actor the union is empty; score that pair as
    # 0.0 (nothing in common) instead of raising ZeroDivisionError.
    jaccard = len(this_intersect) / len(this_union) if this_union else 0.0

    # Add this movie and its Jaccard similarity to the list, so we can rank at the end
    movie_similarities.append({
        "movie": this_movie_id,
        "jaccard": jaccard,
    })

In [27]:
# One row per candidate movie: "movie" holds the movie id and "jaccard" its
# actor-set similarity to the target.
similarity_df = pd.DataFrame(movie_similarities, columns=["movie", "jaccard"])

## Use the top-k similar movies to infer genre

In [28]:
# Number of most-similar movies (nearest neighbours) used for the genre and
# rating predictions below.
k_nn = 1

In [29]:
# Preview the k_nn rows with the highest Jaccard similarity to the target.
similarity_df.sort_values(by="jaccard", ascending=False).head(k_nn)

Unnamed: 0,movie,jaccard
13851,tt3606756,0.333333


In [30]:
# Take the k_nn most similar movies and count how often each genre occurs
# among them; the counts are the "votes" for the target movie's genre.
ranked_by_similarity = similarity_df.sort_values(by="jaccard", ascending=False)
top_movies = ranked_by_similarity.head(k_nn)

relevant_genre_count = {}

for this_movie_id in top_movies["movie"]:
    neighbor = movie_actor_map[this_movie_id]

    # Show which neighbour is voting and with which genres
    print(this_movie_id, neighbor["movie"])
    print("\t", neighbor["genres"])

    for g in neighbor["genres"]:
        relevant_genre_count.setdefault(g, 0)
        relevant_genre_count[g] += 1
        

tt3606756 Incredibles 2
	 ['Action', 'Adventure', 'Animation']


In [31]:
# NOTE(review): `this_movie` is the loop variable left over from the
# data-loading cell, so this shows the last record parsed from the file,
# including a record that was skipped for having no rating. It is unrelated to
# the genre prediction and looks like a leftover debugging inspection;
# consider removing this cell.
this_movie

{'imdb_id': 'tt9907608',
 'title': 'Footloose in the Cotswolds: Part 2',
 'year': '2016',
 'runtime': '102',
 'genres': ['Documentary'],
 'actors': [['nm1644256', 'Debra Rixon']],
 'rating': []}

In [32]:
# List the neighbours' genres from most to least frequent; genres with equal
# counts keep the order in which they were first seen.
genre_ranking = sorted(relevant_genre_count.items(), key=lambda pair: pair[1], reverse=True)
for g, count in genre_ranking:
    print(g, count)

Action 1
Adventure 1
Animation 1


## Use the top-k similar movies to infer rating

In [33]:
# Collect the average ratings of the k_nn most similar movies; their mean is
# used as the predicted rating for the target.
ranked_by_similarity = similarity_df.sort_values(by="jaccard", ascending=False)
top_movies = ranked_by_similarity.head(k_nn)

relevant_ratings = []
for this_movie_id in top_movies["movie"]:
    neighbor = movie_actor_map[this_movie_id]

    # Show each neighbour and the rating it contributes
    print(this_movie_id, neighbor["movie"])
    print("\t", neighbor["rating"])

    relevant_ratings.append(neighbor["rating"])

tt3606756 Incredibles 2
	 7.6


In [34]:
# Compare the target's true rating with the mean rating of its neighbours.
actual_rating = target_movie_object["rating"]
predicted_rating = np.mean(relevant_ratings)

print("Actual Rating:", actual_rating)
print("Predicted Rating:", predicted_rating)

Actual Rating: 8.0
Predicted Rating: 7.6
