In [1]:
%matplotlib inline

In [24]:
import pandas as pd
import json
import random
import numpy as np

import matplotlib.pyplot as plt

from scipy.sparse import lil_matrix

from sklearn.metrics import jaccard_score
from sklearn.metrics import pairwise_distances

Most Similar Movies to Scream 3

In [3]:
# Build two lookup tables from the JSON-lines movie dump (one JSON object per line):
#   actor_name_map:  actor id -> actor name
#   movie_actor_map: movie id -> {"movie": title, "actors": set of actor ids, "genres": genres}
actor_name_map = {}
movie_actor_map = {}

# Decode as UTF-8 explicitly: the platform default encoding is not UTF-8 everywhere,
# and the actor names include non-ASCII characters.
with open("imdb_movies_2000to2022.prolific.json", encoding="utf-8") as file:
    for line in file:
        movie = json.loads(line)

        # Each entry of movie['actors'] unpacks into an (id, name) pair.
        for actor_id, actor_name in movie['actors']:
            actor_name_map[actor_id] = actor_name

        # Keep only the actor ids, as a set, so casts can be compared with set operations.
        movie_actor_map[movie["imdb_id"]] = {
            "movie": movie["title"],
            "actors": {a[0] for a in movie['actors']},
            "genres": movie["genres"]
        }
            


In [4]:
# Tabulate the movie records: one row per movie, labelled by its movie id.
movie_ids = list(movie_actor_map.keys())
movie_records = list(movie_actor_map.values())
df_1 = pd.DataFrame(movie_records, index=movie_ids)

# Display the row(s) titled exactly "Scream 3" to read off the target's id.
is_scream_3 = df_1["movie"] == "Scream 3"
df_1[is_scream_3]


Unnamed: 0,movie,actors,genres
tt0134084,Scream 3,"{nm0001073, nm0000274, nm0000117, nm0000630}","[Horror, Mystery]"


In [5]:
# Id of the movie every other movie is compared against
# (the "Scream 3" row found in the lookup above).
target_movie_id = "tt0134084"

In [6]:
# Fetch the target movie's record (title, actor-id set, genres) and display it.
target_movie = movie_actor_map[target_movie_id]
target_movie

{'movie': 'Scream 3',
 'actors': {'nm0000117', 'nm0000274', 'nm0000630', 'nm0001073'},
 'genres': ['Horror', 'Mystery']}

In [7]:
# Score every movie against the target by the Jaccard similarity of their casts:
#   (number of shared actors) / (number of distinct actors across both movies)
# NOTE: despite its name, `distances` holds similarities (higher = more alike).
# The target movie itself is included and scores 1.0.
distances = []

target_actors = target_movie["actors"]
for movie in movie_actor_map.values():
    these_actors = movie["actors"]

    numer = len(target_actors & these_actors)
    denom = len(target_actors | these_actors)

    # If both casts are empty the union is empty too; score that pair 0.0
    # instead of raising ZeroDivisionError.
    jaccard_sim = numer / denom if denom else 0.0

    distances.append({
        "movie": movie,
        "similarity": jaccard_sim
    })

In [8]:
# Rank all scored movies, most similar first, and list the top ten with their casts.
ranked_movies = sorted(distances, key=lambda entry: entry["similarity"], reverse=True)
for similar_movie in ranked_movies[:10]:
    record = similar_movie["movie"]
    print(record["movie"], similar_movie["similarity"])
    # Translate each actor id back to a readable name.
    for actor in record["actors"]:
        print("\t", actor_name_map[actor])

Scream 3 1.0
	 Courteney Cox
	 David Arquette
	 Neve Campbell
	 Liev Schreiber
Scream 0.6
	 Melissa Barrera
	 Courteney Cox
	 David Arquette
	 Neve Campbell
Scream 4 0.6
	 Courteney Cox
	 David Arquette
	 Neve Campbell
	 Lucy Hale
The Shrink Is In 0.3333333333333333
	 Carol Kane
	 David Arquette
	 Courteney Cox
	 David James Elliott
Dirty Oil 0.25
	 Neve Campbell
The Curious Case of Curt Flood 0.25
	 Liev Schreiber
Derek Jeter 3K 0.25
	 Liev Schreiber
Money for Nothing: Inside the Federal Reserve 0.25
	 Liev Schreiber
Finding Babel 0.25
	 Liev Schreiber
Gonzaga: The March to Madness 0.25
	 Liev Schreiber


Finding Similar Actors with Dimensionality Reduction

In [10]:
# Re-read the movie dump, this time also tallying genres per actor:
#   actor_name_map:  actor id -> actor name
#   movie_actor_map: movie id -> {"movie": title, "actors": set of actor ids, "genres": genres}
#   actor_genre_map: actor id -> {genre: number of that actor's movies carrying the genre}
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


# Decode as UTF-8 explicitly rather than relying on the platform default encoding;
# the actor names include non-ASCII characters.
with open("imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Read the movie on this line and parse its json
        this_movie = json.loads(line)

        # A single pass over the cast updates both per-actor maps
        for actor_id, actor_name in this_movie['actors']:
            # Add the actor to the id->name map
            actor_name_map[actor_id] = actor_name

            # Fetch this actor's genre tally, creating and registering it on first sight
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})

            # Increment the count of genres for this actor
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {item[0] for item in this_movie['actors']},
            "genres": this_movie["genres"]
        }

In [11]:
# Get all actors as an index for a dataframe
index = actor_genre_map.keys()

# Get the genre-counts for each actor in the index
rows = [actor_genre_map[k] for k in index]

# Create the data frame from these rows, with the actors as index
df = pd.DataFrame(rows, index=index)


df = df.fillna(0)

df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
from sklearn.decomposition import PCA 
from sklearn.decomposition import TruncatedSVD
from sklearn.manifold import TSNE

In [13]:
# Actor-by-genre counts as a plain array; row order matches df.index.
matrix_dense = df.to_numpy()

# Pick the reduction technique. With both flags False the cell falls through to t-SNE.
use_pca = False
use_svd = False

# Fixed seed so the embedding, and every neighbour ranking computed from it,
# is the same on each run. t-SNE is stochastic, and PCA/TruncatedSVD may use
# randomized solvers.
RANDOM_SEED = 42

if use_pca:
    pca = PCA(n_components=16, random_state=RANDOM_SEED)
    pca.fit(matrix_dense)
    matrix_reduced = pca.transform(matrix_dense)

elif use_svd:
    svd = TruncatedSVD(n_components=4, random_state=RANDOM_SEED)
    svd.fit(matrix_dense)
    matrix_reduced = svd.transform(matrix_dense)

else:
    # NOTE(review): t-SNE is primarily a visualisation method; distances in its
    # output space are only a rough guide to similarity.
    tsne = TSNE(n_components=2, random_state=RANDOM_SEED)
    matrix_reduced = tsne.fit_transform(matrix_dense)

In [14]:
# Per-component mean of the reduced coordinates (averaged over all actors).
matrix_reduced.mean(axis=0)

array([1.4775544, 3.1917357], dtype=float32)

In [15]:
# Id of the actor whose nearest neighbours we want; it must be a row label of `df`.
target_actor_id = 'nm0000136'

In [16]:
# Row position of the target actor. `matrix_reduced` was computed from
# df.to_numpy(), so the same position addresses the actor's reduced vector.
# get_loc raises a KeyError naming the id if the actor is not in the table.
query_idx = df.index.get_loc(target_actor_id)
query_idx

79

In [17]:
# Coordinates of the target actor in the reduced space
query_v = matrix_reduced[query_idx]

# Euclidean distance from every actor to the target, held as a one-column table
# whose row number is the actor's position in df.index
distances = pairwise_distances(matrix_reduced, [query_v], metric='euclidean')
distances_df = pd.DataFrame(distances, columns=["distance"])

# Print the 20 closest actors (the target itself comes first at distance 0)
closest = distances_df.sort_values("distance").head(20)
for idx, row in closest.iterrows():
    print(idx, actor_name_map[df.index[idx]], row["distance"])
    
    

79 Johnny Depp 0.0
112 Will Smith 0.3057616
835 Robert Downey Jr. 0.47877106
2918 Zoe Saldana 0.48415738
4343 Michael Peña 0.50169086
1 Hugh Jackman 0.53729737
6897 Channing Tatum 0.55785215
3969 Chris Pine 0.5652015
73 Chris Evans 0.67103255
1737 Carla Gugino 0.67755014
87 Elijah Wood 0.7133202
89 Orlando Bloom 0.8387051
4953 Simon Pegg 0.9645496
675 Laurence Fishburne 1.1975056
7267 Alexander Skarsgård 1.2168293
1886 Pierce Brosnan 1.4158388
2437 Aaron Eckhart 1.5539197
298 Angelina Jolie 1.6143076
1134 Antonio Banderas 1.6409336
992 Patrick Bergin 1.7732458


In [18]:

import scipy.spatial.distance # Needed for calculating pairwise distances

In [19]:
# Rebuild the lookup tables from the movie dump (one JSON object per line):
#   actor_name_map:  actor id -> actor name
#   movie_actor_map: movie id -> {"movie": title, "actors": set of actor ids, "genres": genres}
#   actor_genre_map: actor id -> {genre: number of that actor's movies carrying the genre}
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


# Decode as UTF-8 explicitly rather than relying on the platform default encoding;
# the actor names include non-ASCII characters.
with open("imdb_movies_2000to2022.prolific.json", "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Read the movie on this line and parse its json
        this_movie = json.loads(line)

        # A single pass over the cast updates both per-actor maps
        for actor_id, actor_name in this_movie['actors']:
            # Add the actor to the id->name map
            actor_name_map[actor_id] = actor_name

            # Fetch this actor's genre tally, creating and registering it on first sight
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})

            # Increment the count of genres for this actor
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {item[0] for item in this_movie['actors']},
            "genres": this_movie["genres"]
        }


# Get all actors as an index for a dataframe
index = actor_genre_map.keys()

# Get the genre-counts for each actor in the index
rows = [actor_genre_map[k] for k in index]

# Create the data frame from these rows, with the actors as index.
# Fill NAs with zero, as NA means the actor has not starred in that genre
df = pd.DataFrame(rows, index=index).fillna(0)

# Turn raw counts into each actor's share per genre, so rows sum to 1 and
# prolific and occasional actors become comparable.
# An actor with no genre tallies at all would have a row sum of 0, and 0/0
# gives a NaN row that would break the distance rankings; store such rows as
# all-zero instead.
df_norm = df.divide(df.sum(axis=1), axis=0).fillna(0)
df_norm.head(10)

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,0.25,0.035714,0.214286,0.214286,0.035714,0.071429,0.035714,0.035714,0.071429,0.035714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,0.081395,0.034884,0.05814,0.139535,0.05814,0.023256,0.162791,0.046512,0.069767,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,0.112676,0.028169,0.084507,0.197183,0.028169,0.042254,0.056338,0.070423,0.014085,0.014085,...,0.042254,0.098592,0.042254,0.014085,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,0.4,0.04,0.08,0.08,0.0,0.04,0.04,0.0,0.0,0.0,...,0.04,0.0,0.04,0.0,0.08,0.0,0.0,0.0,0.0,0.0
nm0864851,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0828288,0.119403,0.0,0.0,0.059701,0.044776,0.149254,0.014925,0.0,0.044776,0.0,...,0.522388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0933983,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0329491,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000417,0.119048,0.02381,0.0,0.190476,0.071429,0.095238,0.119048,0.0,0.095238,0.0,...,0.119048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000603,0.0625,0.03125,0.0625,0.375,0.0625,0.0625,0.0,0.09375,0.0625,0.0,...,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
# Id of the actor to find genre-profile neighbours for; used as a row label of `df` / `df_norm`.
target_actor_id = 'nm0000136'

In [26]:
# Raw genre counts of the target actor
target_actor_ratings = df.loc[target_actor_id]

#Generating distances from that actor to all the others
# (cosine distance compares the mix of genres, not how many movies an actor made)
distances = scipy.spatial.distance.cdist(df, [target_actor_ratings], metric="cosine")[:,0]

# An all-zero row has no direction, so its cosine distance can come out as NaN.
# NaN compares False against everything and makes sorted() ordering unreliable,
# so drop those entries before ranking.
query_distances = [
    (actor_id, distance)
    for actor_id, distance in zip(df.index, distances)
    if not np.isnan(distance)
]

#Printing the top ten most similar actors to our target
# Columns: actor id, name, cosine distance, total genre count across the actor's movies
for similar_actor_id, similar_genre_score in sorted(query_distances, key=lambda x: x[1])[:10]:
    print(similar_actor_id, actor_name_map[similar_actor_id], similar_genre_score, df.loc[similar_actor_id].sum())

nm0000136 Johnny Depp 0.0 95.0
nm0785227 Andy Serkis 0.0520202427824783 36.0
nm0705356 Daniel Radcliffe 0.06079347194899609 65.0
nm0503567 Logan Lerman 0.07117513386731456 38.0
nm4043618 Tom Holland 0.07812948566835998 34.0
nm0344435 Ioan Gruffudd 0.08061238345318367 30.0
nm0938950 Benedict Wong 0.08116563075668959 21.0
nm0000621 Kurt Russell 0.08275658274546505 44.0
nm0000226 Will Smith 0.08294504247660517 64.0
nm0001401 Angelina Jolie 0.08434557468711501 72.0


In [27]:
# Genre shares (normalised counts) of the target actor
target_actor_ratings = df_norm.loc[target_actor_id]

#Generating distances from that actor to all the others
distances = scipy.spatial.distance.cdist(df_norm, [target_actor_ratings], metric="euclidean")[:,0]

# Pair each distance with the label of the df_norm row it was computed from.
# A NaN row in df_norm (a 0/0 normalisation) yields a NaN distance, which makes
# sorted() ordering unreliable, so drop those entries before ranking.
query_distances = [
    (actor_id, distance)
    for actor_id, distance in zip(df_norm.index, distances)
    if not np.isnan(distance)
]

#Printing the top ten most similar actors to our target
# Columns: actor id, name, Euclidean distance, total raw genre count from `df`
for similar_actor_id, similar_genre_score in sorted(query_distances, key=lambda x: x[1])[:10]:
    print(similar_actor_id, actor_name_map[similar_actor_id], similar_genre_score, df.loc[similar_actor_id].sum())

nm0000136 Johnny Depp 0.0 95.0
nm0785227 Andy Serkis 0.10513474948885257 36.0
nm0705356 Daniel Radcliffe 0.11148830220385951 65.0
nm0503567 Logan Lerman 0.12185091476621288 38.0
nm0344435 Ioan Gruffudd 0.12278195232928205 30.0
nm0000621 Kurt Russell 0.12462117225517601 44.0
nm0000226 Will Smith 0.1301156295105868 64.0
nm0001401 Angelina Jolie 0.1318812317192192 72.0
nm0000168 Samuel L. Jackson 0.14022083538639854 155.0
nm1083271 Megan Fox 0.14096357492351347 45.0
