In [3]:
%matplotlib inline

In [4]:
import pandas as pd 

import json

In [5]:
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}

with open('imdb_movies_2000to2022.prolific.json') as file: 
    for line in file:
        
        #Go through each movie on the line and parse the json
        this_movie = json.loads(line)
        
        #add all actors to the id-name map
        for actor_id,actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name 
            
        #for each actor, add this movie's genre to that actor's list
        for actor_id, actor_name in this_movie['actors']:
            this_actors_genres = actor_genre_map.get(actor_id, {})
            
            for g in this_movie['genres']:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1
                
            actor_genre_map[actor_id] = this_actors_genres
            
        movie_actor_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": set([item[0] for item in this_movie['actors']]),
            "genres": this_movie["genres"]
        })

In [6]:
movie_actor_map

{'tt0035423': {'movie': 'Kate & Leopold',
  'actors': {'nm0000212', 'nm0000630', 'nm0005227', 'nm0413168'},
  'genres': ['Comedy', 'Fantasy', 'Romance']},
 'tt0088751': {'movie': 'The Naked Monster',
  'actors': {'nm0329491', 'nm0828288', 'nm0864851', 'nm0933983'},
  'genres': ['Comedy', 'Horror', 'Sci-Fi']},
 'tt0096056': {'movie': 'Crime and Punishment',
  'actors': {'nm0000417', 'nm0000457', 'nm0000603', 'nm0452288'},
  'genres': ['Drama']},
 'tt0113092': {'movie': 'For the Cause',
  'actors': {'nm0001002', 'nm0001299', 'nm0923529', 'nm0936365'},
  'genres': ['Action', 'Adventure', 'Drama']},
 'tt0116391': {'movie': 'Gang',
  'actors': {'nm0006763', 'nm0007113', 'nm0310173', 'nm0412917'},
  'genres': ['Action', 'Crime', 'Drama']},
 'tt0117461': {'movie': 'Remembering Mario',
  'actors': {'nm0001123',
   'nm0269451',
   'nm0321320',
   'nm0803138',
   'nm0803374',
   'nm0898634',
   'nm1142237',
   'nm1315804'},
  'genres': ['Comedy', 'Romance']},
 'tt0117743': {'movie': 'Still Water

In [7]:
actor_genre_map['nm0413168']

{'Comedy': 7,
 'Fantasy': 3,
 'Romance': 5,
 'Action': 14,
 'Adventure': 11,
 'Sci-Fi': 10,
 'Crime': 6,
 'Thriller': 2,
 'Animation': 4,
 'Drama': 12,
 'Mystery': 5,
 'Biography': 4,
 'Musical': 2,
 'History': 1}

In [8]:
index = actor_genre_map.keys()

rows = [actor_genre_map[k] for k in index]

df = pd.DataFrame(rows, index=index)

df = df.fillna(0)

df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.cluster import KMeans


In [11]:
k = 8

cluster_model = KMeans(n_clusters=k)

cluster_model.fit(df)

  super()._check_params_vs_input(X, default_n_init=10)


In [12]:
cluster_labels = cluster_model.predict(df)
actor_cluster_df = pd.DataFrame(cluster_labels, index=df.index, columns=["cluster"])

actor_cluster_df["cluster"].value_counts()

0    29674
2     2315
1      809
7      253
4      216
3      188
6      141
5       13
Name: cluster, dtype: int64

In [13]:
for cluster,actors in actor_cluster_df.groupby("cluster"):
    print("Cluster:", cluster, "Size:", actors.shape[0])
    
    for a_id in actors.sample(5).index:
        print("\t", a_id, actor_name_map[a_id])

Cluster: 0 Size: 29674
	 nm0120410 Laura Bell Bundy
	 nm5065920 Julia Goldani Telles
	 nm2451535 Lisa Davis
	 nm3684453 Chloe Blue
	 nm5254895 Petra Gocheva
Cluster: 1 Size: 809
	 nm0358922 Emily Hampshire
	 nm0001523 Natascha McElhone
	 nm1789970 Addison Timlin
	 nm0662504 Molly Parker
	 nm0199215 Hugh Dancy
Cluster: 2 Size: 2315
	 nm0885840 Emily VanCamp
	 nm0601041 Christina Moore
	 nm4103976 Jonny Weston
	 nm0300824 Kevin Gage
	 nm0192505 Kaley Cuoco
Cluster: 3 Size: 188
	 nm0001709 Til Schweiger
	 nm1265067 50 Cent
	 nm0700712 Dominic Purcell
	 nm0001173 Aaron Eckhart
	 nm0518085 Kristanna Loken
Cluster: 4 Size: 216
	 nm0000701 Kate Winslet
	 nm0252961 Idris Elba
	 nm0583951 Robert Miano
	 nm0001367 C. Thomas Howell
	 nm0004742 Maria Bello
Cluster: 5 Size: 13
	 nm0005458 Jason Statham
	 nm0000616 Eric Roberts
	 nm0474774 Akshay Kumar
	 nm0000115 Nicolas Cage
	 nm0001803 Danny Trejo
Cluster: 6 Size: 141
	 nm0001643 Linnea Quigley
	 nm0347149 Sienna Guillory
	 nm4820312 Breana Mitch