%matplotlib inline

In [3]:
%matplotlib inline

In [4]:
import pandas as pd

import json 

import matplotlib.pyplot as plt

In [5]:
# Lookup tables filled in while streaming the movie file:
#   actor_name_map:  actor ID -> actor name
#   movie_actor_map: movie "imdb_id" -> {"movie": title, "actors": set of actor IDs, "genres": genre list}
#   actor_genre_map: actor ID -> {genre: number of that actor's movies listing the genre}
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}

# Input file with one JSON-encoded movie per line.
# NOTE(review): this is an absolute, machine-specific path; consider making it
# relative to a configurable data directory so the notebook runs elsewhere.
MOVIES_PATH = "/Users/trintkillip/Downloads/inst414/imdb_movies_2000to2022.prolific.json"

# Decode explicitly as UTF-8 so non-ASCII names do not depend on the platform's
# default text encoding.
with open(MOVIES_PATH, "r", encoding="utf-8") as in_file:
    for line in in_file:

        # Skip blank lines instead of letting json.loads fail on them
        if not line.strip():
            continue

        # Read the movie on this line and parse the json
        this_movie = json.loads(line)

        # One pass over the cast: record the id->name mapping and add this
        # movie's genres to that actor's running genre counts
        for actor_id, actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name

            # setdefault creates the actor's count dict on first sight and
            # returns the stored dict afterwards, so updates land in the map
            this_actors_genres = actor_genre_map.setdefault(actor_id, {})

            # increment the count of genres for this actor
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1

        # finished with this film
        movie_actor_map[this_movie["imdb_id"]] = {
            "movie": this_movie["title"],
            "actors": {item[0] for item in this_movie['actors']},
            "genres": this_movie["genres"],
        }

In [6]:
# Spot-check: show the accumulated genre counts for a single actor ID
# (the same ID is used as the similarity target further down)
actor_genre_map['nm0413168']

{'Comedy': 7,
 'Fantasy': 3,
 'Romance': 5,
 'Action': 14,
 'Adventure': 11,
 'Sci-Fi': 10,
 'Crime': 6,
 'Thriller': 2,
 'Animation': 4,
 'Drama': 12,
 'Mystery': 5,
 'Biography': 4,
 'Musical': 2,
 'History': 1}

In [7]:
# Build an actor-by-genre count table from actor_genre_map.

# Row labels: every actor ID, in the dict's iteration order
index = actor_genre_map.keys()

# Row data: each actor's {genre: count} dict, in the same order as `index`
rows = list(actor_genre_map.values())

# One row per actor, one column per genre seen. A genre missing from an
# actor's dict comes out as NA, meaning the actor has no movie in that genre,
# so it is replaced by zero.
df = pd.DataFrame(rows, index=index).fillna(0)

df

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,7.0,1.0,6.0,6.0,1.0,2.0,1.0,1.0,2.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,7.0,3.0,5.0,12.0,5.0,2.0,14.0,4.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,8.0,2.0,6.0,14.0,2.0,3.0,4.0,5.0,1.0,1.0,...,3.0,7.0,3.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,10.0,1.0,2.0,2.0,0.0,1.0,1.0,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [8]:
# L1-normalise the rows: divide every genre count by the actor's row total,
# turning each row from raw counts into proportions of that actor's genre tags.

# Total genre count per actor (one value per row)
row_totals = df.sum(axis="columns")

# Divide each row by its own total (align the totals with the row index)
df_norm = df.divide(row_totals, axis="index")

df_norm.head(10)

Unnamed: 0,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,War,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 18,Western,Short,Reality-TV
nm0000212,0.25,0.035714,0.214286,0.214286,0.035714,0.071429,0.035714,0.035714,0.071429,0.035714,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,0.081395,0.034884,0.05814,0.139535,0.05814,0.023256,0.162791,0.046512,0.069767,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,0.112676,0.028169,0.084507,0.197183,0.028169,0.042254,0.056338,0.070423,0.014085,0.014085,...,0.042254,0.098592,0.042254,0.014085,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,0.4,0.04,0.08,0.08,0.0,0.04,0.04,0.0,0.0,0.0,...,0.04,0.0,0.04,0.0,0.08,0.0,0.0,0.0,0.0,0.0
nm0864851,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0828288,0.119403,0.0,0.0,0.059701,0.044776,0.149254,0.014925,0.0,0.044776,0.0,...,0.522388,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0933983,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0329491,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000417,0.119048,0.02381,0.0,0.190476,0.071429,0.095238,0.119048,0.0,0.095238,0.0,...,0.119048,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000603,0.0625,0.03125,0.0625,0.375,0.0625,0.0625,0.0,0.09375,0.0625,0.0,...,0.03125,0.03125,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
# Sanity check: after dividing each row by its total, every row should sum to 1
df_norm.sum(axis="columns").head(20)

nm0000212    1.0
nm0413168    1.0
nm0000630    1.0
nm0005227    1.0
nm0864851    1.0
nm0828288    1.0
nm0933983    1.0
nm0329491    1.0
nm0000417    1.0
nm0000603    1.0
nm0000457    1.0
nm0452288    1.0
nm0001002    1.0
nm0001299    1.0
nm0923529    1.0
nm0936365    1.0
nm0006763    1.0
nm0007113    1.0
nm0310173    1.0
nm0412917    1.0
dtype: float64

In [11]:
#Actor similarity using counts

from scipy.sparse import lil_matrix  #for building the matrix
import scipy.spatial.distance 



In [13]:
# Query: which actors have genre-count rows closest to the target actor's row?
target_actor_id = 'nm0413168'

# The target's row of df (despite the name, these are genre counts, not ratings)
target_actor_ratings = df.loc[target_actor_id]

# Cosine distance from every row of df to the target row. cdist is given a
# single comparison row, so only column 0 of its result is kept.
distances = scipy.spatial.distance.cdist(
    df,
    [target_actor_ratings],
    metric='cosine',
)[:, 0]

# Pair each actor ID with its distance to the target
query_distances = list(zip(df.index, distances))

# Print the ten closest rows, smallest distance first: actor ID, name,
# distance, and that actor's total genre count. The target itself is in df,
# so it is part of the ranking.
closest_first = sorted(query_distances, key=lambda pair: pair[1])
for similar_actor_id, similar_genre_score in closest_first[:10]:
    print(
        similar_actor_id,
        actor_name_map[similar_actor_id],
        similar_genre_score,
        df.loc[similar_actor_id].sum(),
    )


nm0413168 Hugh Jackman 0.0 86.0
nm3592338 Emilia Clarke 0.023038353871383088 20.0
nm1663205 Sharlto Copley 0.03680695049665894 39.0
nm0000375 Robert Downey Jr. 0.04015740464249473 75.0
nm3772243 Theo James 0.04358329306959263 46.0
nm0262635 Chris Evans 0.04805589651668307 74.0
nm0881631 Karl Urban 0.05508543978437186 44.0
nm0159789 Hayden Christensen 0.060195990212304484 46.0
nm1517976 Chris Pine 0.07953201413353528 66.0
nm1475594 Channing Tatum 0.08366839174462504 72.0
