In [1]:
import json

import pandas as pd
import numpy as np

In [3]:
actor_name_map = {}
movie_actor_map = {}
actor_genre_map = {}


with open("imdb_movies_2000to2022.prolific.json", "r") as in_file:
    for line in in_file:
        
        # Read the movie on this line and parse its json
        this_movie = json.loads(line)
                    
        # Add all actors to the id->name map
        for actor_id,actor_name in this_movie['actors']:
            actor_name_map[actor_id] = actor_name
            
        # For each actor, add this movie's genres to that actor's list
        for actor_id,actor_name in this_movie['actors']:
            this_actors_genres = actor_genre_map.get(actor_id, {})
            
            # Increment the count of genres for this actor
            for g in this_movie["genres"]:
                this_actors_genres[g] = this_actors_genres.get(g, 0) + 1
                
            # Update the map
            actor_genre_map[actor_id] = this_actors_genres
            
        # Finished with this film
        movie_actor_map[this_movie["imdb_id"]] = ({
            "movie": this_movie["title"],
            "actors": set([item[0] for item in this_movie['actors']]),
            "genres": this_movie["genres"]
        })

In [4]:
cluster_df = pd.read_csv("movie_to_cluster_rating.csv")

In [5]:
cluster_df

Unnamed: 0,movie_id,cluster,rating,raters
0,tt0035423,8,6.4,85923
1,tt0088751,12,5.3,328
2,tt0096056,6,5.6,830
3,tt0113092,3,3.4,829
4,tt0116391,3,6.2,257
...,...,...,...,...
20615,tt9906278,10,0.0,0
20616,tt9906644,13,6.8,835
20617,tt9906844,10,0.0,0
20618,tt9907032,10,0.0,0


Find the probabilities of clusters and now find the most probable cluster 

take the average of the ratings that belong to that cluster 


Or take the average of each cluster


1. Get the cluster probabilities

In [7]:
#cluster_pr_map = {cluster_id:cluster_pr for cluster_id,cluster_pr in (cluster_df["cluster"].value_counts() / cluster_df.shape[0]).items()}
#cluster_pr_df = pd.DataFrame(cluster_df["cluster"].value_counts() / cluster_df.shape[0])

#cluster_pr_df

Unnamed: 0,cluster
6,0.154074
0,0.150194
15,0.085063
13,0.082687
2,0.07289
12,0.071096
1,0.066731
3,0.060136
14,0.043307
8,0.037536


In [8]:
cluster_df.groupby("cluster").mean()

Unnamed: 0_level_0,rating,raters
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4.552502,24081.608008
1,5.502326,26345.518169
2,4.896075,83537.664005
3,5.515968,50720.176613
4,5.425038,39424.616794
5,6.464643,54953.271429
6,5.44073,11503.330186
7,6.048661,29263.581102
8,5.481137,24112.288114
9,4.876823,35051.484375


In [9]:
cluster_df["cluster"].value_counts()

6     3177
0     3097
15    1754
13    1705
2     1503
12    1466
1     1376
3     1240
14     893
8      774
10     761
4      655
11     640
7      635
5      560
9      384
Name: cluster, dtype: int64

In [29]:
cluster_pr_map = {cluster_id:cluster_pr for cluster_id,cluster_pr in (cluster_df["cluster"].value_counts() / cluster_df.shape[0]).items()}
cluster_df["cluster"].value_counts() / cluster_df.shape[0]



6     0.154074
0     0.150194
15    0.085063
13    0.082687
2     0.072890
12    0.071096
1     0.066731
3     0.060136
14    0.043307
8     0.037536
10    0.036906
4     0.031765
11    0.031038
7     0.030795
5     0.027158
9     0.018623
Name: cluster, dtype: float64

Determine for a new movie with a genre, which cluster it would most likely go to

In [30]:
# For each genre, count the number of movies
genre_counts = {}

# For each movie, get its genres and update the genre count
for movie_id in movie_actor_map.keys():
    for genre in movie_actor_map[movie_id]["genres"]:
        genre_counts[genre] = genre_counts.get(genre, 0) + 1
        
genre_prs = []
for genre,g_count in genre_counts.items():
    genre_prs.append((genre, g_count/len(movie_actor_map)))
    
genre_prs_df = pd.DataFrame(genre_prs, columns=["genre", "probability"])
genre_pr_map = {row["genre"]:row["probability"] for idx,row in genre_prs_df.iterrows()}

genre_prs_df.sort_values(by="probability", ascending=False)

Unnamed: 0,genre,probability
5,Drama,0.49258
0,Comedy,0.291804
10,Thriller,0.19418
6,Action,0.181523
3,Horror,0.149224
8,Crime,0.134481
2,Romance,0.12226
7,Adventure,0.080844
9,Mystery,0.074442
4,Sci-Fi,0.051164


First genre

In [46]:
target_genre = "Mystery"

per_cluster_prs = []
for cluster_id,group in cluster_df.groupby("cluster"):

    this_cluster_genre_count = sum([
        1 if target_genre in movie_actor_map[m]["genres"] else 0 
        for m in group["movie_id"]
    ])
    
    # Calculate conditional probability
    pr_genre_given_cluster = this_cluster_genre_count / group.shape[0]
    print("Pr[%s| Cluster %02d]:" % (target_genre, cluster_id), "\t", pr_genre_given_cluster)
    
    # Calculate joint probability
    joint_pr_genre_cluster = pr_genre_given_cluster * group.shape[0] / cluster_df.shape[0]
    print("Pr[%s, Cluster %02d]:" % (target_genre, cluster_id), "\t", joint_pr_genre_cluster)
    per_cluster_prs.append(joint_pr_genre_cluster)

Pr[Mystery| Cluster 00]: 	 0.01517597675169519
Pr[Mystery, Cluster 00]: 	 0.002279340446168768
Pr[Mystery| Cluster 01]: 	 0.11991279069767442
Pr[Mystery, Cluster 01]: 	 0.008001939864209506
Pr[Mystery| Cluster 02]: 	 0.033932135728542916
Pr[Mystery, Cluster 02]: 	 0.002473326867119302
Pr[Mystery| Cluster 03]: 	 0.0
Pr[Mystery, Cluster 03]: 	 0.0
Pr[Mystery| Cluster 04]: 	 1.0
Pr[Mystery, Cluster 04]: 	 0.03176527643064985
Pr[Mystery| Cluster 05]: 	 0.0035714285714285713
Pr[Mystery, Cluster 05]: 	 9.699321047526673e-05
Pr[Mystery| Cluster 06]: 	 0.0
Pr[Mystery, Cluster 06]: 	 0.0
Pr[Mystery| Cluster 07]: 	 0.0
Pr[Mystery, Cluster 07]: 	 0.0
Pr[Mystery| Cluster 08]: 	 0.007751937984496124
Pr[Mystery, Cluster 08]: 	 0.0002909796314258002
Pr[Mystery| Cluster 09]: 	 1.0
Pr[Mystery, Cluster 09]: 	 0.01862269641125121
Pr[Mystery| Cluster 10]: 	 0.006570302233902759
Pr[Mystery, Cluster 10]: 	 0.00024248302618816683
Pr[Mystery| Cluster 11]: 	 0.0
Pr[Mystery, Cluster 11]: 	 0.0
Pr[Mystery| Clust

In [47]:
pr_target_genre = sum(per_cluster_prs)
print("Probability of Target Genre:", pr_target_genre)

Probability of Target Genre: 0.07444228903976721


In [32]:
cluster_posterior_prs = []
for cluster_id,cluster_genre_pr in enumerate(per_cluster_prs):

    pr_cluster_given_genre = cluster_genre_pr / genre_pr_map[target_genre]
    cluster_posterior_prs.append(pr_cluster_given_genre)

    print("Pr[Cluster %02d | %s]:" % (cluster_id, target_genre), "\t", pr_cluster_given_genre) 

Pr[Cluster 00 | Sci-Fi]: 	 0.17345971563981044
Pr[Cluster 01 | Sci-Fi]: 	 0.040758293838862564
Pr[Cluster 02 | Sci-Fi]: 	 0.20379146919431282
Pr[Cluster 03 | Sci-Fi]: 	 0.02464454976303318
Pr[Cluster 04 | Sci-Fi]: 	 0.05308056872037915
Pr[Cluster 05 | Sci-Fi]: 	 0.0
Pr[Cluster 06 | Sci-Fi]: 	 0.061611374407582936
Pr[Cluster 07 | Sci-Fi]: 	 0.0
Pr[Cluster 08 | Sci-Fi]: 	 0.008530805687203791
Pr[Cluster 09 | Sci-Fi]: 	 0.021800947867298578
Pr[Cluster 10 | Sci-Fi]: 	 0.0009478672985781991
Pr[Cluster 11 | Sci-Fi]: 	 0.047393364928909956
Pr[Cluster 12 | Sci-Fi]: 	 0.18862559241706164
Pr[Cluster 13 | Sci-Fi]: 	 0.013270142180094787
Pr[Cluster 14 | Sci-Fi]: 	 0.03033175355450237
Pr[Cluster 15 | Sci-Fi]: 	 0.13175355450236967


In [33]:
pr_cluster_given_genre

0.13175355450236967

In [34]:
poster_cluster_prs_df = pd.DataFrame(cluster_posterior_prs, columns=['posterior_cluster_pr'])
poster_cluster_prs_df["cluster"] = poster_cluster_prs_df.index

poster_cluster_prs_df

Unnamed: 0,posterior_cluster_pr,cluster
0,0.17346,0
1,0.040758,1
2,0.203791,2
3,0.024645,3
4,0.053081,4
5,0.0,5
6,0.061611,6
7,0.0,7
8,0.008531,8
9,0.021801,9


In [35]:
cluster_df.groupby("cluster").mean()

Unnamed: 0_level_0,rating,raters
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4.552502,24081.608008
1,5.502326,26345.518169
2,4.896075,83537.664005
3,5.515968,50720.176613
4,5.425038,39424.616794
5,6.464643,54953.271429
6,5.44073,11503.330186
7,6.048661,29263.581102
8,5.481137,24112.288114
9,4.876823,35051.484375


In [36]:
joined_df = poster_cluster_prs_df.set_index("cluster").join(cluster_df.groupby("cluster").mean())
joined_df

Unnamed: 0_level_0,posterior_cluster_pr,rating,raters
cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.17346,4.552502,24081.608008
1,0.040758,5.502326,26345.518169
2,0.203791,4.896075,83537.664005
3,0.024645,5.515968,50720.176613
4,0.053081,5.425038,39424.616794
5,0.0,6.464643,54953.271429
6,0.061611,5.44073,11503.330186
7,0.0,6.048661,29263.581102
8,0.008531,5.481137,24112.288114
9,0.021801,4.876823,35051.484375


In [37]:
joined_df["posterior_cluster_pr"] * joined_df["rating"]

cluster
0     0.789676
1     0.224265
2     0.997778
3     0.135939
4     0.287964
5     0.000000
6     0.335211
7     0.000000
8     0.046759
9     0.106319
10    0.005305
11    0.207257
12    0.771113
13    0.077831
14    0.183077
15    0.655846
dtype: float64

In [27]:
sum(joined_df["posterior_cluster_pr"] * joined_df["rating"])

4.824340296971503

In [38]:
sum(joined_df["posterior_cluster_pr"] * joined_df["rating"])

4.824340296971503

In [39]:
np.mean([m["rating"] for m in movie_actor_map.values() if target_genre in m["genres"]])

KeyError: 'rating'

In [40]:
target_genre = "Drama"
target_actor = "nm0000120" # Jim Carrey

per_cluster_prs = []
for cluster_id,group in cluster_df.groupby("cluster"):

    this_cluster_genre_count = sum([
        1 if (target_genre in movie_actor_map[m]["genres"] and target_actor in movie_actor_map[m]["actors"]) else 0 
        for m in group["movie_id"]
    ])
    
    # Calculate conditional probability
    pr_genre_actor_given_cluster = this_cluster_genre_count / group.shape[0]
    print("Pr[%s, %s| Cluster %02d]:" % (target_genre, target_actor, cluster_id), "\t", pr_genre_actor_given_cluster)
    
    # Calculate joint probability
    joint_pr_genre_actor_cluster = pr_genre_actor_given_cluster * group.shape[0] / cluster_df.shape[0]
    print("Pr[%s, %s, Cluster %02d]:" % (target_genre, target_actor, cluster_id), "\t", joint_pr_genre_actor_cluster)
    per_cluster_prs.append(joint_pr_genre_actor_cluster)

Pr[Drama, nm0000120| Cluster 00]: 	 0.0
Pr[Drama, nm0000120, Cluster 00]: 	 0.0
Pr[Drama, nm0000120| Cluster 01]: 	 0.0
Pr[Drama, nm0000120, Cluster 01]: 	 0.0
Pr[Drama, nm0000120| Cluster 02]: 	 0.0
Pr[Drama, nm0000120, Cluster 02]: 	 0.0
Pr[Drama, nm0000120| Cluster 03]: 	 0.0
Pr[Drama, nm0000120, Cluster 03]: 	 0.0
Pr[Drama, nm0000120| Cluster 04]: 	 0.0
Pr[Drama, nm0000120, Cluster 04]: 	 0.0
Pr[Drama, nm0000120| Cluster 05]: 	 0.0
Pr[Drama, nm0000120, Cluster 05]: 	 0.0
Pr[Drama, nm0000120| Cluster 06]: 	 0.0
Pr[Drama, nm0000120, Cluster 06]: 	 0.0
Pr[Drama, nm0000120| Cluster 07]: 	 0.0
Pr[Drama, nm0000120, Cluster 07]: 	 0.0
Pr[Drama, nm0000120| Cluster 08]: 	 0.0
Pr[Drama, nm0000120, Cluster 08]: 	 0.0
Pr[Drama, nm0000120| Cluster 09]: 	 0.0
Pr[Drama, nm0000120, Cluster 09]: 	 0.0
Pr[Drama, nm0000120| Cluster 10]: 	 0.0
Pr[Drama, nm0000120, Cluster 10]: 	 0.0
Pr[Drama, nm0000120| Cluster 11]: 	 0.0
Pr[Drama, nm0000120, Cluster 11]: 	 0.0
Pr[Drama, nm0000120| Cluster 12]: 	 0.0


In [41]:
for cluster_id,cluster_genre_pr in enumerate(per_cluster_prs):

    pr_cluster_given_genre = cluster_genre_pr / genre_pr_map[target_genre]

    print("Pr[Cluster %02d | %s, %s]:" % (cluster_id, target_genre, target_actor), "\t", pr_cluster_given_genre)
    

Pr[Cluster 00 | Drama, nm0000120]: 	 0.0
Pr[Cluster 01 | Drama, nm0000120]: 	 0.0
Pr[Cluster 02 | Drama, nm0000120]: 	 0.0
Pr[Cluster 03 | Drama, nm0000120]: 	 0.0
Pr[Cluster 04 | Drama, nm0000120]: 	 0.0
Pr[Cluster 05 | Drama, nm0000120]: 	 0.0
Pr[Cluster 06 | Drama, nm0000120]: 	 0.0
Pr[Cluster 07 | Drama, nm0000120]: 	 0.0
Pr[Cluster 08 | Drama, nm0000120]: 	 0.0
Pr[Cluster 09 | Drama, nm0000120]: 	 0.0
Pr[Cluster 10 | Drama, nm0000120]: 	 0.0
Pr[Cluster 11 | Drama, nm0000120]: 	 0.0
Pr[Cluster 12 | Drama, nm0000120]: 	 0.0
Pr[Cluster 13 | Drama, nm0000120]: 	 0.0
Pr[Cluster 14 | Drama, nm0000120]: 	 0.00019690853598503495
Pr[Cluster 15 | Drama, nm0000120]: 	 0.0


In [42]:
target_cluster = 0

In [43]:
for movie_id in cluster_df[cluster_df["cluster"] == target_cluster].sample(n=10, replace=False)["movie_id"]:
    this_movie = movie_actor_map[movie_id]
    print(movie_id, this_movie["movie"], this_movie["genres"])

tt10635588 A Christmas Wish ['Family']
tt14721454 The Misadventures of the Dunderheads ['Comedy']
tt7104824 Graven Image ['']
tt2023453 Diary of a Wimpy Kid: Dog Days ['Comedy', 'Family']
tt2461150 Masterminds ['Biography', 'Comedy', 'Crime']
tt8870946 A Dim Valley ['Comedy']
tt2093991 Elvis & Nixon ['Comedy', 'History']
tt2022490 Pennin Manathai Thottu ['Musical']
tt8806000 Another Watcher ['']
tt2463288 Walk of Shame ['Comedy']


1. Me

2. Selecting the cluster with the highest probability then the rating of that

3. Sci-Fi: 0.051163918525703206

   Drama: 0.49258001939864215

   Mystery: 0.07444228903976721

4. This makes sense as Drama is the most popular so that would have the highest

   
