# Cluster Actors using a Matrix of Actor x Genre

Create a matrix of actors and the genres in which they've starred. Then, we'll use k-Means to extract clusters from the data in an unsupervised fashion.

In [44]:
%matplotlib inline


In [45]:
import pandas as pd

import json

In [46]:
df = pd.read_csv("movie_data.csv")

actor_name_map = {}
actor_genre_map = {}
movie_actor_map = {}

for _, row in df.iterrows():
    actor_id = row['actor_id']
    genres = [col for col in row.index if row[col] != 0] 

    for genre in genres:
        this_actors_genres = actor_genre_map.get(actor_id, {})
        this_actors_genres[genre] = this_actors_genres.get(genre, 0) + 1
        actor_genre_map[actor_id] = this_actors_genres

    actor_name_map[actor_id] = actor_id 

    movie_actor_map[actor_id] = {
        "genres": genres,
        "actors": {actor_id}, 
        "movie": actor_id 
    }



In [47]:
index = actor_genre_map.keys()

rows = [actor_genre_map[k] for k in index]

df = pd.DataFrame(rows, index=index)

df = df.fillna(0)

df

Unnamed: 0,actor_id,Comedy,Fantasy,Romance,Drama,Mystery,Thriller,Action,Biography,Crime,...,Horror,Documentary,Sport,News,Family,Music,Unnamed: 22,Western,Short,Reality-TV
nm0000212,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0413168,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0000630,1,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0005227,1,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,...,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
nm0864851,1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
nm9504284,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm10592896,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm7216750,1,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nm0936300,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Apply k-Means with a Fixed K

In [48]:
from sklearn.cluster import KMeans

In [49]:
k = 8 # We pick k=8 for illustrative purposes only. You would need a more principled approach here.

In [50]:
cluster_model = KMeans(n_clusters=k)

In [51]:
cluster_model.fit(df)



In [52]:
cluster_labels = cluster_model.predict(df)
actor_cluster_df = pd.DataFrame(cluster_labels, index=df.index, columns=["cluster"])

In [53]:
actor_cluster_df["cluster"].value_counts()

cluster
2    7876
3    6743
1    4995
6    3043
0    2983
5    2819
7    2794
4    2356
Name: count, dtype: int64

In [54]:
for cluster,actors in actor_cluster_df.groupby("cluster"):
    print("Cluster:", cluster, "Size:", actors.shape[0])
    
    for a_id in actors.sample(5).index:
        print("\t", a_id, actor_name_map[a_id])

Cluster: 0 Size: 2983
	 nm0198501 nm0198501
	 nm0621031 nm0621031
	 nm3749108 nm3749108
	 nm0429171 nm0429171
	 nm0674025 nm0674025
Cluster: 1 Size: 4995
	 nm0778660 nm0778660
	 nm5669325 nm5669325
	 nm5153572 nm5153572
	 nm1566474 nm1566474
	 nm12001104 nm12001104
Cluster: 2 Size: 7876
	 nm4279268 nm4279268
	 nm11857981 nm11857981
	 nm0085932 nm0085932
	 nm0198369 nm0198369
	 nm5328659 nm5328659
Cluster: 3 Size: 6743
	 nm1380586 nm1380586
	 nm0929977 nm0929977
	 nm0444687 nm0444687
	 nm6356195 nm6356195
	 nm3444392 nm3444392
Cluster: 4 Size: 2356
	 nm0044762 nm0044762
	 nm0568180 nm0568180
	 nm0001631 nm0001631
	 nm0731075 nm0731075
	 nm0001517 nm0001517
Cluster: 5 Size: 2819
	 nm2003700 nm2003700
	 nm0961737 nm0961737
	 nm3122771 nm3122771
	 nm0273371 nm0273371
	 nm1274752 nm1274752
Cluster: 6 Size: 3043
	 nm0788903 nm0788903
	 nm3165541 nm3165541
	 nm0841910 nm0841910
	 nm5585655 nm5585655
	 nm6563798 nm6563798
Cluster: 7 Size: 2794
	 nm0416551 nm0416551
	 nm0665544 nm0665544
	 nm23