## 3. Actors PageRank And Network Analysis

This notebook implements the Google PageRank algorithm on the acting network, in which two actors are connected when they co-star in the lead, second, or third supporting roles of the same film. Google PageRank is a variant of eigenvector centrality (closely related to Katz centrality): it measures how central a node is to a network by the long-run share of time that a random walk over the network, which occasionally jumps to a random node, spends at that node.

We calculate the PageRank on the unweighted network and on versions of the network whose edges are weighted by each film's IMDb score, revenue, or budget.

In [112]:
import ast
import datetime as dt
import math
import warnings
from collections import Counter
from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Install a blanket "ignore" filter: every warning raised after this point is
# silenced (presumably to keep the cell outputs uncluttered).
warnings.filterwarnings(action='ignore')

In [113]:
# Directory prefix for the CSV files this notebook reads and writes.
data_path = "data/"
m = pd.read_csv(data_path + "movies_postprocessing.csv")
entities = {}

# These columns are stored in the CSV as text holding Python literals;
# parse every cell back into the object it encodes.
for col in [
    "genres",
    "keywords",
    "plot_keywords",
    "all_keywords",
    "production_companies",
    "production_countries",
    "spoken_languages",
]:
    m[col] = pd.Series(list(map(ast.literal_eval, m[col])))

# Narrow view: the three actor-name columns plus the per-film numeric columns.
actors_ = m[
    [
        "actor_1_name",
        "actor_2_name",
        "actor_3_name",
        "budget",
        "profit",
        "revenue",
        'popularity',
    ]
]

## Implementation of PageRank Algorithm

In [223]:
def get_all_actors(actors):
    """Return the distinct values found in the three actor-name columns.

    Parameters
    ----------
    actors : pandas.DataFrame
        Must contain the columns ``actor_1_name``, ``actor_2_name`` and
        ``actor_3_name``.

    Returns
    -------
    list
        Every distinct name once. The order is whatever set iteration
        yields; it is not sorted.
    """
    billing_columns = ["actor_1_name", "actor_2_name", "actor_3_name"]
    distinct_names = set()
    for film_cast in actors[billing_columns].values.tolist():
        distinct_names.update(film_cast)
    return list(distinct_names)

def build_adjacency_matrix(actors, criteria = None):
    """Build the weighted co-starring adjacency matrix for ``actors``.

    Each row of ``actors`` is one film. For every ordered pair of different
    names among its three actor columns, the film's weight is added to the
    matching matrix cell, so both (x, y) and (y, x) receive it.

    Parameters
    ----------
    actors : pandas.DataFrame
        One row per film, with ``actor_1_name``, ``actor_2_name`` and
        ``actor_3_name`` columns.
    criteria : str, optional
        Column whose value is used as the film's weight. When it is not a
        label of the row (including the default ``None``), the weight is 1.

    Returns
    -------
    tuple
        ``(index_by_name, A)`` where ``index_by_name`` maps each actor name
        to its row/column position and ``A`` is the square float matrix.
    """
    index_by_name = {
        name: position for position, name in enumerate(get_all_actors(actors))
    }
    n_actors = len(index_by_name)
    A = np.zeros((n_actors, n_actors))
    role_columns = ("actor_1_name", "actor_2_name", "actor_3_name")

    for _, film in actors.iterrows():
        if criteria in film:
            weight = film[criteria]
        else:
            weight = 1
        cast = tuple(film[column] for column in role_columns)
        for source in cast:
            for target in cast:
                if source == target:
                    # An actor is never linked to themselves.
                    continue
                A[index_by_name[source], index_by_name[target]] += weight
    return index_by_name, A


def build_transition_matrix(A):
    """Normalize adjacency matrix ``A`` column by column.

    The result satisfies ``M[i, j] = A[i, j] / sum_k A[k, j]``, so every
    column with a non-zero total sums to 1. This is the form needed by an
    update of the shape ``M @ v``.

    Parameters
    ----------
    A : array-like of shape (N, N)
        Adjacency matrix; ``A[i, j]`` is the weight of the edge between
        nodes ``i`` and ``j``.

    Returns
    -------
    numpy.ndarray
        Float matrix of the same shape. Columns whose total is zero are left
        as all zeros rather than producing NaN.
    """
    A = np.asarray(A, dtype=float)
    # The totals divide columns below, so they must be column totals (axis=0).
    # Row totals only give the same answer when A happens to be symmetric.
    column_totals = A.sum(axis=0)
    # Avoid 0/0 for nodes without any edge.
    column_totals[column_totals == 0] = 1
    return A / column_totals[None, :]


def pagerank(M, eps=1e-3, d=0.85, max_iter=1000, seed=None, verbose=True):
    """Run the PageRank power iteration ``v <- d * M v + (1 - d) / N``.

    Parameters
    ----------
    M : numpy.ndarray of shape (N, N)
        Transition matrix. It is multiplied by a column vector, so it is
        expected to be column-normalized.
    eps : float
        Iteration stops once the Euclidean distance between two successive
        score vectors is no longer greater than ``eps``.
    d : float
        Damping factor; ``(1 - d) / N`` is added to every score each step.
    max_iter : int
        Upper bound on the number of iterations, so the loop always ends.
    seed : int, optional
        Seed for the random starting vector. ``None`` (default) draws from
        NumPy's global random state; pass a value for reproducible scores.
    verbose : bool
        When true, print the step size after every iteration.

    Returns
    -------
    numpy.ndarray of shape (N, 1)
        The score vector after the last iteration.
    """
    N = M.shape[1]
    if N == 0:
        # Nothing to rank, and (1 - d) / N would divide by zero.
        return np.zeros((0, 1))

    rng = np.random if seed is None else np.random.RandomState(seed)
    v = rng.rand(N, 1)
    # Entries are non-negative, so dividing by the sum makes them sum to 1.
    v = v / v.sum()

    for _ in range(max_iter):
        previous = v
        v = d * np.matmul(M, v) + (1 - d) / N
        # Flatten first so this is a plain vector norm, computed once per step.
        delta = np.linalg.norm((v - previous).ravel())
        if verbose:
            print("norm {}".format(delta))
        # "not >" also stops when delta is NaN instead of spinning to max_iter.
        if not delta > eps:
            break
    return v


def get_pagerank(criteria=None):
    """Compute actor PageRank scores for the notebook-level movie table ``m``.

    Parameters
    ----------
    criteria : str, optional
        Column of ``m`` passed to ``build_adjacency_matrix`` as the edge
        weight; ``None`` uses that function's default weighting.

    Returns
    -------
    tuple
        ``(page_rank, pagerank_nodes, pagerank_val)`` where

        * ``page_rank`` maps actor name -> rank position (1 = highest score),
        * ``pagerank_nodes`` lists actor names by descending score,
        * ``pagerank_val`` maps actor name -> PageRank score.
    """
    actors_dict, A = build_adjacency_matrix(m, criteria=criteria)
    v = pagerank(build_transition_matrix(A))

    # v has shape (N, 1). Index it directly: converting it to a list inside
    # the comprehension would rebuild an N-element list for every actor.
    pagerank_val = {name: v[ix, 0] for name, ix in actors_dict.items()}
    pagerank_nodes = sorted(pagerank_val, key=pagerank_val.get, reverse=True)
    page_rank = {name: rank for rank, name in enumerate(pagerank_nodes, 1)}
    return page_rank, pagerank_nodes, pagerank_val


In [228]:
# Baseline run: no weighting column, so every shared film adds 1 to an edge.
page_rank, pagerank_nodes, pagerank_val = get_pagerank()
# The 15 highest-scoring actors with their scores (pagerank_nodes is already
# ordered by descending score).
list((actor, pagerank_val[actor]) for actor in pagerank_nodes[:15])

norm 0.0189617154619
norm 0.00608246035292
norm 0.00233631462289
norm 0.00107844543387
norm 0.000520839772782


[('Robert De Niro', 0.0031422870147596147),
 ('Morgan Freeman', 0.003083140870185692),
 ('Matt Damon', 0.002733919418055917),
 ('Bruce Willis', 0.0026212259112237796),
 ('Johnny Depp', 0.002359116032360327),
 ('Liam Neeson', 0.002283090710913995),
 ('Denzel Washington', 0.0022365198731424418),
 ('Tom Hanks', 0.0022336919058720105),
 ('Nicolas Cage', 0.0022331866362385574),
 ('Brad Pitt', 0.002218437145573223),
 ('Bill Murray', 0.0021334889836587705),
 ('Steve Buscemi', 0.002107301352486852),
 ('Tom Cruise', 0.0020781124778054),
 ('Will Ferrell', 0.0020715098658035374),
 ('Harrison Ford', 0.0019881745249752374)]

In [225]:
# Edges weighted by each film's "imdb_score" column.
page_rank, pagerank_nodes, pagerank_val = get_pagerank("imdb_score")
# The 15 highest-scoring actors with their scores (pagerank_nodes is already
# ordered by descending score).
list((actor, pagerank_val[actor]) for actor in pagerank_nodes[:15])

norm 0.0190811214447
norm 0.00607211769742
norm 0.00236951540937
norm 0.00109970433603
norm 0.000539961581558


[('Morgan Freeman', 0.0032294475612127354),
 ('Robert De Niro', 0.003210103033728647),
 ('Matt Damon', 0.002906367810935474),
 ('Bruce Willis', 0.002632095318093047),
 ('Johnny Depp', 0.0024491396756034524),
 ('Denzel Washington', 0.0024109586152144522),
 ('Tom Hanks', 0.002410454054772374),
 ('Brad Pitt', 0.0023725438368700094),
 ('Bill Murray', 0.002307930654999723),
 ('Liam Neeson', 0.0022619312271243137),
 ('Nicolas Cage', 0.002250077157331331),
 ('Tom Cruise', 0.0022279635077519823),
 ('Steve Buscemi', 0.002131461484867716),
 ('Will Ferrell', 0.0020805505843765598),
 ('Harrison Ford', 0.00206901250495735)]

In [226]:
# Edges weighted by each film's "revenue" column.
page_rank, pagerank_nodes, pagerank_val = get_pagerank("revenue")
# The 15 highest-scoring actors with their scores (pagerank_nodes is already
# ordered by descending score).
list((actor, pagerank_val[actor]) for actor in pagerank_nodes[:15])

norm 0.020221264724
norm 0.00777470635143
norm 0.00317325703028
norm 0.00164483969442
norm 0.000910173364141


[('Tom Cruise', 0.004341426370967094),
 ('Tom Hanks', 0.004165160425084343),
 ('Morgan Freeman', 0.004108375137132653),
 ('Will Smith', 0.0034673826784268743),
 ('Bruce Willis', 0.003254595768294114),
 ('Scarlett Johansson', 0.003147459413866304),
 ('Robert Downey Jr.', 0.003143313298539969),
 ('Matt Damon', 0.0031075747175251384),
 ('Brad Pitt', 0.0030365232977737047),
 ('Leonardo DiCaprio', 0.0030024422967819997),
 ('Liam Neeson', 0.0028729335269277487),
 ('Johnny Depp', 0.0028162253045744753),
 ('Nicolas Cage', 0.0026939525818893526),
 ('Angelina Jolie Pitt', 0.00268623555690746),
 ('Harrison Ford', 0.0026641722244962586)]

In [227]:
# Edges weighted by each film's "budget" column.
page_rank, pagerank_nodes, pagerank_val = get_pagerank("budget")
# The 15 highest-scoring actors with their scores (pagerank_nodes is already
# ordered by descending score).
list((actor, pagerank_val[actor]) for actor in pagerank_nodes[:15])

norm 0.0197920999876
norm 0.0069743179607
norm 0.00281714390562
norm 0.00142357814082
norm 0.000758515526327


[('Morgan Freeman', 0.003981593282823947),
 ('Johnny Depp', 0.0035572241967924997),
 ('Bruce Willis', 0.003316312176596024),
 ('Tom Cruise', 0.003207117764316171),
 ('Matt Damon', 0.0031817304875636407),
 ('Nicolas Cage', 0.003162170470562861),
 ('Tom Hanks', 0.0031423294243716014),
 ('Brad Pitt', 0.003006578301481497),
 ('Will Smith', 0.002956845230077237),
 ('Steve Buscemi', 0.0028427485537773855),
 ('Robert De Niro', 0.0028033649697696873),
 ('Liam Neeson', 0.0027350555005992688),
 ('Angelina Jolie Pitt', 0.0026999148776034394),
 ('Denzel Washington', 0.0026907162164370125),
 ('Sylvester Stallone', 0.002677860920294705)]

## Building an Edge List of Actors

The edge list is exported to a CSV file so that the network can be graphed in Gephi.

In [191]:
def build_actors_edge_list(actors):
    """Aggregate co-starring pairs into an edge list.

    Every film contributes one edge per unordered pair of different names
    among its three actor columns, in billing order (earlier-billed actor is
    the ``Source``). An actor is never paired with themselves, matching the
    ``x != y`` rule in ``build_adjacency_matrix``. Edges are then summed per
    ``(Source, Target)``.

    Parameters
    ----------
    actors : pandas.DataFrame
        One row per film, with the columns ``actor_1_name``,
        ``actor_2_name``, ``actor_3_name``, ``revenue``, ``budget``,
        ``popularity`` and ``imdb_score``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``(Source, Target)`` with the summed ``revenue``,
        ``budget``, ``popularity`` and ``imdb_score`` of the shared films,
        and ``count`` = number of shared films.
    """
    name_columns = ["actor_1_name", "actor_2_name", "actor_3_name"]
    attribute_columns = ["revenue", "budget", "popularity", "imdb_score"]

    rows = []
    for record in actors[name_columns + attribute_columns].itertuples(index=False):
        attributes = list(record[len(name_columns):])
        # Drop repeated names (keeping billing order) so a film never yields
        # a self-loop or counts the same pair twice.
        cast = []
        for name in record[:len(name_columns)]:
            if name not in cast:
                cast.append(name)
        for ix in range(len(cast)):
            for j in range(ix + 1, len(cast)):
                rows.append([cast[ix], cast[j]] + attributes + [1])

    actors_edge_list = pd.DataFrame(
        rows,
        columns=["Source", "Target"] + attribute_columns + ["count"],
    )
    # Return the aggregated frame; one row per (Source, Target) pair.
    return actors_edge_list.groupby(["Source", "Target"]).sum()

# Build the co-star edge list from the movie table and write it to CSV so it
# can be loaded into Gephi.
edge_list = build_actors_edge_list(m)
edge_list.to_csv(path_or_buf=data_path + "actors_edge_list.csv")