In [29]:
from numpy import linalg as LA
from numpy import array, identity, ones, nonzero, zeros
from collections import defaultdict
import random

from PageRank_module import pageRank

file = "top250movies.txt"

In [30]:
try:
    with open(file, encoding = "utf-8") as f:
        movies = defaultdict(list)
        for line in f:
            line = line.strip().split('/')
            movie = line[0]
            cast = line[1:]
            movies[movie].extend(cast)
except FileNotFoundError:
    raise IOError(f"Error: Could not find file {file}.")
except ValueError:
    raise ValueError(f"Error: Invalid data in file {file}.")


In [31]:
print("Number of movies: ", len(movies))
print("Number of actors: ", len(set([a for m in movies for a in movies[m]])))
print("Example entry: ", list(movies.items())[random.randint(0, len(movies))])

Number of movies:  250
Number of actors:  14882
Example entry:  ('V for Vendetta (2005)', ['Natalie Portman', 'Hugo Weaving', 'Stephen Rea', 'Stephen Fry', 'John Hurt', 'Tim Pigott-Smith', 'Rupert Graves', 'Roger Allam', 'Ben Miles', 'Sinéad Cusack', 'Natasha Wightman', 'John Standing', 'Eddie Marsan', 'Clive Ashborn', 'Emma Field-Rayner', 'Ian Burfield', 'Mark Phoenix', 'Alister Mazzotti', 'Billie Cook', 'Guy Henry', 'Cosima Shaw', 'Megan Gay', 'Roderic Culver', 'Tara Hacking', 'Andy Rashleigh', 'Chad Stahelski', 'Antje Rau', 'Amelda Brown', 'Richard Campbell', 'Patricia Gannon', 'Mark Longhurst', 'Simon Holmes', 'Charles Cork', 'John Ringham', 'Oliver Bradshaw', 'Jack Schouten', 'Caoimhe Murdock', 'Juliet Howland', 'Brin Rosser', 'Raife Patrick Burchell', 'Joseph Rye', 'Adrian Finighan', 'Malcolm Sinclair', 'Bradley Steve Ford', 'Madeleine Rakic-Platt', 'Selina Giles', 'Carsten Hayes', 'Derek Hutchinson', 'Martin Savage', 'Grant Burgin', 'Greg Donaldson', 'Imogen Poots', 'Jason Griff

In [32]:
# Create adjacency matrix
actors = sorted(set([actor for cast in movies.values() for actor in cast]))
n = len(actors)
G = zeros((n, n))

# Calculate the adjacency matrix
for cast in movies.values():
    actorsInCast = {actor for actor in cast if actor in actors}
    for actor1 in actorsInCast:
        for actor2 in actorsInCast:
            if actor1 != actor2:
                G[actors.index(actor1), actors.index(actor2)] += 1

In [41]:
# Compute PageRank
netw = pageRank(G, actors, 0.85, 1e-6, 15)
prv = netw.linsolve()


In [43]:
# Write the ranked movies to a file
rankedActors = sorted(zip(actors, prv), key = lambda x: x[1], reverse = True)

print("Top 10 actors:")
for actor, rank in rankedActors[:10]:
    print(f"{actor}: {rank}")
    
print("Bottom 10 actors:")
for actor, rank in rankedActors[-10:]:
    print(f"{actor}: {rank}")
    
name_counts = {}

for cast in movies.values():
    for actor in cast:
        if actor in name_counts:
            name_counts[actor] += 1
        else:
            name_counts[actor] = 1
print("Top 10 actors by number of movies:")
print(sorted(name_counts.items(), key = lambda x: x[1], reverse = True)[:10])


Top 10 actors:
Hendrik von Bültzingslöwen: ('Ümit Çirak', 6.719526945303047e-05)
Hemky Madera: ('Ülkü Duru', 6.719526945303047e-05)
Hedda Hopper: ('Øyvind Hagen-Traberg', 6.719526945303048e-05)
Henry Rowland: ('Özkan Ugur', 6.719526945303045e-05)
Hilary Brown: ('Özgül Arslan', 6.719526945303043e-05)
Hikaru Midorikawa: ('Özge Özberk', 6.719526945303043e-05)
Melora Harte: ('Íñigo Garcés', 5.9664027041767503e-05)
Machiko Washio: ('Émilie Caen', 6.135982699829402e-05)
Hanna Hall: ('Éder Júlio Martins', 6.719526945303056e-05)
Herbert Nelson: ('Çetin Tekindor', 6.719526945303044e-05)
Bottom 10 actors:
Declan Geraghty: ('Aakash Dabhade', 7.218628589338038e-05)
Dave Courvoisier: ('Aadil', 7.348453282767918e-05)
Deirdre Fitzpatrick: ('A.S. Duggal', 7.218628589338036e-05)
Shawn Fogarty: ('A.R. Haysel', 4.9671211217788633e-05)
Gary Riley: ("A.J. O'Connor", 6.788741147545513e-05)
Te'ron A. O'Neal: ('A.B. Lane', 4.7146434662615815e-05)
Madhav Datt: ('A. Marshal Ward', 6.116786137932241e-05)
Frances