In [89]:
import numpy as np
from numpy import linalg as LA
from imp import reload
from numpy import array, identity, ones, nonzero, zeros
from numpy import linalg as LA
import networkx as nx
from collections import defaultdict
import random

file = "top250movies.txt"

In [90]:
def sortPageRank(prdict):
    return sorted(prdict.items(), key = lambda item: item[1], reverse=True)


In [91]:
class pageRank():
    def __init__(self, G, eps, tol, maxiter):
        G = np.array(G)
        n = G.shape[0]
        if n == 0:
            raise ValueError("Error: Matrix G has no nodes.")
        for i in range(n): # normalize columns of G
            ci = LA.norm(G[:,i], 1)
            if ci > 0:
                G[:,i] = G[:,i] / ci
            else: # adjustment for a column of zeros
                G[:,i] = np.ones((n,)) / float(n)
        self.G = G # normalized matrix
        self.eps = eps # probability of jumping to a link on page
        self.size = G.shape[0] # size of matrix
        self.tol = tol # tolerance for power method
        self.maxiter = maxiter # 
        self.movies = defaultdict(list) # initialize the movies attribute

        
    def powermethod(self):
        n = self.size
        
        # Get sparse G (as a list)
        
        #list of lists of index of nonzero elements in each row
        nzre = [np.nonzero(self.G[k,:] > 0)[0] for k in range(n)] 
        
        #list of vectors of nonzero elements in each row
        nzv = [self.eps * self.G[k, nzre[k]] for k in range(n)]
        
        oeps = (1.0 - self.eps) / n
        x = np.ones((n,1)) / float(n) # initial vector
        
        xn1 = LA.norm(x, 1)
        ntol = xn1
        niter = 0
        while ntol > self.tol and niter < self.maxiter :
            xold = x       
            for k in range(n):
                x[k] = nzv[k] @ x[nzre[k]] + oeps
                
            xn1  = LA.norm(x, 1)
            x = x / xn1
            ntol = LA.norm(x - xold, 1)
            
            niter += 1
            
        return sortPageRank({k:float(v) for (k,v) in zip(range(n), x)})
    
    def movieRank(self):
        # Flatten the list of actors and remove duplicates
        actor_list = list(set([a for m in self.movies.values() for a in m]))
        actor_list_sorted = sorted(actor_list)
        
        # Create the adjacency matrix for the actor network
        n = len(actor_list)
        if n == 0:
            return []
        G = zeros((n, n))
        for m in self.movies.values():
            for i, a1 in enumerate(m):
                if a1 in actor_list:
                    for a2 in m[i+1:]:
                        if a2 in actor_list and a1 != a2:
                            G[actor_list.index(a2), actor_list.index(a1)] += 1
        
        # Compute the PageRank vector
        x = LA.solve(identity(n) - self.eps * G, (1.0 - self.eps) / n * ones((n, 1)))
        
        # Sort the actors by PageRank
        actor_ranked = sortPageRank({actor_list_sorted[i]: float(x[i]) for i in range(n)})
        
        return actor_ranked

In [92]:
try:
    with open(file, encoding = "utf-8") as f:
        movies = defaultdict(list)
        for line in f:
            line = line.strip().split('/')
            movie = line[0]
            cast = line[1:]
            movies[movie].extend(cast)
except FileNotFoundError:
    raise IOError(f"Error: Could not find file {file}.")
except ValueError:
    raise ValueError(f"Error: Invalid data in file {file}.")


In [93]:
print("Number of movies: ", len(movies))
print("Number of actors: ", len(set([a for m in movies for a in movies[m]])))
print("Example entry: ", list(movies.items())[random.randint(0, len(movies))])

Number of movies:  250
Number of actors:  14882
Example entry:  ('The Dark Knight (2008)', ['Christian Bale', 'Heath Ledger', 'Aaron Eckhart', 'Michael Caine', 'Maggie Gyllenhaal', 'Gary Oldman', 'Morgan Freeman', 'Monique Gabriela Curnen', 'Ron Dean', 'Cillian Murphy', 'Chin Han', 'Nestor Carbonell', 'Eric Roberts', 'Ritchie Coster', 'Anthony Michael Hall', 'Keith Szarabajka', 'Colin McFarlane', 'Joshua Harto', 'Melinda McGraw', 'Nathan Gamble', 'Michael Vieau', 'Michael Stoyanov', 'William Smillie', 'Danny Goldring', 'Michael Jai White', "Matthew O'Neill", 'William Fichtner', 'Olumiji Olawumi', 'Greg Beam', 'Erik Hellman', 'Beatrice Rosen', 'Vincenzo Nicoli', 'Edison Chen', 'Nydia Rodriguez Terracina', 'Andy Luther', 'James Farruggio', 'Tom McElroy', 'Will Zahrn', 'James Fierro', 'Patrick Leahy', 'Sam Derence', 'Jennifer Knox', 'Patrick Clear', 'Sarah Jayne Dunn', 'Charles Venn', 'Winston Ellis', 'David Dastmalchian', 'Sophia Hinshelwood', 'Keith Kupferer', 'Joseph Luis Caballero', '

In [94]:
# Create adjacency matrix
actors = sorted(set([actor for cast in movies.values() for actor in cast]))
n = len(actors)
A = np.zeros((n, n))

for cast in movies.values():
    for i, actor1 in enumerate(cast):
        if i == 0:
            continue
        else: 
            for j, actor2 in enumerate(cast[:i]):
                if actor1 != actor2:
                    A[actors.index(actor1), actors.index(actor2)] += 1


In [95]:
# Compute PageRank
netw = pageRank(A,0.85,1e-15,15)
print("Top 10 actors by PageRank:")
print(netw.movieRank()[:10])

# Write the ranked movies to a file
with open("rankedMovies.txt", "w") as f:
    for movie, score in netw.movieRank():
        f.write(f"{movie}: {score}\n")

Top 10 actors by PageRank:


ZeroDivisionError: float division by zero

In [None]:
# Print ranked actors
ranked_actors = [actors[i] for i in np.argsort(prv)[::-1]]
print(ranked_actors)