# Project task 04:  Restaurant ranking

In [1]:
import numpy as np
import scipy.sparse as sp

The goal of this task is to rank restaurants using the **PageRank** algorithm. You are given a directed weighted graph where each node represents one restaurant. The edges in this graph are based on users reviews.

Additionally for each restaurant you are given the categories it belongs to, i.e. 'Mexican', 'Italian', etc. Note that each restaurant can belong to multiple categories.

Considering these categories as topics you will perform **topic-sensitive PageRank**, enabling you to e.g. find the top 10 'Mexican' restaurants.

## 1. Load data

* The graph is stored as a sparse adjacency matrix $A$
* The categories are stored in a binary sparse matrix $C$, with $C_{ij}=1$ indicating that restaurant $i$ belongs to category $j$
* We also provide you with a dictionary mapping each category to its corresponding column index in $C$
* The name of each restaurant is provided as a list, with the i-th element in the list corresponding to the i-th node in the graph

In [2]:
A = sp.load_npz('restaurant_graph.npz')
A

<7073x7073 sparse matrix of type '<class 'numpy.float64'>'
	with 1682844 stored elements in Compressed Sparse Row format>

In [3]:
#debug. assert that A is column stochastic as we expect
row_sum_A = np.sum(A,axis = 1)
col_sum_A = np.sum(A,axis = 0)


In [4]:
C = sp.load_npz('restaurant_categories.npz')
C

<7073x138 sparse matrix of type '<class 'numpy.float64'>'
	with 19047 stored elements in Compressed Sparse Row format>

In [5]:
categories = np.load('categories.npy',allow_pickle=True).tolist()
categories['Mexican'], categories['Chinese']

(3, 14)

In [6]:
names = np.load('restaurant_names.npy')
names[:3]

array(['Alize Catering', 'Chula Taberna Mexicana', 'Sunnyside Grill'],
      dtype='<U50')

In [7]:
assert A.shape[0] == len(names) == C.shape[0]
assert C.shape[1] == len(categories)

 ## 2. Determine the teleport set
 

Given a list of topics of intereset, i.e. `['Mexican', 'Italian', ...]`, implement a helper function to return all the restaurants that belong to **at least one** of these topics. These restaurants will become part of the teleport set in topic-sensitive PageRank.

In [8]:
def teleport_set(C, topics, categories):
    """
    Finds the teleport set consisting of restaurants that belong to at least one of the specified topics.
    
    Parameters
    ----------
    C             : sp.spmatrix, shape [num_restaurants, num_categories]
                    Binary matrix encoding which restaurants belongs to which categories.
    topics        : List[string]
                    List of topics of interest.
    categories    : dict(string, int)
                    Dictionary mapping each category to its corresponding column index in C.
        
    Returns
    -------
    teleport_idx : np.array, shape [S]
                   The indicies of the nodes in the teleport set.
    """
    
    #code for determining index of categories:
    categories_idx = [categories[i] for i in topics]
    categories_onehot = np.zeros(C.shape[1])
    categories_onehot [categories_idx] =1
    
    #print('shape of C an dof categories_onehot: %s  ,  %s'%(str(C.shape),str(categories_onehot.reshape(1,-1).shape)))
    teleport_idx = C .multiply( categories_onehot.reshape(1,-1)   )
    teleport_idx = np.nonzero(teleport_idx)[0] #interested only in the first dimension


    return teleport_idx

In [9]:
dummy = teleport_set(C,['Italian','Mexican'],categories)
print(dummy.shape)



(806,)


 ## 3. Implement topic-sensitive PageRank

In [10]:
def page_rank(A, beta, teleport_idx=None, eps=1e-12):
    """
    Implements topic-sensitive PageRank using power iteration and sparse matrix operations.
    
    Parameters
    ----------
    A           : sp.spmatrix, shape [num_restaurants, num_restaurants]
                  The adjacency matrix representing the graph of restaurants.
    beta        : float, 
                  0 < beta < 1, (1-beta) is the probabilty of teleporting to the nodes in the teleport set
    teleport_idx: np.array, shape [S]
                  The indicies of the nodes in the teleport set. If it equals to None
                  it means runs standard PageRank, i.e. all nodes are in the teleport set.
    
    Returns
    -------
    r          : np.array, shape [num_restaurants]
                 The page rank vector containing the page rank scores for each restaurant.
    """
    #first create teleport vector
    num_restaurants = A.shape[0]
    if teleport_idx is not None:
        teleport = np.zeros(num_restaurants)
        teleport[teleport_idx] = 1.0/teleport_idx.shape[0]
    else:
        teleport = np.full(num_restaurants, 1.0/num_restaurants)
    
    
    #start with a random normalised vector. Sampling distribution should not matter:
    r_old = np.random.rand(num_restaurants)
    r_old = r_old/np.linalg.norm(r_old)
    
    #currently A is not a stochaastic matrix,
    # it is just a weighted matrix that needs to be normalised
    #lets do that!
    A= A.multiply((1./np.sum(A, axis = 0).reshape(1,-1)))
    
    r = beta *( A.dot(r_old)) + (1-beta) * teleport
    
    # if teleport_idx is not None:
    #     print('Debug.shapes of teleport , r, r_old, A ,teleport_idx = %s,%s,%s,%s,%s'%
    #           (str(teleport.shape),str(r.shape),str(r_old.shape),str(A.shape),str(teleport_idx.shape)))
    # else:
    #     print('Debug.shapes of teleport , r, r_old, A  = %s,%s,%s,%s'%
    #           (str(teleport.shape),str(r.shape),str(r_old.shape),str(A.shape)))
    
    while np.linalg.norm(r-r_old)>eps:
        #print(np.linalg.norm(r_old))
        r_old = r
        r = beta *( A.dot(r_old)) + (1.0-beta) * teleport
        
    
    return r

### 3.1 Calculate the standard PageRank scores and print the names of the top 5 restaurants overall

In [11]:
idx_to_category = {v:k for k, v in categories.items()}

In [14]:
r = page_rank(A=A, beta=0.6, teleport_idx=None)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Congee Me 
  Categories:  ['Korean']
2 Go Go China 
  Categories:  ['Chinese']
3 Sushi Making For the Soul 
  Categories:  ['Japanese']
4 Spring Rolls 
  Categories:  ['African']
5 Happy Tummy Filipino Cuisine 
  Categories:  ['Chinese']


### 3.2 Calculate the topic-sensitive PageRank scores and print the names of top 5 Mexican restaurants

In [15]:
teleport_idx = teleport_set(C, ['Mexican'], categories)
r = page_rank(A=A, beta=0.6, teleport_idx=teleport_idx)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Chill 
  Categories:  ['Mexican']
2 El Taquito 
  Categories:  ['Mexican']
3 The Atlantic 
  Categories:  ['Fast Food', 'Mexican']
4 Burrito Loco 
  Categories:  ['Mexican']
5 El Takito 
  Categories:  ['Mexican']


### 3.3 Calculate the topic-sensitive PageRank scores and print the names of top 5 Italian or French restaurants


In [16]:
teleport_idx = teleport_set(C, ['Italian', 'French'], categories)
r = page_rank(A=A, beta=0.6, teleport_idx=teleport_idx)

for i, x in enumerate(r.argsort()[-5:]):
    print(i+1, names[x], '\n  Categories: ', [idx_to_category[cat] for cat in C[x].nonzero()[1]])

1 Ali Baba's Middle Eastern Cuisine 
  Categories:  ['Sandwiches', 'Pizza', 'Italian']
2 New May Hong Yuen BBQ 
  Categories:  ['Italian']
3 Sunnyside Café 
  Categories:  ['French']
4 IPho Vietnamese Cuisine 
  Categories:  ['Italian']
5 McDonald's 
  Categories:  ['Italian']
