 IR PRACTICAL 5: PAGERANK ALGORITHM IN PYTHON

Step 1: Import Required Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import numpy as np

Step 2: Define Public Web Pages

In [8]:
pages = {
    'A': 'https://www.python.org/',
    'B': 'https://www.djangoproject.com/',
    'C': 'https://flask.palletsprojects.com/',
    'D': 'https://numpy.org/',
    'E': 'https://pandas.pydata.org/'
}

Step 3: Extract Links Using BeautifulSoup

In [9]:
def extract_links(urls):
    links = {page: [] for page in urls}
    for page, url in urls.items():
        try:
            response = requests.get(url, timeout=5)
            soup = BeautifulSoup(response.text, 'html.parser')
            for tag in soup.find_all('a', href=True):
                for target in urls:
                    if urls[target] in tag['href']:
                        links[page].append(target)
        except:
            print(f" Failed to fetch {url}")
    return links

link_graph = extract_links(pages)

Step 4: Build Adjacency Matrix

In [10]:
page_list = list(pages.keys())
n = len(page_list)
adj_matrix = np.zeros((n, n))

for i, src in enumerate(page_list):
    for tgt in link_graph[src]:
        j = page_list.index(tgt)
        adj_matrix[i][j] = 1

# Normalize rows to get transition probabilities
for i in range(n):
    if adj_matrix[i].sum() != 0:
        adj_matrix[i] /= adj_matrix[i].sum()

Step 5: Initialize PageRank Vector

In [11]:
pagerank = np.ones(n) / n
damping = 0.85
threshold = 0.0001
delta = 1

Step 6: Iterative PageRank Computation

In [12]:
while delta > threshold:
    new_rank = (1 - damping) / n + damping * adj_matrix.T.dot(pagerank)
    delta = np.linalg.norm(new_rank - pagerank)
    pagerank = new_rank

Step 7: Display Final PageRank Scores

In [13]:
print("\n Final PageRank Scores:")
for i, page in enumerate(page_list):
    print(f"{page}: {round(pagerank[i], 4)}")


 Final PageRank Scores:
A: 0.2
B: 0.2
C: 0.03
D: 0.0522
E: 0.3478
