In [2]:
import requests
from bs4 import BeautifulSoup, MarkupResemblesLocatorWarning
import networkx as nx
import warnings

# Suppress warnings from BeautifulSoup
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)

CHROME_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/141.0.0.0 Safari/537.36"
)

start_url = "https://en.wikipedia.org/wiki/Data_science"

# Step 2: Function to extract internal Wikipedia links
def get_links(url):
    try:
        headers = {"User-Agent": CHROME_USER_AGENT}
        response = requests.get(url, headers=headers, timeout=10)

        # Validate the response content
        if not response.ok or not response.text.strip().startswith("<"):
            print(f" Skipping invalid response from {url}")
            return []

        soup = BeautifulSoup(response.text, "html.parser")
        links = set()

        for a_tag in soup.find_all("a", href=True):
            href = a_tag["href"]
            if href.startswith("/wiki/") and not any(prefix in href for prefix in [":", "#", "Main_Page"]):
                links.add("https://en.wikipedia.org" + href)

        return list(links)

    except Exception as e:
        print(f" Error fetching {url}: {e}")
        return []

# Step 3: Crawl a few pages and build directed graph
pages_to_crawl = [start_url]
max_pages = 5
graph = nx.DiGraph()

while pages_to_crawl and len(graph.nodes) < max_pages:
    url = pages_to_crawl.pop(0)
    if url not in graph.nodes:
        print(f" Crawling: {url}")
        links = get_links(url)
        for link in links[:10]:
            graph.add_edge(url, link)
        pages_to_crawl.extend(links[:2])

print("\n Total pages collected:", len(graph.nodes))
print(" Pages found:")
for i, page in enumerate(graph.nodes, 1):
    print(f"{i}. {page}")

# Step 4: Compute PageRank
pagerank_scores = nx.pagerank(graph)

# Step 5: Display results
print("\n PageRank Scores:")
for page, rank in sorted(pagerank_scores.items(), key=lambda x: x[1], reverse=True):
    print(f"{page} : {rank:.5f}")


 Crawling: https://en.wikipedia.org/wiki/Data_science

 Total pages collected: 11
 Pages found:
1. https://en.wikipedia.org/wiki/Data_science
2. https://en.wikipedia.org/wiki/Business
3. https://en.wikipedia.org/wiki/Supervised_learning
4. https://en.wikipedia.org/wiki/Data_cooperative
5. https://en.wikipedia.org/wiki/Open_data
6. https://en.wikipedia.org/wiki/Computational_science
7. https://en.wikipedia.org/wiki/Data_compression
8. https://en.wikipedia.org/wiki/Data_philanthropy
9. https://en.wikipedia.org/wiki/Data_visualization
10. https://en.wikipedia.org/wiki/Algorithm
11. https://en.wikipedia.org/wiki/Joanna_Bryson

 PageRank Scores:
https://en.wikipedia.org/wiki/Business : 0.09156
https://en.wikipedia.org/wiki/Supervised_learning : 0.09156
https://en.wikipedia.org/wiki/Data_cooperative : 0.09156
https://en.wikipedia.org/wiki/Open_data : 0.09156
https://en.wikipedia.org/wiki/Computational_science : 0.09156
https://en.wikipedia.org/wiki/Data_compression : 0.09156
https://en.wikip