# BFS Crawl on Wikipedia
This notebook performs a **Breadth-First Search (BFS)** crawl starting from a Wikipedia article.
- Uses `requests` and `BeautifulSoup` to fetch and parse HTML.
- Restricts depth to avoid infinite crawling.
- Stores visited links and page titles.

In [1]:
import requests
from bs4 import BeautifulSoup
from collections import deque
import pandas as pd

In [2]:
def get_wiki_links(url):
    """Fetch all valid Wikipedia article links from a given page"""
    try:
        response = requests.get(url)
        if response.status_code != 200:
            return []
        soup = BeautifulSoup(response.text, 'html.parser')
        links = []
        for link in soup.select('a[href^="/wiki/"]'):
            href = link.get('href')
            # Filter out special pages (like Help:, Category:, File:)
            if ':' not in href:
                links.append("https://en.wikipedia.org" + href)
        return list(set(links))
    except Exception as e:
        print("Error:", e)
        return []

In [3]:
def bfs_crawl(start_url, max_depth=2, max_pages=30):
    visited = set()
    queue = deque([(start_url, 0)])
    results = []

    while queue and len(visited) < max_pages:
        url, depth = queue.popleft()
        if url in visited or depth > max_depth:
            continue
        visited.add(url)

        # Fetch page
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            title = soup.find('h1').get_text() if soup.find('h1') else "No Title"
            results.append({"URL": url, "Title": title, "Depth": depth})

            print(f"[Depth {depth}] {title}")

            # Add children links
            if depth < max_depth:
                for link in soup.select('a[href^="/wiki/"]'):
                    href = link.get('href')
                    if ':' not in href:
                        full_url = "https://en.wikipedia.org" + href
                        if full_url not in visited:
                            queue.append((full_url, depth+1))
        except Exception as e:
            print("Failed to crawl:", url, "Error:", e)

    return pd.DataFrame(results)

In [4]:
# Run BFS starting from the 'Artificial Intelligence' Wikipedia page
start_page = "https://en.wikipedia.org/wiki/Artificial_intelligence"
df = bfs_crawl(start_page, max_depth=1, max_pages=15)
df.head()

[Depth 0] No Title


  soup = BeautifulSoup(response.text, 'html.parser')


Unnamed: 0,URL,Title,Depth
0,https://en.wikipedia.org/wiki/Artificial_intel...,No Title,0
