In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import gradio as gr
from urllib.parse import urljoin

def get_title_and_paragraph(url):
    """Fetch ``url`` and return its page title plus formatted paragraph text.

    Paragraphs are the ``<p>`` elements found with
    ``class_='comp mntl-sc-block mntl-sc-block-html'``. Each paragraph is
    written one sentence per line. The nearest preceding ``<h2>`` (its
    ``mntl-sc-block-heading__text`` span) is written once, as a
    ``=== heading ===`` banner, before the first paragraph that follows it.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(title, paragraph)`` tuple of strings. ``title`` is "No Title"
        when the page has no ``<title>`` tag; ``paragraph`` is
        "No Paragraphs Found" when no paragraph matched. If anything raises
        (request failure, non-2xx status, parsing problem) the tuple is
        ``("Error fetching title", "Error: <exception>")`` and no exception
        propagates to the caller.
    """
    import re  # local import: this cell's import block does not bring in `re`

    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    try:
        response = requests.get(url, headers=headers, timeout=5)
        # Surface HTTP errors instead of scraping the error page as content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        paragraphs = soup.find_all('p', class_='comp mntl-sc-block mntl-sc-block-html')

        content = ""
        last_heading = None  # heading tag whose banner was handled most recently
        for p in paragraphs:
            heading_tag = p.find_previous('h2')
            # Compare by identity so each heading is emitted once per section
            # rather than once per paragraph underneath it.
            if heading_tag is not None and heading_tag is not last_heading:
                last_heading = heading_tag
                heading_text = heading_tag.find('span', class_='mntl-sc-block-heading__text')
                if heading_text is not None:
                    content += f"\n=== {heading_text.get_text(strip=True)} ===\n"

            # separator=' ' keeps words apart when they are split across inline tags.
            text = p.get_text(separator=' ', strip=True)
            # Split only on whitespace that follows ., ! or ? so decimals such as
            # "3.5" stay intact and the original terminator is preserved.
            for sentence in re.split(r'(?<=[.!?])\s+', text):
                sentence = sentence.strip()
                if sentence:
                    content += sentence + '\n'
            content += '\n'

        paragraph = content.strip() if content else "No Paragraphs Found"
    except Exception as e:
        # Best effort: the caller renders the failure text instead of crashing.
        title = "Error fetching title"
        paragraph = f"Error: {e}"

    return title, paragraph

def scrape_links(url, num_links):
    """Collect the links on ``url``, enrich the first ``num_links`` and save to Excel.

    Every ``<a href>`` on the page is resolved against ``url``; only
    ``http``/``https`` results are kept, de-duplicated in document order.
    The first ``num_links`` of them are fetched with
    ``get_title_and_paragraph``. The workbook ``scraped_links.xlsx`` is
    written to the current working directory with two sheets: "All Links"
    and "Top N with Content".

    Args:
        url: Page whose links are collected.
        num_links: How many links to enrich and preview. Converted with
            ``int``; a missing or negative value is treated as 0.

    Returns:
        A ``(preview, status)`` tuple of strings. ``preview`` lists each
        enriched link with its title and the first 300 characters of its
        paragraph text. On any failure the tuple is ``("", "❌ Error: ...")``;
        no exception propagates.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    try:
        # Same timeout as the per-link fetch, so a stalled server cannot hang
        # the request indefinitely.
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        for tag in soup.find_all('a', href=True):
            href = tag['href'].strip()
            if not href or href.startswith('#'):
                continue  # empty or same-page anchor
            # urljoin resolves every relative form ("/x", "x.html", "../x",
            # "//host/x") and leaves absolute URLs unchanged.
            absolute = urljoin(url, href)
            if not absolute.lower().startswith(('http://', 'https://')):
                continue  # javascript:, mailto:, tel:, ... cannot be fetched
            links.append(absolute)

        # dict.fromkeys de-duplicates while keeping document order, so the
        # "top N" selection is the same on every run.
        unique_links = list(dict.fromkeys(links))
        limit = max(int(num_links or 0), 0)
        limited_links = unique_links[:limit]

        # Collect title and paragraph for each of the top N links
        enriched_data = []
        for link in limited_links:
            title, paragraph = get_title_and_paragraph(link)
            enriched_data.append({
                'Link': link,
                'Title': title,
                'Paragraph': paragraph
            })

        # Save all unique links to Excel, with enriched content for top N
        all_data = pd.DataFrame({'Links': unique_links})
        enriched_df = pd.DataFrame(enriched_data)
        with pd.ExcelWriter("scraped_links.xlsx") as writer:
            all_data.to_excel(writer, index=False, sheet_name="All Links")
            enriched_df.to_excel(writer, index=False, sheet_name="Top N with Content")

        preview = "\n\n".join([
            f"{i+1}. {item['Link']}\nTitle: {item['Title']}\nParagraph: {item['Paragraph'][:300]}..."
            for i, item in enumerate(enriched_data)
        ])

        return preview, f"✅ Found {len(unique_links)} links.\nSaved as scraped_links.xlsx"

    except Exception as e:
        # Report the failure in the status box instead of raising into the UI.
        return "", f"❌ Error: {e}"

# Build the Gradio UI around scrape_links, then start serving it.
url_input = gr.Textbox(label="Website URL", value="https://www.coursera.org")
count_input = gr.Number(label="Number of Links to Preview", value=10)
preview_output = gr.Textbox(label="Top N Links with Content", lines=20)
status_output = gr.Textbox(label="Status")

demo = gr.Interface(
    fn=scrape_links,
    inputs=[url_input, count_input],      # passed positionally as (url, num_links)
    outputs=[preview_output, status_output],  # filled from the returned (preview, status)
    title="Web Link Scraper with Content",
    description="Enter a website URL and number of links to preview. Each previewed link includes its title and a paragraph. All links saved to Excel.",
)
demo.launch()


ModuleNotFoundError: No module named 'gradio'

In [2]:
!pip install gradio


Collecting gradio
  Downloading gradio-5.30.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.10-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [5]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import gradio as gr
from urllib.parse import urljoin
import re

def clean_and_split_text(text):
    """Normalize whitespace in ``text`` and return it one sentence per line.

    Runs of whitespace (spaces, tabs, newlines) are collapsed to a single
    space, then the text is split after every ``.``, ``!`` or ``?`` that is
    followed by whitespace. A period inside a token such as ``3.14`` is not
    a split point.

    Args:
        text: Raw text to clean. ``None`` and empty strings are accepted.

    Returns:
        The sentences joined with ``"\\n"``, or ``""`` when ``text`` is
        empty, ``None`` or whitespace only.
    """
    if not text:
        return ""
    # Collapse whitespace first so line breaks left over from the HTML source
    # do not survive inside a sentence.
    normalized = re.sub(r'\s+', ' ', text).strip()
    sentences = re.split(r'(?<=[.!?])\s+', normalized)
    cleaned_sentences = [s.strip() for s in sentences if s.strip()]
    return "\n".join(cleaned_sentences)

def get_title_and_paragraph(url):
    """Fetch ``url`` and return its page title plus the readable text found on it.

    Text is harvested from ``<p>``, ``<div>``, ``<span>`` and ``<li>`` tags.
    To keep the same text from being emitted several times, a tag is skipped
    when it still contains a nested ``<p>``/``<div>``/``<li>`` (its text is
    collected from those inner tags instead) or when one of its ancestors
    was already emitted. Text of 30 characters or fewer is discarded. Each
    kept chunk is formatted with ``clean_and_split_text``, and the nearest
    preceding ``<h1>``/``<h2>``/``<h3>`` is written once as a
    ``=== heading ===`` banner before the first chunk that follows it.

    Known trade-off: text sitting directly inside a container next to nested
    block tags is not collected.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(title, paragraph)`` tuple of strings. ``title`` is "No Title"
        when the page has no ``<title>`` tag; ``paragraph`` is
        "No relevant content found." when nothing was kept. If anything
        raises (request failure, non-2xx status, parsing problem) the tuple
        is ``("Error fetching title", "Error: <exception>")`` and no
        exception propagates to the caller.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }
    content_tags = ['p', 'div', 'span', 'li']  # tags whose text is harvested
    block_tags = ['p', 'div', 'li']            # nested presence marks a container
    min_text_length = 30                        # shorter text is treated as noise
    try:
        response = requests.get(url, headers=headers, timeout=10)
        # Surface HTTP errors instead of scraping the error page as content.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        # Get page title
        title_tag = soup.find('title')
        title = title_tag.get_text(strip=True) if title_tag else "No Title"

        content = ""
        emitted_ids = set()   # id() of every tag whose text was already written
        last_heading = None   # heading tag whose banner was handled most recently
        for tag in soup.find_all(content_tags):
            try:
                # A container's text is picked up from its inner block tags.
                if tag.find(block_tags) is not None:
                    continue
                # Inline descendants of an emitted tag would repeat its text.
                if any(id(parent) in emitted_ids for parent in tag.parents):
                    continue

                text = tag.get_text(separator=' ', strip=True)
                if not text or len(text) <= min_text_length:
                    continue  # Filter out short/noisy text
                emitted_ids.add(id(tag))

                # Emit the banner only when content follows it, and compare by
                # identity so each heading appears once per section.
                heading_tag = tag.find_previous(['h1', 'h2', 'h3'])
                if heading_tag is not None and heading_tag is not last_heading:
                    last_heading = heading_tag
                    heading_text = heading_tag.get_text(strip=True)
                    if heading_text:
                        content += f"\n=== {heading_text} ===\n"

                content += clean_and_split_text(text) + "\n\n"
            except Exception:
                continue  # Deliberate best effort: one bad tag must not drop the page

        paragraph = content.strip() if content else "No relevant content found."

    except Exception as e:
        # Best effort: the caller renders the failure text instead of crashing.
        title = "Error fetching title"
        paragraph = f"Error: {e}"

    return title, paragraph

def scrape_links(url, num_links):
    """Collect the links on ``url``, enrich the first ``num_links`` and save to Excel.

    Every ``<a href>`` on the page is resolved against ``url``; only
    ``http``/``https`` results are kept, de-duplicated in document order.
    The first ``num_links`` of them are fetched with
    ``get_title_and_paragraph``. The workbook ``scraped_links.xlsx`` is
    written to the current working directory with two sheets: "All Links"
    and "Top N with Content".

    Args:
        url: Page whose links are collected.
        num_links: How many links to enrich and preview. Converted with
            ``int``; a missing or negative value is treated as 0.

    Returns:
        A ``(preview, status)`` tuple of strings. ``preview`` lists each
        enriched link with its title and the first 300 characters of its
        paragraph text. On any failure the tuple is ``("", "❌ Error: ...")``;
        no exception propagates.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0'
    }

    try:
        # Same timeout as the per-link fetch, so a stalled server cannot hang
        # the request indefinitely.
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        links = []
        for tag in soup.find_all('a', href=True):
            href = tag['href'].strip()
            if not href or href.startswith('#'):
                continue  # empty or same-page anchor
            # urljoin resolves every relative form ("/x", "x.html", "../x",
            # "//host/x") and leaves absolute URLs unchanged.
            absolute = urljoin(url, href)
            if not absolute.lower().startswith(('http://', 'https://')):
                continue  # javascript:, mailto:, tel:, ... cannot be fetched
            links.append(absolute)

        # dict.fromkeys de-duplicates while keeping document order, so the
        # "top N" selection is the same on every run.
        unique_links = list(dict.fromkeys(links))
        limit = max(int(num_links or 0), 0)
        limited_links = unique_links[:limit]

        # Collect title and paragraph for each of the top N links
        enriched_data = []
        for link in limited_links:
            title, paragraph = get_title_and_paragraph(link)
            enriched_data.append({
                'Link': link,
                'Title': title,
                'Paragraph': paragraph
            })

        # Save all unique links to Excel, with enriched content for top N
        all_data = pd.DataFrame({'Links': unique_links})
        enriched_df = pd.DataFrame(enriched_data)
        with pd.ExcelWriter("scraped_links.xlsx") as writer:
            all_data.to_excel(writer, index=False, sheet_name="All Links")
            enriched_df.to_excel(writer, index=False, sheet_name="Top N with Content")

        preview = "\n\n".join([
            f"{i+1}. {item['Link']}\nTitle: {item['Title']}\nParagraph: {item['Paragraph'][:300]}..."
            for i, item in enumerate(enriched_data)
        ])

        return preview, f"✅ Found {len(unique_links)} links.\nSaved as scraped_links.xlsx"

    except Exception as e:
        # Report the failure in the status box instead of raising into the UI.
        return "", f"❌ Error: {e}"

# Build the Gradio UI around scrape_links, then start serving it.
url_input = gr.Textbox(label="Website URL", value="https://www.coursera.org")
count_input = gr.Number(label="Number of Links to Preview", value=10)
preview_output = gr.Textbox(label="Top N Links with Content", lines=20)
status_output = gr.Textbox(label="Status")

demo = gr.Interface(
    fn=scrape_links,
    inputs=[url_input, count_input],      # passed positionally as (url, num_links)
    outputs=[preview_output, status_output],  # filled from the returned (preview, status)
    title="Enhanced Web Link Scraper",
    description="Enter a website URL and number of links to preview. Scrapes titles + multi-tag paragraphs + headings. Saves all to Excel.",
)
demo.launch()


It looks like you are running Gradio on a hosted a Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://7b163b72256795b64e.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


