In [62]:
import os
import json
from openai import OpenAI
import requests
from dotenv import load_dotenv
from IPython.display import display, Markdown

In [63]:
# Initialize Ollama OpenAI client
OLLAMA_BASE_URL = "http://localhost:11434/v1"
ollama = OpenAI(base_url=OLLAMA_BASE_URL, api_key='ollama')

USE_API = False  # Set to True to use OpenAI API instead of Ollama
if USE_API:
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    openai = OpenAI()
    GET_LINKS_RELEVANT_MODEL = "gpt-4.1-mini"
    CREATE_BROCHURE_MODEL = "gpt-4.1-mini"
else:
    GET_LINKS_RELEVANT_MODEL = "llama3.2"
    CREATE_BROCHURE_MODEL = "gpt-oss"
    

In [64]:
requests.get("http://localhost:11434").content

b'Ollama is running'

In [65]:
from bs4 import BeautifulSoup
import requests

# Standard headers to fetch a website
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

def fetch_website_contents(url):
    """
    Return the title and contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit
    """
    try:
        response = requests.get(url, headers=headers)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL {url}: {e}")
        return None

    soup = BeautifulSoup(response.content, "html.parser")
    title = soup.title.string if soup.title else "No title found"
    if soup.body:
        for body in soup.body(["script", "style", "img", "input"]):
            body.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]
    
def fetch_website_links(url):
    """
    Return the links on the webiste at the given url
    I realize this is inefficient as we're parsing twice! This is to keep the code in the lab simple.
    Feel free to use a class and optimize it!
    """
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.content, "html.parser")
    links = [link.get("href") for link in soup.find_all("a")]
    return [link for link in links if link]

In [66]:
# Define system and user prompts
link_system_prompt = """
You are an expert web content analyzer. Your task is to identify and extract links from a given webpage that are most relevant to the main topic of the page. You are provided with a list of links found on a webpage. Provide only the URLs of the relevant links (such as About page, Company page, Careers/Jobs pages) to include in a brochure about the company without any additional commentary.
You should return the links in a JSON array format as shown below:
{
    "relevant_links": [
        {"type: "About page", "url": "https://example.com/about"},
        {"type: "Careers page", "url": "https://example.com/careers"}
    ]
}
"""

def get_links_user_prompt(url):
    user_prompt = f"""
Here is the URL of the webpage: {url}
Your task is to analyze the links on this page and identify those that are most relevant to the main topic of the page for inclusion in a company brochure.
Please return the relevant links in the specified JSON format.
Do not include Terms of Service, Privacy Policy, email links, or any other unrelated links.
Links:
"""
    links = fetch_website_links(url)
    user_prompt += "\n".join(links)
    return user_prompt

In [67]:
def select_relevant_links(url):
    """
    Select the relevant links to the url provided.
    Args:
        url (str): The url to select relevant links for.
    Returns:
        json: A list of relevant links in JSON format.
    """
    if USE_API:
        response = openai.chat.completions.create(
            model=GET_LINKS_RELEVANT_MODEL,
            messages=[
                {"role": "system", "content": link_system_prompt},
                {"role": "user", "content": get_links_user_prompt(url)}
            ],
            response_format={"type": "json_object"}
        )
    else:
        response = ollama.chat.completions.create(
            model=GET_LINKS_RELEVANT_MODEL,
            messages=[
                {"role": "system", "content": link_system_prompt},
                {"role": "user", "content": get_links_user_prompt(url)}
            ],
            response_format={"type": "json_object"}
        )
    content = response.choices[0].message.content
    try:
        relevant_links = json.loads(content)
    except json.JSONDecodeError:
        print(f"Failed to parse JSON response: {content}")
        return []
    return relevant_links

In [68]:
select_relevant_links("https://huggingface.co")

{'relevant_links': [{'type': 'About page', 'url': 'https://huggingface.co'},
  {'type': 'FAQs/Hub Documentation', 'url': '/docs'},
  {'type': 'GitHub repository', 'url': 'https://github.com/huggingface'}]}

In [69]:
def fetch_page_and_relevant_links(url):
    """
    Fetch the page content and relevant links for the given URL.
    Args:
        url (str): The URL of the webpage.
    Returns:
        str: The content of the webpage and relevant links.
    """
    page_content = fetch_website_contents(url)
    relevant_links = select_relevant_links(url)
    
    result = f"## Webpage Content:\n\n{page_content}\n\n## Relevant Links:\n\n"
    for link in relevant_links.get("relevant_links", []):
        result += f"Link: {link['type']}\n"
        result += fetch_website_contents(link['url'])
        result += "\n\n"
    return result

In [70]:
fetch_page_and_relevant_links("https://huggingface.co")

Error fetching the URL https://join.huggingface.co: HTTPSConnectionPool(host='join.huggingface.co', port=443): Max retries exceeded with url: / (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x116bb2de0>: Failed to resolve 'join.huggingface.co' ([Errno 8] nodename nor servname provided, or not known)"))


TypeError: can only concatenate str (not "NoneType") to str

In [None]:
brochure_system_prompt = """
You are a skilled brochure writer. Your task is to create a compelling brochure for a company based on the provided webpage content and relevant links. Use the information to highlight the company's strengths, values, and offerings in an engaging manner.
The brochure should be well-structured, informative, and persuasive, aiming to attract potential customers or clients.
Respond in markdown format without code blocks.
Include sections such as Introduction, About Us, Services/Products, Careers, and Contact Information where applicable.
"""

In [None]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"""
    You are to create a brochure for {company_name}.
    Using the following webpage content and relevant links, create a compelling brochure for the company.
    Ensure the brochure is well-structured and highlights the company's strengths, values, and offerings.
    Use this information to build a short brochure of the company in markdown format without code blocks.
    """
    user_prompt += fetch_page_and_relevant_links(url)
    return user_prompt[:5000]  # Truncate to first 5000 characters to fit model context

In [None]:
def create_brochure(company_name, url):
    """
    Create a brochure for the given company using its webpage content and relevant links.
    Args:
        company_name (str): The name of the company.
        url (str): The URL of the company's webpage.
    Returns:
        str: The generated brochure in markdown format.
    """
    if USE_API:
        response = openai.chat.completions.create(
            model=CREATE_BROCHURE_MODEL,
            messages=[
                {"role": "system", "content": brochure_system_prompt},
                {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
            ]
        )
    else:
        response = ollama.chat.completions.create(
            model=CREATE_BROCHURE_MODEL,
            messages=[
                {"role": "system", "content": brochure_system_prompt},
                {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
            ]
        )
    brochure = response.choices[0].message.content
    display(Markdown(brochure))

In [None]:
create_brochure("Hugging Face", "https://huggingface.co")

# Hugging Face â€“ The AI community building the future  

Welcome to the worldâ€‘wide hub where researchers, engineers, and creators turn ideas into AI realities. Hugging Face is the openâ€‘source platform that empowers collaboration, acceleration, and innovation across every modalityâ€”from text and image to video, audio, and 3D.

---

## About Us  

- **Mission**: Democratize AI by providing an inclusive, open ecosystem where anyone can host, share, and build upon stateâ€‘ofâ€‘theâ€‘art models, datasets, and applications.  
- **Reach**: Over **1â€¯M+ publicly shared models**, **400k+ AI apps**, and **250k+ datasets** are available right now, continually growing with community contributions.  
- **Philosophy**: Open source, transparency, and speed. All core libraries, tooling, and infrastructure are open, making it easy to move from research to production without friction.  

---

## What We Offer  

### 1. **Model Hub**  
- Host & version **unlimited** public (or private) models.  
- Explore trending models such as *WeiboAI/VibeThinkerâ€‘1.5B*, *moonshotai/Kimiâ€‘K2â€‘Thinking*, and *facebook/sam3*.  
- Seamless integration with ðŸ¤— Transformers, ðŸ¤— Diffusers, and other cuttingâ€‘edge libraries.  

### 2. **Data Hub**  
- Discover and share **250k+ datasets** across NLP, vision, audio, and more.  
- Example datasets: *tensonaut/EPSTEIN_FILES_20K*, *nvidia/PhysicalAIâ€‘Autonomousâ€‘Vehicles*.  

### 3. **Spaces** (AI Apps)  
- Deploy, collaborate on, and try out **400k+ live applications** with instant inference.  
- Trending Spaces: *Qwen Image Edit Camera Control*, *The Smol Training Playbook*, *Depth Anything 3*.  
- Run on Zeroâ€‘Code, CPU, or GPUâ€‘accelerated (MCP) environments without provisioning infrastructure.  

### 4. **Compute & Enterprise**  
- **Paid compute services**: Scale up inference and training with onâ€‘demand GPU clusters.  
- **Enterprise solutions**: Custom hosting, data privacy, dedicated support, and deploymentâ€‘ready AI pipelines.  

---

## Why Choose Hugging Face  

| Value | What It Looks Like |
|-------|--------------------|
| **Speed** | Move from idea to demo in days, not months. |
| **Multimodality** | Support for text, image, video, audio, 3D â€“ all in the same framework. |
| **Collaboration** | Public Gitâ€‘like repos, community discussions, and realâ€‘time feedback. |
| **Openâ€‘Source Stack** | No vendor lockâ€‘in; you own your models, data, and code. |
| **Community** | Millions of researchers, developers, and companies contributing every day. |

---

## Careers  

Join a team thatâ€™s reshaping AI for good. Hugging Face looks for passionate, creative minds in engineering, research, product, and operations.  
- **Current open positions**: Visit the [Careers page](https://huggingface.co/jobs).  
- **Culture**: Diverse, inclusive, & collaborative.  

---

## Get in Touch  

- **Website**: [huggingface.co](https://huggingface.co)  
- **LinkedIn**: [Hugging Face on LinkedIn](https://www.linkedin.com/company/huggingface)  
- **Email**: contact@huggingface.co (general inquiries)  
- **Support**: docs & community forums at [docs.huggingface.co](https://huggingface.co/docs).

Ready to build, share, and accelerate your AI projects?  
**[Sign Up Now](https://huggingface.co/signup)** â€“ the future starts with the first model you publish.