In [33]:
# imports
# If these fail, please check you're running from an 'activated' environment with (llms) in the command prompt

import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display


In [36]:
import google.generativeai as genai
import os
from dotenv import load_dotenv

# Load environment variables
load_dotenv()

# Set Gemini API key
genai.configure(api_key=os.getenv("GEMINIAI_API_KEY"))  # Fixed typo

# Use the faster model
model = genai.GenerativeModel("gemini-1.5-flash")  # Use a better model

# Generate response
response = model.generate_content("Hello, how are you?")

print(response.text)


I am doing well, thank you for asking!  How are you today?



In [37]:
# A class to represent a Webpage

# Some websites need you to use proper headers when fetching them:
headers = {
 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

In [38]:
ed = Website("https://edwarddonner.com")
ed.links

['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/12/21/llm-resources-superdatascience/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'https://edwarddonner.com/2024/11/13/llm-engineering-resources/',
 'ht

In [39]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]
    
}
"""

In [40]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page": "url": "https://another.full.url/careers"}
    ]

}



In [41]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    
    return user_prompt

In [42]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2025/01/23/llm-workshop-hands-on-with-agents-resources/
https://edwarddonner.com/2024/12/21/

In [46]:
def get_links(url):
    website = Website(url)

    # Load the Gemini model
    model = genai.GenerativeModel("gemini-1.5-flash")

    # Generate a response using plain text input (not OpenAI-style "role" messages)
    response = model.generate_content(
        f"{link_system_prompt}\n\n{get_links_user_prompt(website)}",
        generation_config={"response_mime_type": "application/json"}
    )

    # Parse JSON response
    return json.loads(response.text)


In [47]:
# Anthropic has made their site harder to scrape, so I'm using HuggingFace..

huggingface = Website("https://huggingface.co")
huggingface.links

['/',
 '/models',
 '/datasets',
 '/spaces',
 '/posts',
 '/docs',
 '/enterprise',
 '/pricing',
 '/login',
 '/join',
 '/spaces',
 '/models',
 '/deepseek-ai/DeepSeek-V3-0324',
 '/Qwen/Qwen2.5-Omni-7B',
 '/Qwen/Qwen2.5-VL-32B-Instruct',
 '/ds4sd/SmolDocling-256M-preview',
 '/manycore-research/SpatialLM-Llama-1B',
 '/models',
 '/spaces/enzostvs/deepsite',
 '/spaces/ByteDance/InfiniteYou-FLUX',
 '/spaces/starvector/starvector-1b-im2svg',
 '/spaces/Qwen/Qwen2.5-Omni-7B-Demo',
 '/spaces/3DAIGC/LHM',
 '/spaces',
 '/datasets/nvidia/Llama-Nemotron-Post-Training-Dataset-v1',
 '/datasets/glaiveai/reasoning-v1-20m',
 '/datasets/a-m-team/AM-DeepSeek-R1-Distilled-1.4M',
 '/datasets/PixelAI-Team/TalkBody4D',
 '/datasets/FreedomIntelligence/medical-o1-reasoning-SFT',
 '/datasets',
 '/join',
 '/pricing#endpoints',
 '/pricing#spaces',
 '/pricing',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/enterprise',
 '/allenai',
 '/facebook',
 '/amazon',
 '/google

In [48]:
get_links("https://huggingface.co")

{'links': [{'type': 'about page', 'url': 'https://huggingface.co'},
  {'type': 'Company page', 'url': 'https://huggingface.co/enterprise'},
  {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}]}

In [49]:
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links["links"]:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

In [50]:
print(get_all_details("https://huggingface.co"))

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'company page', 'url': 'https://huggingface.co/'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}]}
Landing page:
Webpage Title:
Hugging Face – The AI community building the future.
Webpage Contents:
Hugging Face
Models
Datasets
Spaces
Posts
Docs
Enterprise
Pricing
Log In
Sign Up
The AI community building the future.
The platform where the machine learning community collaborates on models, datasets, and applications.
Explore AI Apps
or
Browse 1M+ models
Trending on
this week
Models
deepseek-ai/DeepSeek-V3-0324
Updated
4 days ago
•
79.1k
•
2.07k
Qwen/Qwen2.5-Omni-7B
Updated
23 minutes ago
•
44.8k
•
914
Qwen/Qwen2.5-VL-32B-Instruct
Updated
5 days ago
•
127k
•
281
ds4sd/SmolDocling-256M-preview
Updated
8 days ago
•
54.1k
•
1.06k
manycore-research/SpatialLM-Llama-1B
Updated
10 days ago
•
12.1k
•
814
Browse 1M+ models
Spaces
Running
679
679
DeepSite
🐳
Imagine and Share in 1

In [51]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

# Or uncomment the lines below for a more humorous brochure - this demonstrates how easy it is to incorporate 'tone':

# system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
# and creates a short humorous, entertaining, jokey brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
# Include details of company culture, customers and careers/jobs if you have the information."


In [52]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt

In [53]:
get_brochure_user_prompt("HuggingFace", "https://huggingface.co")

Found links: {'links': [{'type': 'about page', 'url': 'https://huggingface.co/'}, {'type': 'models page', 'url': 'https://huggingface.co/models'}, {'type': 'datasets page', 'url': 'https://huggingface.co/datasets'}, {'type': 'spaces page', 'url': 'https://huggingface.co/spaces'}, {'type': 'enterprise page', 'url': 'https://huggingface.co/enterprise'}, {'type': 'pricing page', 'url': 'https://huggingface.co/pricing'}, {'type': 'careers page', 'url': 'https://apply.workable.com/huggingface/'}, {'type': 'documentation', 'url': 'https://huggingface.co/docs'}, {'type': 'blog', 'url': 'https://huggingface.co/blog'}, {'type': 'community forum', 'url': 'https://discuss.huggingface.co/'}, {'type': 'status page', 'url': 'https://status.huggingface.co/'}]}


'You are looking at a company called: HuggingFace\nHere are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\nLanding page:\nWebpage Title:\nHugging Face – The AI community building the future.\nWebpage Contents:\nHugging Face\nModels\nDatasets\nSpaces\nPosts\nDocs\nEnterprise\nPricing\nLog In\nSign Up\nThe AI community building the future.\nThe platform where the machine learning community collaborates on models, datasets, and applications.\nExplore AI Apps\nor\nBrowse 1M+ models\nTrending on\nthis week\nModels\ndeepseek-ai/DeepSeek-V3-0324\nUpdated\n4 days ago\n•\n79.1k\n•\n2.07k\nQwen/Qwen2.5-Omni-7B\nUpdated\n23 minutes ago\n•\n44.8k\n•\n914\nQwen/Qwen2.5-VL-32B-Instruct\nUpdated\n5 days ago\n•\n127k\n•\n281\nds4sd/SmolDocling-256M-preview\nUpdated\n8 days ago\n•\n54.1k\n•\n1.06k\nmanycore-research/SpatialLM-Llama-1B\nUpdated\n10 days ago\n•\n12.1k\n•\n814\nBrowse 1M+ models\nSpaces\nRunning\n679\n

In [58]:
import google.generativeai as genai
from IPython.display import Markdown, display
import os

# Configure Gemini API
genai.configure(api_key=os.getenv("GEMINIAI_API_KEY"))

def create_brochure(company_name, url):
    model = genai.GenerativeModel("gemini-1.5-flash")
    
    prompt = f"""
    Create a professional brochure for the company "{company_name}".
    Website: {url}
    Include company introduction, services, key features, and contact details.
    """

    response = model.generate_content(prompt)
    
    result = response.text
    display(Markdown(result))


In [59]:
create_brochure("HuggingFace", "https://huggingface.co")

## HuggingFace: Democratizing AI for Everyone

**(Brochure - Tri-fold design)**

**(Panel 1:  Left Panel - Image: A vibrant, abstract image representing AI and collaboration.  HuggingFace logo prominently displayed.)**

**Headline:**  Unlocking the Power of AI: Simple, Accessible, and Collaborative.

**Subheadline:** HuggingFace is the leading platform for building, training, and deploying machine learning models.  We empower developers, researchers, and businesses of all sizes to harness the transformative potential of AI.


**(Panel 2: Center Panel - Text focused on Services and Key Features)**

**Headline:**  Our Solutions: Empowering Your AI Journey

**Section 1:  Models:**

* **Access the world's largest Hub of pre-trained models:**  Explore thousands of readily available models for various tasks, from natural language processing and computer vision to speech recognition and more.  Find the perfect model to integrate into your applications or build upon for your unique needs.  *Key Feature: Easy Search & Filtering, Version Control, Model Evaluation Metrics*

* **Train and fine-tune your own models:** Leverage our robust infrastructure to train custom models tailored to your specific data and requirements.  *Key Feature: Scalable Infrastructure, Collaborative Training, Open-Source Tools*

* **Deploy models effortlessly:** Integrate trained models into your applications with our simple and intuitive APIs and SDKs.  *Key Feature:  Flexible Deployment Options,  Low-Latency Inference,  Multiple Cloud Integrations*


**Section 2: Datasets:**

* **Discover and share high-quality datasets:** Access a vast library of datasets, meticulously curated for diverse applications. Contribute your own datasets to the community and accelerate AI research and development. *Key Feature: Data Versioning, Data Quality Assurance, Open Licensing Options*


**Section 3: Spaces:**

* **Deploy and share your applications:** Showcase your AI creations with our interactive Spaces, providing a seamless way to demonstrate and deploy your models in an easily accessible environment.  *Key Feature: Simple Deployment, Interactive Web Interface, Community Engagement*


**(Panel 3: Right Panel - Image:  Diverse team working collaboratively. Contact Information)**

**Headline:**  Join the HuggingFace Community

**Body Text:**  Become a part of the thriving HuggingFace ecosystem, collaborate with leading AI experts, and contribute to the future of AI. Explore our documentation, engage with our community forum, and learn from our extensive resources.


**Contact Information:**

* **Website:** huggingface.co
* **Email:**  [Insert general inquiries email address]
* **Community Forum:** [Link to HuggingFace community forum]
* **Social Media:** [Links to relevant social media profiles]


**(Footer across all panels):  HuggingFace Logo & Copyright Information)**


**Note:**  This is a template. You should replace bracketed information with actual HuggingFace details and potentially adjust the imagery and text to better reflect their current branding and offerings.  Consider using high-quality images and a visually appealing layout for maximum impact.


In [None]:
def stream_brochure(company_name, url):
    stream = openai.chat.completions.create(
        model=MODEL,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
          ],
        stream=True
    )
    
    response = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in stream:
        response += chunk.choices[0].delta.content or ''
        response = response.replace("```","").replace("markdown", "")
        update_display(Markdown(response), display_id=display_handle.display_id)

In [None]:
stream_brochure("HuggingFace", "https://huggingface.co")

In [None]:
# Try changing the system prompt to the humorous version when you make the Brochure for Hugging Face:

stream_brochure("HuggingFace", "https://huggingface.co")