In [1]:
import pandas as pd
import numpy as np 
from bs4 import BeautifulSoup 
from IPython.display import Markdown, display, update_display
import requests 
from typing import List

In [2]:
import requests

response = requests.post(
    'http://localhost:11434/api/generate',
    json={
        "model": "llama3",
        "prompt": "Who is Hussain?",
        "stream": False
    }
)

# Print the full response to see what's inside
data = response.json()
print(data)


{'model': 'llama3', 'created_at': '2025-08-19T11:06:04.879216567Z', 'response': "Hussain (570-661 CE) was a revered Islamic saint, martyr, and grandson of the Prophet Muhammad. He played a crucial role in the early history of Islam and is considered one of the most important figures in Shia Islam.\n\nHussain ibn Ali was born in Medina, Arabia, to Imam Ali, the first Imam of Shia Islam, and Fatima, the daughter of the Prophet Muhammad. Hussain grew up in a family deeply rooted in Islamic values and became known for his piety, wisdom, and courage.\n\nIn 680 CE, Hussain led a caravan with his family and followers from Medina to Mecca, intending to perform the Hajj pilgrimage. However, he was confronted by the army of Yazid I, the Umayyad caliph who had seized power through treachery and usurpation. When Hussain refused to pledge allegiance to Yazid, a brutal battle ensued at Karbala, near modern-day Iraq.\n\nDespite being vastly outnumbered (around 72-100 men against an army of thousands)

In [3]:
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

class Website:
    url: str
    
    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        
        self.title = soup.title.string if soup.title else None
        print(self.title if self.title else "No title found")
        
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        
      
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"





In [4]:
ed = Website("https://edwarddonner.com")
ed.links

Home - Edward Donner


['https://edwarddonner.com/',
 'https://edwarddonner.com/connect-four/',
 'https://edwarddonner.com/outsmart/',
 'https://edwarddonner.com/about-me-and-about-nebula/',
 'https://edwarddonner.com/posts/',
 'https://edwarddonner.com/',
 'https://news.ycombinator.com',
 'https://nebula.io/?utm_source=ed&utm_medium=referral',
 'https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html',
 'https://patents.google.com/patent/US20210049536A1/',
 'https://www.linkedin.com/in/eddonner/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/',
 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/',
 'https://edwarddonner.com/2025/05/18/2025-ai-executive-briefing/',
 'https://edwarddonner.com/2025/04/21/the-complete-agentic-ai-engineering-course/',
 'https://edwarddonner.com/2025/04/21/the-

In [5]:
link_system_prompt = "You are provided with a list of links found on a webpage. \
You are able to decide which of the links would be most relevant to include in a brochure about the company, \
such as links to an About page, or a Company page, or Careers/Jobs pages.\n"
link_system_prompt += "You should respond in JSON as in this example:"
link_system_prompt += """
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}
"""

In [6]:
print(link_system_prompt)

You are provided with a list of links found on a webpage. You are able to decide which of the links would be most relevant to include in a brochure about the company, such as links to an About page, or a Company page, or Careers/Jobs pages.
You should respond in JSON as in this example:
{
    "links": [
        {"type": "about page", "url": "https://full.url/goes/here/about"},
        {"type": "careers page", "url": "https://another.full.url/careers"}
    ]
}



In [7]:
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. \
Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt


In [8]:
print(get_links_user_prompt(ed))

Here is the list of links on the website of https://edwarddonner.com - please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. Do not include Terms of Service, Privacy, email links.
Links (some might be relative links):
https://edwarddonner.com/
https://edwarddonner.com/connect-four/
https://edwarddonner.com/outsmart/
https://edwarddonner.com/about-me-and-about-nebula/
https://edwarddonner.com/posts/
https://edwarddonner.com/
https://news.ycombinator.com
https://nebula.io/?utm_source=ed&utm_medium=referral
https://www.prnewswire.com/news-releases/wynden-stark-group-acquires-nyc-venture-backed-tech-startup-untapt-301269512.html
https://patents.google.com/patent/US20210049536A1/
https://www.linkedin.com/in/eddonner/
https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/
https://edwarddonner.com/2025/05/28/connecting-my-courses-become-an-llm-expert-and-leader/
https://edwarddo

def messages_for(website):
    return [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_links_user_prompt(website)}
    ]
messages_for(ed)

In [27]:
from rich.console import Console
import requests
from rich.markdown import Markdown
console = Console()

def summarize(url):
    website = Website(url)
    response = requests.post(
        "http://localhost:11434/api/chat",
        json={
            "model": "llama3",
            "messages": messages_for(website),
            "stream": False
        }
    )
    data = response.json()
    return data['message']['content']
    try:
        # Convert model's JSON string into Python dict
        parsed = json.loads(content)
        return parsed   # returns {"links": [...]} as a Python dict
    except json.JSONDecodeError:
        # fallback in case model returns invalid JSON
        return {"error": "Model did not return valid JSON", "raw": content}



In [28]:
summary = summarize("https://edwarddonner.com")

Home - Edward Donner


In [29]:
md = Markdown(summary)
console.print(md)

<h1>NOW BROCHURE</h1>

In [35]:
def get_all_details(url):
    result = "landing page \n"
    result += Website(url).get_contents()
    links = summarize(url)

    for link in links['links']:
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result
    

In [36]:
print(get_all_details("https://edwarddonner.com"))

Home - Edward Donner
Home - Edward Donner


TypeError: string indices must be integers, not 'str'

In [None]:
system_prompt = "You are an assistant that analyzes the contents of several relevant pages from a company website \
and creates a short brochure about the company for prospective customers, investors and recruits. Respond in markdown.\
Include details of company culture, customers and careers/jobs if you have the information."

In [None]:
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000] # Truncate if more than 5,000 characters
    return user_prompt
