In [26]:
from bs4 import BeautifulSoup
import requests


# Browser-like request headers sent with every page fetch in this notebook.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    )
}


def fetch_website_contents(url, timeout=10):
    """
    Return the title and text contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit.

    The page is requested with the module-level ``headers``. Script, style,
    img and input elements are dropped from the body before its text is
    extracted, one text fragment per line.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The title, a blank line, then the body text, cut to 2,000 characters.
        The title is "No title found" when the page has no usable <title>;
        the body text is empty when the page has no <body>.

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    # soup.title.string is None when <title> is empty or contains nested tags;
    # fall back to the placeholder so the concatenation below cannot fail.
    title = (soup.title.string if soup.title else None) or "No title found"

    if soup.body:
        # Calling a tag with a list of names is shorthand for find_all().
        for irrelevant in soup.body(["script", "style", "img", "input"]):
            irrelevant.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]


def fetch_website_links(url, timeout=10):
    """
    Return the links on the website at the given url.

    I realize this is inefficient as we're parsing twice! This is to keep the code in the lab simple.
    Feel free to use a class and optimize it!

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` may block indefinitely on a stalled server.

    Returns:
        The non-empty ``href`` values of every <a> element, in document order
        and exactly as written in the page (relative links stay relative).

    Raises:
        requests.exceptions.RequestException: if the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    # Anchors without an href (or with an empty one) are dropped.
    return [href for href in hrefs if href]


In [27]:
import requests
import json

# URL that call_ollama() POSTs chat requests to (a server on localhost).
OLLAMA_URL = "http://localhost:11434/api/chat"
# Model name sent in the "model" field of each request payload.
MODEL = "llama3"

In [28]:
def call_ollama(messages):
    """
    Send a non-streaming chat request to Ollama and return the reply text.

    Args:
        messages: List of chat messages, each a dict with "role" and
            "content" keys, passed through unchanged as the request's
            "messages" field.

    Returns:
        The "content" string of the "message" object in the response.

    Raises:
        RuntimeError: if the response carries an "error" field or does not
            have the expected {"message": {"content": ...}} shape. The
            exception message includes the details so they are not lost
            when the caller logs or re-raises it.
        requests.exceptions.RequestException: if the HTTP request itself fails.
    """
    payload = {
        "model": MODEL,
        "messages": messages,
        "stream": False
    }

    response = requests.post(OLLAMA_URL, json=payload)
    data = response.json()

    if not isinstance(data, dict):
        raise RuntimeError(f"Invalid Ollama response format: {data!r}")

    if "error" in data:
        raise RuntimeError(f"Ollama returned error: {data['error']}")

    message = data.get("message")
    # Guard the nested lookup too, so a malformed message does not surface
    # as a bare KeyError/TypeError.
    if not isinstance(message, dict) or "content" not in message:
        raise RuntimeError(f"Invalid Ollama response format: {data!r}")

    return message["content"]


In [34]:
import re
import json

# Matches either a complete JSON string literal or a JS-style line comment.
# The string alternative comes first, so a "//" inside a string value
# (e.g. "https://example.com") is consumed as part of the string and is
# never mistaken for the start of a comment.
_JSON_STRING_OR_LINE_COMMENT = re.compile(r'"(?:\\[\s\S]|[^"\\])*"|//[^\n]*')


def extract_json_from_text(text):
    """
    Extract and parse the first JSON object found in model output.

    Markdown code fences are removed, the text before the first "{" is
    ignored, JS-style "//" line comments outside string literals are
    stripped, control characters are removed, and the first complete JSON
    object is decoded. Anything after that object is ignored.

    Args:
        text: Raw text returned by the model.

    Returns:
        The decoded JSON object (a dict).

    Raises:
        Exception: if the text contains no "{" followed by a "}".
        json.JSONDecodeError: if the candidate text is not valid JSON.
    """
    # Remove markdown fences
    text = text.replace("```json", "").replace("```", "")

    # Locate the start of the first JSON block
    start = text.find("{")
    if start == -1 or "}" not in text[start:]:
        raise Exception("No JSON object found in model output")

    json_str = text[start:]

    # Remove JS-style comments, leaving string literals (and the URLs inside
    # them) untouched
    json_str = _JSON_STRING_OR_LINE_COMMENT.sub(
        lambda m: m.group() if m.group().startswith('"') else "",
        json_str,
    )

    # Remove control characters
    json_str = re.sub(r'[\x00-\x1F\x7F]', '', json_str)

    # raw_decode stops at the end of the first object, so trailing prose
    # (even prose containing braces) does not break parsing.
    parsed, _ = json.JSONDecoder().raw_decode(json_str)
    return parsed


In [39]:
def select_relevant_links(url):
    """
    Ask the model which links on the page at ``url`` are worth including
    in a brochure.

    The links scraped from the page are sent to the model, its reply is
    parsed as JSON, and each returned entry is normalised: relative URLs
    are resolved against ``url``, and entries that are malformed, empty,
    in-page anchors ("#...") or not http(s) are dropped.

    Args:
        url: Address of the page whose links should be considered.

    Returns:
        The parsed model reply as a dict whose "links" value is a list of
        dicts, each with an absolute http(s) "url" (plus whatever other
        keys the model supplied, normally "type").

    Raises:
        Exception: re-raised after printing the raw model output when the
            reply cannot be parsed.
    """
    # Local import so this function brings its own dependency into scope.
    from urllib.parse import urljoin

    links = fetch_website_links(url)

    system_prompt = """
You are given the list of links found on a company website.
Select only the links that are most relevant for a company brochure,
such as About, Company, Products/Services, Customers or Careers pages.
Leave out in-page anchors, sign-in, legal/privacy and email links.
Give every selected link as a full absolute URL.

Return ONLY valid minified JSON.
Do not include markdown.
Do not include explanation.
Do not include comments.
Do not include trailing commas.
Ensure it is valid JSON that can be parsed by json.loads.

Use exactly this structure:

{
  "links": [
    {"type": "about page", "url": "https://example.com/about"}
  ]
}
"""

    user_prompt = f"""
Here are the links from {url}:

{chr(10).join(links)}
"""

    result = call_ollama([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ])

    try:
        parsed = extract_json_from_text(result)

        selected = []
        for link in parsed.get("links") or []:
            raw_url = link.get("url") if isinstance(link, dict) else None
            if not isinstance(raw_url, str):
                continue
            raw_url = raw_url.strip()
            # Skip empty URLs and in-page anchors such as "#maincontent".
            if not raw_url or raw_url.startswith("#"):
                continue
            # The model often echoes relative paths ("/careers/"); resolve
            # them against the page URL instead of discarding them.
            absolute_url = urljoin(url, raw_url)
            # Drops mailto:, javascript:, tel: and similar schemes.
            if absolute_url.startswith("http"):
                selected.append({**link, "url": absolute_url})

        parsed["links"] = selected
        return parsed

    except Exception:
        print("Failed to parse model output:")
        print(result)
        raise


In [40]:
def build_context(url, max_chars=12000):
    """
    Build the text context for the brochure from the homepage and the
    pages behind the links selected by the model.

    Args:
        url: Address of the company homepage.
        max_chars: Upper bound on the length of the returned text.

    Returns:
        A string starting with a "## Homepage" section followed by one
        "## <type>" section per fetched link, cut to ``max_chars``
        characters.
    """
    print("Fetching homepage...")
    homepage = fetch_website_contents(url)

    print("Selecting relevant links...")
    relevant = select_relevant_links(url)

    combined = f"## Homepage\n\n{homepage}\n"

    for link in relevant["links"]:
        # Anything appended beyond the budget is truncated away below, so
        # stop issuing HTTP requests once the budget is already full.
        if len(combined) >= max_chars:
            break

        print(f"Fetching: {link['url']}")
        try:
            page_content = fetch_website_contents(link["url"])
        except requests.RequestException as exc:
            # One unreachable page should not abort the whole brochure.
            print(f"Skipping {link['url']}: {exc}")
            continue

        # Fall back to a generic heading if the entry has no "type".
        combined += f"\n\n## {link.get('type') or 'Page'}\n{page_content}"

    return combined[:max_chars]

In [41]:
def create_brochure(company_name, url):
    """
    Generate a markdown brochure for a company from its website.

    Args:
        company_name: Name of the company, included in the user prompt.
        url: Address of the company homepage used to build the context.

    Returns:
        The brochure text returned by the model.
    """
    context = build_context(url)

    system_prompt = """You are a professional business copywriter.Create a polished brochure in markdown.Target: customers, investors, recruits.
    Include culture, offerings, hiring info if available.
    No code blocks."""

    # Keep the company name, the label and the scraped context on separate
    # lines; run together they read as "Company: <name>Website Information:".
    user_prompt = f"""Company: {company_name}

Website Information:
{context}"""

    brochure = call_ollama([
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ])

    return brochure


In [42]:
if __name__ == "__main__":
    # Prompt for the two inputs, generate the brochure, then print it
    # under a banner.
    company = input("Enter Company Name: ")
    url = input("Enter Website URL: ")

    result = create_brochure(company, url)

    print("\n\n=========== BROCHURE ===========\n", result, sep="\n")


Fetching homepage...
Selecting relevant links...
Failed to parse model output:
{
  "links": [
    {
      "type": "accessibility",
      "url": "/in/corporate/accessibility/"
    },
    {
      "type": "maincontent",
      "url": "#maincontent"
    },
    {
      "type": "index.html",
      "url": "/in/index.html"
    },
    {
      "type": "cloud",
      "url": "/in/cloud/"
    },
    {
      "type": "applications",
      "url": "/in/applications/"
    },
    {
      "type": "database/technologies",
      "url": "/in/database/technologies/"
    },
    {
      "type": "java/technologies/downloads",
      "url": "/in/java/technologies/downloads/"
    },
    {
      "type": "careers",
      "url": "/in/careers/"
    },
    {
      "type": "countries-list.html#countries",
      "url": "/in/countries-list.html#countries"
    },
    {
      "type": "cloud/sign-in.html",
      "url": "/in/cloud/sign-in.html"
    },
    {
      "type": "cloud/free",
      "url": "/in/cloud/free/"
    },
    {

JSONDecodeError: Expecting ',' delimiter: line 1 column 963 (char 962)