# Company brochure

Create a product that builds a brochure for a company, aimed at prospective clients, investors and potential recruits, starting from the company name and its LinkedIn page.
There are two LLM calls: the first filters all the links found on the LinkedIn page down to the relevant ones; the second assembles the brochure from the content scraped from those URLs (including the main page).

Outputs in GUI (web browser):
![output.png](output.png)

In [1]:
import ollama
import requests
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from pydantic import BaseModel
from urllib.parse import urljoin
import time
import gradio

**Define the available models and the company input**

In [2]:
# Ollama model tags offered in the UI. Kept as a list (not a set) so the
# choices are presented in a stable, predictable order on every run.
MODELS = ["qwen2.5:7b", "llama3.2:3b", "mistral:7b"]
# Default company name and page used to pre-fill the UI inputs.
COMPANY_NAME = "Construo"
COMPANY_WEBSITE = "https://www.linkedin.com/company/construo-ag"

First, let's define the helper classes and functions

In [3]:
# Browser-like request headers sent with every scrape request.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}


class Website:
    """A scraped web page: its title, visible body text and outgoing links.

    The page is fetched when the object is constructed. A failed request is
    retried once after a short pause; if the retry fails too, the instance is
    still usable but empty (``title == "Failed to load"``, ``text == ""``,
    ``links == []``).
    """

    MAX_LINKS = 50  # upper bound on the number of links kept per page

    def __init__(self, url):
        """Fetch *url* and extract its title, body text and absolute links."""
        self.url = url
        body = self._fetch(url)
        if body is None:
            self.body = b""
            self.text = ""
            self.title = "Failed to load"
            self.links = []
            return
        self.body = body

        soup = BeautifulSoup(self.body, 'html.parser')
        # ``soup.title.string`` is None for an empty <title> or one that
        # contains nested tags, so fall back in that case as well.
        page_title = soup.title.string if soup.title else None
        self.title = page_title or "No title found"
        if soup.body:
            # Drop elements that carry no readable text before extracting it.
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        hrefs = [anchor.get('href') for anchor in soup.find_all('a')]
        self.links = self._absolute_links(url, hrefs)

    @staticmethod
    def _fetch(url):
        """Return the raw response body of *url*, or None if both attempts fail."""
        for is_retry in (False, True):
            try:
                response = requests.get(url, headers=headers, timeout=10)
                response.raise_for_status()
                return response.content
            except requests.RequestException as e:
                if is_retry:
                    print(f"❌ Retry failed: {e}. Using empty content.")
                    return None
                print(f"❌ Failed to scrape {url}: {e}. Retrying once...")
                time.sleep(2)
        return None

    @staticmethod
    def _absolute_links(base_url, hrefs, limit=MAX_LINKS):
        """Resolve *hrefs* against *base_url*, de-duplicate and cap at *limit*.

        Empty/missing hrefs are skipped. Document order is preserved, so the
        cap always keeps the same (first) links instead of an arbitrary subset.
        """
        absolute = (urljoin(base_url, href) for href in hrefs if href)
        return list(dict.fromkeys(absolute))[:limit]

    def get_contents(self):
        """Return the page title and text formatted as a prompt section."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"


# One entry of the "links" array in the JSON the link-selection LLM call returns.
class Link(BaseModel):
    type: str  # short label for the page, e.g. "about page"
    url: str  # URL of the page


# Top-level object of the link-selection JSON reply: {"links": [...]}.
class AllLinks(BaseModel):
    links: list[Link]  # the selected pages; may be empty


def get_links(url, selected_model):
    """Ask the LLM which links found on *url* are relevant for a brochure.

    Args:
        url: Page to scrape for candidate links.
        selected_model: Name of the Ollama model to query.

    Returns:
        AllLinks: The links selected by the model. The list is empty when the
        page yielded no links or when the model's reply could not be
        validated against the ``AllLinks`` schema.
    """
    website = Website(url)
    if not website.links:
        # Nothing to choose from (page failed to load or contains no anchors):
        # skip the LLM call instead of inviting the model to invent URLs.
        return AllLinks(links=[])

    response = ollama.chat(
        model=selected_model,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
        ],
        format='json'
    )
    content = (response['message']['content'] or '').strip()
    try:
        return AllLinks.model_validate_json(content)
    except ValueError as e:
        # pydantic's ValidationError derives from ValueError; it covers both
        # malformed JSON and JSON that does not match the schema.
        print(f"⚠️ Failed to parse LLM response: {e}\nRaw response: {content}")
        return AllLinks(links=[])


def get_all_details(url, selected_model):
    """Collect the text of the landing page and of every LLM-selected page.

    Args:
        url: Landing page of the company.
        selected_model: Name of the Ollama model used to select the links.

    Returns:
        str: A "Landing page:" section followed by one section per selected
        link, each headed by the link's type. Pages whose scrape raises are
        left out entirely.
    """
    sections = ["Landing page:\n"]
    # Scraping is best-effort: any failure is reported and the page skipped.
    try:
        sections.append(Website(url).get_contents())
    except Exception as e:
        print(f"⚠️ Failed to scrape main page: {e}")
        sections.append("Main page content unavailable.\n\n")

    links = get_links(url, selected_model)
    print("Found links:", [f"{link.type}: {link.url}" for link in links.links])
    for link in links.links:
        try:
            page_contents = Website(link.url).get_contents()
        except Exception as e:
            print(f"⚠️ Failed to scrape {link.url}: {e}")
            continue
        # The heading is added only after a successful scrape, so a failure
        # never leaves an orphan heading in the prompt.
        sections.append(f"\n\n{link.type}\n{page_contents}")
    return "".join(sections)

Our system prompt is a one-shot prompt (i.e. it gives one example of the expected output)

In [4]:
# System prompt for the link-selection call. It includes one example of the
# expected JSON reply: an object with a "links" array of {"type", "url"} items.
link_system_prompt = """You are provided with a list of links from a company webpage. Identify the most relevant links for a company brochure, such as About, Company, Services, or Careers/Jobs pages. Exclude Terms of Service, Privacy, or email links. Respond in JSON format like this:
{
  "links": [
    {"type": "about page", "url": "https://full.url/goes/here/about"},
    {"type": "careers page", "url": "https://another.full.url/careers"}
  ]
}
Return only the JSON object, no extra text."""


def get_links_user_prompt(website):
    """Build the user prompt listing every link found on *website*.

    Args:
        website: Object exposing ``url`` (str) and ``links`` (iterable of str).

    Returns:
        str: The instruction text followed by one link per line.
    """
    # Joined outside the f-string: a backslash inside an f-string expression
    # is a SyntaxError on Python versions before 3.12.
    link_lines = "\n".join(website.links)
    return (
        f"Here is the list of links on the website of {website.url}. "
        "Select relevant links for a company brochure. Use full HTTPS URLs.\n"
        "Links:\n"
        f"{link_lines}\n"
    )


# System prompts for the brochure-writing call, one per selectable attitude.
system_prompt_funny = """You are an assistant that analyzes content from a company website and creates a short, humorous, entertaining brochure in markdown for prospective customers, investors, and recruits. Include details on company culture, customers, and careers if available. Use emojis, bullet points, and a fun tone."""
# The serious variant must not ask for emojis or a fun tone.
system_prompt_serious = """You are an assistant that analyzes content from a company website and creates a short, yet concise, serious brochure in markdown for prospective customers, investors, and recruits. Include details on company culture, customers, and careers if available. Use bullet points and a professional, factual tone."""


def get_brochure_user_prompt(company_name, url, selected_model, max_chars=5_000):
    """Build the user prompt for the brochure-writing LLM call.

    Args:
        company_name: Name of the company, quoted at the top of the prompt.
        url: Landing page whose content (plus selected sub-pages) is included.
        selected_model: Name of the Ollama model used to select the sub-pages.
        max_chars: Maximum length of the returned prompt in characters; the
            scraped content is cut off beyond it. ``None`` disables the cut.

    Returns:
        str: The prompt, truncated to at most *max_chars* characters.
    """
    user_prompt = (
        f"You are looking at a company called: {company_name}\n"
        "Here are the contents of its landing page and relevant pages:\n"
        + get_all_details(url, selected_model)
    )
    return user_prompt[:max_chars]


def stream_brochure(company_name, url, system_prompt_selection, selected_model):
    """Generate the brochure and yield it incrementally while it streams in.

    Args:
        company_name: Name of the company the brochure is about.
        url: Landing page to scrape for content.
        system_prompt_selection: ``'Funny'`` selects the humorous system
            prompt; any other value selects the serious one.
        selected_model: Name of the Ollama model to use.

    Yields:
        str: The full brochure text received so far, with markdown code-fence
        markers removed.
    """
    system_prompt = system_prompt_funny if system_prompt_selection == 'Funny' else system_prompt_serious
    stream = ollama.chat(
        model=selected_model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url, selected_model)}
        ],
        stream=True
    )
    raw_response = ""
    for chunk in stream:
        if 'message' in chunk and 'content' in chunk['message']:
            raw_response += chunk['message']['content'] or ''
            # Clean a copy of the raw text so a fence split across chunks is
            # still recognised. Only the fence markers are stripped; the word
            # "markdown" in the brochure text itself is left intact.
            yield raw_response.replace("```markdown", "").replace("```", "")

User interface:

In [5]:
# JavaScript snippet handed to the Interface via ``js=``: it reloads the page
# with the ``__theme=dark`` query parameter unless that is already set.
force_dark_mode = """
function refresh() {
    const url = new URL(window.location);
    if (url.searchParams.get('__theme') !== 'dark') {
        url.searchParams.set('__theme', 'dark');
        window.location.href = url.href;
    }
}
"""

# Input widgets, in the positional order of stream_brochure's parameters.
brochure_inputs = [
    gradio.Textbox(label="Company name", value=COMPANY_NAME),
    gradio.Textbox(label="Website URL", value=COMPANY_WEBSITE),
    gradio.Radio(['Funny', 'Serious'], label="Select attitude", value='Funny'),
    gradio.Dropdown(MODELS, label="Select model", value="qwen2.5:7b"),
]
brochure_outputs = [gradio.Markdown(label="Brochure")]

view = gradio.Interface(
    fn=stream_brochure,
    inputs=brochure_inputs,
    outputs=brochure_outputs,
    flagging_mode="never",
    js=force_dark_mode,
)

# Quick check without the UI: list(stream_brochure(COMPANY_NAME, COMPANY_WEBSITE, 'Funny', 'qwen2.5:7b'))[-1]

Main call

In [6]:
# Start the Gradio app defined above.
view.launch()

* Running on local URL:  http://127.0.0.1:7861
* To create a public link, set `share=True` in `launch()`.


