# Company brochure generator

Build a brochure that presents a company to prospective clients, investors and potential recruits.

In [None]:
import os, sys
import requests
import json
import time
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI
import ollama
from pydantic import BaseModel
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait

In [None]:
# Pull variables from a local .env file into the process environment, then
# make sure the OpenAI key is present before any API call is attempted.
load_dotenv()

openai_api_key = os.getenv('OPENAI_API_KEY')

if openai_api_key:
    print("OpenAI API key loaded")
else:
    print("No OpenAI API key defined")
    sys.exit(1)

In [None]:
def normalize_url(base_url, href):
    """Resolve a link found on a page into an absolute URL.

    Args:
        base_url: URL of the page the link was found on.
        href: Raw ``href`` attribute value (absolute, root-relative,
            path-relative, protocol-relative, or a non-HTTP scheme such as
            ``mailto:``).

    Returns:
        ``href`` unchanged when it is already an ``http(s)`` URL; otherwise
        ``href`` resolved against ``base_url``. As before, a single trailing
        slash on a relative ``href`` is dropped from the result.
    """
    # Local import keeps this block self-contained; urllib is stdlib.
    from urllib.parse import urljoin

    if href.startswith(('http://', 'https://')):
        return href

    # urljoin applies the standard resolution rules, so "about",
    # "/about", "//cdn.host/x" and "mailto:..." are all handled correctly
    # instead of being glued onto the base string.
    resolved = urljoin(base_url, href)
    if href.endswith('/') and resolved.endswith('/'):
        resolved = resolved[:-1]
    return resolved

# OpenAI-backed pipeline (gpt-4o-mini)
def get_links(url, selenium=False):
    """Ask gpt-4o-mini which links on ``url`` belong in a brochure.

    Args:
        url: Page to scrape.
        selenium: Forwarded to ``Website``; selects the Selenium scraper
            instead of plain ``requests``.

    Returns:
        The model's reply parsed from JSON.
    """
    client = OpenAI()
    site = Website(url, selenium)
    conversation = [
        {"role": "system", "content": site.system_prompt},
        {"role": "user", "content": site.user_prompt},
    ]
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=conversation,
        # JSON mode: the reply is requested as a single JSON object.
        response_format={"type": "json_object"},
    )
    raw_reply = completion.choices[0].message.content
    return json.loads(raw_reply)

def get_brochure(json_str, _stream=False):
    """Generate a Markdown brochure with gpt-4o-mini.

    Args:
        json_str: Company details handed to ``Brochure`` (the notebook passes
            the value returned by ``get_links``).
        _stream: When true, render the brochure live in the notebook as
            chunks arrive.

    Returns:
        The brochure text. In streaming mode the Markdown code-fence markers
        are removed from the returned/displayed text so the notebook renders
        the brochure rather than a code block.
    """
    openai = OpenAI()
    brochure = Brochure(json_str)
    response = openai.chat.completions.create(
        model = "gpt-4o-mini",
        messages = [
            {"role": "system", "content": brochure.system_prompt},
            {"role": "user", "content": brochure.user_prompt}
        ],
        stream = _stream
    )
    if not _stream:
        return response.choices[0].message.content

    raw = ""
    cleaned = ""
    display_handle = display(Markdown(""), display_id=True)
    for chunk in response:
        # Some stream chunks carry no choices (e.g. a trailing usage chunk).
        if not chunk.choices:
            continue
        raw += chunk.choices[0].delta.content or ''
        # Strip only the fence markers. Cleaning is redone from the raw text on
        # every chunk so a fence split across chunks is still removed, and the
        # word "markdown" inside the brochure body is left alone.
        cleaned = raw.replace("```markdown", "").replace("```", "")
        update_display(Markdown(cleaned), display_id=display_handle.display_id)
    return cleaned

def brochure_maker(url, selenium=False, _stream=True):
    """Run the full OpenAI pipeline: pick links for ``url``, then write the brochure.

    Returns the brochure text only when ``_stream`` is false; in streaming
    mode the brochure has already been displayed and ``None`` is returned.
    """
    result = get_brochure(get_links(url, selenium), _stream)
    return None if _stream else result

# Open-source variants of the functions above (Ollama, llama3.2)
# Response schema passed to ollama.chat() as `format` (via model_json_schema()):
# an object with a single "links" field holding a list of strings.
# Documented with comments on purpose: a class docstring would be emitted as
# "description" in the generated JSON schema.
# NOTE(review): Website's system prompt asks for a list of objects
# (type/description/url) under "links", while this schema only allows strings -
# confirm which shape the Ollama path is meant to produce.
class Links(BaseModel):
    links: List[str]

def get_links_oss(url, selenium=False):
    """Ask the local llama3.2 model (via Ollama) which links on ``url`` matter.

    Args:
        url: Page to scrape.
        selenium: Forwarded to ``Website`` to select the Selenium scraper.

    Returns:
        The model's reply parsed from JSON; the reply format is constrained
        by the ``Links`` JSON schema.
    """
    site = Website(url, selenium)
    conversation = [
        {"role": "system", "content": site.system_prompt},
        {"role": "user", "content": site.user_prompt},
    ]
    reply = ollama.chat(
        model="llama3.2",
        messages=conversation,
        format=Links.model_json_schema(),
    )
    raw_reply = reply['message']['content']
    return json.loads(raw_reply)

def get_brochure_oss(json_str, _stream=False):
    """Generate a Markdown brochure with the local llama3.2 model (via Ollama).

    Args:
        json_str: Company details handed to ``Brochure``.
        _stream: When true, render the brochure live in the notebook as
            chunks arrive.

    Returns:
        The brochure text exactly as produced by the model. Code-fence
        markers are removed only from what is displayed while streaming.
    """
    prompts = Brochure(json_str)
    reply = ollama.chat(
        model="llama3.2",
        messages=[
            {"role": "system", "content": prompts.system_prompt},
            {"role": "user", "content": prompts.user_prompt},
        ],
        stream=_stream,
    )
    if _stream:
        accumulated = ""
        handle = display(Markdown(""), display_id=True)
        for part in reply:
            accumulated += part["message"]["content"] or ""
            # Fences are hidden in the live view; the returned text keeps them.
            rendered = accumulated.replace("```", "")
            update_display(Markdown(rendered), display_id=handle.display_id)
        return accumulated
    return reply['message']['content']

def brochure_maker_oss(url, selenium=False, _stream=True):
    """Run the full Ollama pipeline: pick links for ``url``, then write the brochure.

    Always returns the brochure text, whether or not it was streamed.
    """
    return get_brochure_oss(get_links_oss(url, selenium), _stream)

In [None]:
class Website:
    """Scrape one web page and prepare the link-selection prompts for it.

    Constructing an instance fetches ``url`` (with ``requests``, or with a
    headless Chrome driven by Selenium when ``selenium`` is true), extracts
    the title, visible text and links, and builds the ``system_prompt`` /
    ``user_prompt`` strings consumed by the link-selection calls.
    """

    url: str
    title: str
    # Raw page source: bytes on the requests path, str on the Selenium path.
    body: str
    text: str
    hrefs: List[str]
    summary: str
    system_prompt: str
    user_prompt: str
    selenium: bool

    # Seconds to wait for the plain HTTP fetch; without a timeout
    # requests.get() can block indefinitely on an unresponsive host.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, selenium=False):
        """Fetch ``url`` and populate every attribute used by the prompts.

        Args:
            url: Page to scrape.
            selenium: Use headless Chrome instead of ``requests`` (for pages
                that render their content with JavaScript).
        """
        self.url = url
        self.selenium = selenium
        if selenium:
            self._selenium()
        else:
            self._requests()
        # The prompts are stored as plain string attributes; the builders are
        # separate private methods so they stay callable after construction.
        self.system_prompt = self._build_system_prompt()
        self.user_prompt = self._build_user_prompt()

    def _wait(self, driver, timeout=20):
        """Block until the page looks fully rendered, or ``timeout`` elapses.

        First waits for ``<body>`` to contain text, then polls the page source
        every 0.5 s and returns once it has stopped changing.
        """
        from selenium.common.exceptions import WebDriverException
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By

        try:
            # Wait for the body to contain something
            WebDriverWait(driver, timeout).until(
                lambda d: d.find_element(By.TAG_NAME, "body").text.strip() != ""
            )
        except WebDriverException:
            # Best effort: a timeout (or a transient driver error) here is not
            # fatal, the stability loop below still gets its chance.
            pass

        start = time.monotonic()
        prev_html = ""
        stable_html_count = 0
        # Halt at timeout
        while time.monotonic() - start < timeout:
            cur_html = driver.page_source
            if cur_html == prev_html:
                stable_html_count += 1
                # More than 5 consecutive identical snapshots => stable
                if stable_html_count > 5:
                    return
            else:
                stable_html_count = 0
            prev_html = cur_html
            time.sleep(0.5)

    def _parse(self, html):
        """Extract title, links and visible text from ``html`` into ``self``."""
        soup = BeautifulSoup(html, 'html.parser')
        title_tag = soup.title
        has_title = title_tag is not None and title_tag.string
        self.title = title_tag.string if has_title else "Unknown"
        self.body = html
        # Documents without a <body> fall back to the whole parse tree.
        root = soup.body if soup.body is not None else soup
        for no_text_tag in root(["script", "style", "img", "input"]):
            no_text_tag.decompose()
        self.hrefs = [
            normalize_url(self.url, anchor['href'])
            for anchor in soup.find_all('a', href=True)
            if not anchor['href'].startswith(('#', 'javascript:'))
        ]
        self.text = root.get_text(separator="\n", strip=True)

    def _requests(self):
        """Fetch the page with a plain HTTP GET and parse it."""
        response = requests.get(self.url, timeout=self.REQUEST_TIMEOUT)
        self._parse(response.content)

    def _selenium(self):
        """Fetch the page with headless Chrome, wait for it to render, parse it."""
        options = Options()
        options.add_argument("--headless")
        options.add_argument("--disable-gpu")
        options.add_argument("--no-sandbox")
        options.add_argument("--window-size=1920,1080")
        # These will make it look more like a human / normal behavior instead of an automated headless browser
        options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36")
        options.add_argument("--disable-blink-features=AutomationControlled")
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option('useAutomationExtension', False)

        driver = webdriver.Chrome(options=options)
        try:
            driver.get(self.url)
            # Wait until load
            self._wait(driver)
            html = driver.page_source
        finally:
            # Always shut the browser down, even if navigation failed.
            driver.quit()

        self._parse(html)

    def printer(self):
        """Print a short summary of what was scraped."""
        print(f'Website: {self.url}') 
        print(f'Title: {self.title}')
        print(f'Hrefs: {self.hrefs}')
        print(f'Text Head: {self.text[:100]}...')

    def _build_system_prompt(self):
        """Return the system prompt describing the link-selection task."""
        instructions = f"""
    You are provided with a list of Links found on a website. 
    You have to decide which of the links would be most relevant to include in a brochure about the company found at {self.url}. 
    We are interested in pages that may include information of interest such as; About, Company, Career, Jobs, Contact, Social, or anything 
    relevant that can help us build an attractive brochure. 
    Do not include Terms of Service, Privacy Statements, Cookie notices, Payment methods or anything irrelevant. 
    You always respond in JSON format. 
    You do not output anything else other than the final JSON. 
    Your JSON has to look like the following:
    """
        # Plain (non-f) string: the braces are literal JSON, not placeholders.
        schema_example = """
    {
        "links": [
            {"type": "base_url", "description": "<description>", "url": "<base_url>"},
            {"type": "name", "description": "<company_description>", "name": "<company_name>"},
            {"type": "<type>", "description": "<description>", "url": "<full_url>"},
            ...
        ]
    }
    """
        field_rules = """
    The <base_url> includes only the base URL of the company, you know that already from the link provided above. 
    The <company_description> can be a small sentence up to 10 words describing the company function. 
    The <company_name> has to be the actual company name that you can find out. 
    The <type> can be 1 word for the page type. Eg. About, Company, Career, Jobs, Contact, Social, etc. Do maintain the other static types as provided in the example. 
    The <description> can be a small sentence up to 10 words describing the page function. 
    The <full_url> has to be the actual full url of the page. 
    The `...` mean that you have to do the same for all the relevant links that you picked out and fill the JSON object. 
    You have a limit of up to 30 links. There is no minimum limit.
    """
        return instructions + schema_example + field_rules

    def _build_user_prompt(self):
        """Return the user prompt listing the page URL, title and links."""
        return f"""
        The website URL is {self.url}. The website Title is {self.title}. The website Links are provided below: 
        {self.hrefs}
        """

In [None]:
# Scrape gtsig.eu over plain HTTP (selenium defaults to False) and print a summary.
website = Website('https://gtsig.eu')
website.printer()

In [None]:
# Same plain-HTTP scrape against airbnb.com.
website = Website('https://airbnb.com')
website.printer()

In [None]:
# Scrape airbnb.com again, this time through the headless-Chrome (Selenium) path.
website = Website('https://airbnb.com', selenium=True)
website.printer()

In [None]:
# Link selection for gtsig.eu via OpenAI; the bare name shows the result in the notebook.
links1 = get_links("https://gtsig.eu")
links1

In [None]:
# Link selection for airbnb.com using the plain-HTTP scrape.
links2 = get_links("https://airbnb.com")
links2

In [None]:
# Link selection for airbnb.com using the Selenium scrape.
links3 = get_links("https://airbnb.com", selenium=True)
links3

In [None]:
class Brochure:
    """Hold company details and the prompts used to turn them into a brochure.

    After construction, ``system_prompt`` and ``user_prompt`` are plain
    strings ready to be sent as chat messages.
    """

    # Company details as passed in by the caller. Annotated ``str`` in the
    # original; the notebook actually passes the parsed object from get_links.
    json: str

    def __init__(self, json):
        """Store the company details and build both prompts.

        Args:
            json: Company details, either already-serialised text or a
                JSON-compatible object (dict/list).
        """
        self.json = json
        # Prompts are stored as string attributes; the builders are separate
        # private methods so they stay callable after construction.
        self.system_prompt = self._build_system_prompt()
        self.user_prompt = self._build_user_prompt()

    def printer(self):
        """Print the stored company details."""
        print(f'JSON: {self.json}')

    def _details_as_text(self):
        """Return the company details as text suitable for the prompt.

        Strings are passed through untouched. Other objects are serialised as
        real JSON rather than their Python repr, because the system prompt
        tells the model it will receive JSON.
        """
        if isinstance(self.json, str):
            return self.json
        try:
            # `json` here is the module: the class-level annotation and the
            # __init__ parameter of the same name do not bind it in this scope.
            return json.dumps(self.json, ensure_ascii=False, indent=2)
        except (TypeError, ValueError):
            # Not JSON-serialisable: fall back to the previous behaviour.
            return str(self.json)

    def _build_system_prompt(self):
        """Return the system prompt describing the brochure-writing task."""
        return """
    You are a brochure builder. 
    You are provided with a JSON structured object that includes company details. 
    You have to build a short brochure showcasing the company to attract clients, recruits and stakeholders. 
    The output has to be formatted in Markdown. 
    Do not output thoughts or anything else other than the final Markdown brochure.
    """

    def _build_user_prompt(self):
        """Return the user prompt carrying the company details."""
        return f"""
    The company details follow: 
    {self._details_as_text()}
    """

In [None]:
# Non-streaming brochure for links1, rendered as Markdown once complete.
brochure = get_brochure(links1)
display(Markdown(brochure))

In [None]:
# Non-streaming brochure for links3 (the Selenium-scraped airbnb.com links).
brochure = get_brochure(links3)
display(Markdown(brochure))

In [None]:
# Streaming brochure for links1; get_brochure updates the display as chunks arrive.
brochure = get_brochure(links1, _stream=True)

In [None]:
# Streaming brochure for links2 (the plain-HTTP airbnb.com links).
brochure = get_brochure(links2, _stream=True)

In [None]:
# End-to-end OpenAI run; streaming is on by default, so the call itself returns None.
brochure_maker("https://www.skroutz.gr")

In [None]:
# Link selection for gtsig.eu via the local Ollama model.
links4 = get_links_oss("https://gtsig.eu")
links4

In [None]:
# Non-streaming brochure from the Ollama model, rendered as Markdown once complete.
brochure = get_brochure_oss(links4)
display(Markdown(brochure))

In [None]:
# Streaming brochure from the Ollama model for links4.
brochure = get_brochure_oss(links4, _stream=True)

In [None]:
# End-to-end Ollama run on airbnb.com using the Selenium scrape (streams by default).
brochure_maker_oss("https://airbnb.com", selenium=True)

In [None]:
# End-to-end Ollama run on skroutz.gr using the plain-HTTP scrape (streams by default).
brochure_maker_oss("https://www.skroutz.gr")