In [None]:
import os
import json
from openai import OpenAI
import requests
from dotenv import load_dotenv
from IPython.display import display, Markdown
import gradio as gr

In [None]:
# Initialize Ollama OpenAI client
# defaultdict belongs to the standard library; importing it from sklearn.base only
# worked because that module happens to import it, and needlessly required scikit-learn.
from collections import defaultdict


OLLAMA_BASE_URL = "http://localhost:11434/v1"
# The OpenAI client is pointed at the local Ollama endpoint; the api_key value is a
# placeholder string required by the constructor.
ollama = OpenAI(base_url=OLLAMA_BASE_URL, api_key='ollama')

USE_API = False  # Set to True to use OpenAI API instead of Ollama
# models[<backend>][<task>] -> list of model names offered in the UI (first one is the default)
models = defaultdict(dict)
if USE_API:
    # Pull OPENAI_API_KEY from a local .env file into the environment before building the client
    load_dotenv()
    OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
    openai = OpenAI()

    # Store lightweight models for API
    models["api"]["get_link_relevant"] = ["gpt-4.1-mini", "gpt-3.5-turbo", "llama-2-7b-chat"]
    models["api"]["create_brochure"] = ["gpt-3.5-turbo", "gpt-4.1-mini", "llama-2-7b-chat"]

else:
    # Store lightweight models for Ollama
    models["ollama"]["get_link_relevant"] = ["llama3.2", "gpt-oss", "llama2:7b"]
    models["ollama"]["create_brochure"] = ["gpt-oss", "llama2:7b"]
    

In [None]:
# Quick check that the local Ollama server answers; the timeout stops the cell from
# hanging indefinitely if the server accepts the connection but never responds.
requests.get("http://localhost:11434", timeout=5).content

In [None]:
from bs4 import BeautifulSoup
import requests

# Browser-like request headers sent with every page fetch
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/117.0.0.0 Safari/537.36"
    ),
}

def fetch_website_contents(url, timeout=10):
    """
    Return the title and contents of the website at the given url;
    truncate to 2,000 characters as a sensible limit

    Args:
        url (str): Address of the page to download.
        timeout (float): Seconds to wait on the server before giving up.
    Returns:
        str: The page title, a blank line, then the visible body text, cut to
        2,000 characters. An empty string when the request fails.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching the URL {url}: {e}")
        return ""

    soup = BeautifulSoup(response.content, "html.parser")
    # .string is None when <title> is empty or wraps nested tags; fall back to the
    # placeholder instead of failing on None + str below.
    title = (soup.title.string if soup.title else None) or "No title found"
    if soup.body:
        # Drop elements that carry no readable text before extracting it
        for tag in soup.body(["script", "style", "img", "input"]):
            tag.decompose()
        text = soup.body.get_text(separator="\n", strip=True)
    else:
        text = ""
    return (title + "\n\n" + text)[:2_000]
    
def fetch_website_links(url, timeout=10):
    """
    Return the links on the website at the given url
    I realize this is inefficient as we're parsing twice! This is to keep the code in the lab simple.
    Feel free to use a class and optimize it!

    Args:
        url (str): Address of the page to scan.
        timeout (float): Seconds to wait on the server before giving up.
    Returns:
        list: The non-empty href values of the page's <a> tags, in document order,
        exactly as written in the page (they may be relative). An empty list when
        the request fails.
    """
    try:
        response = requests.get(url, headers=headers, timeout=timeout)
    except requests.exceptions.RequestException as e:
        # Same best-effort policy as fetch_website_contents: report and carry on
        print(f"Error fetching the URL {url}: {e}")
        return []
    soup = BeautifulSoup(response.content, "html.parser")
    hrefs = (anchor.get("href") for anchor in soup.find_all("a"))
    # Anchors without an href (or with an empty one) are skipped
    return [href for href in hrefs if href]

In [None]:
# Define system and user prompts
# The embedded example has to be valid JSON with fully quoted keys: the reply is parsed
# with json.loads and its entries are read through the "type" and "url" keys, so a
# malformed example invites malformed replies.
link_system_prompt = """
You are an expert web content analyzer. Your task is to identify and extract links from a given webpage that are most relevant to the main topic of the page. You are provided with a list of links found on a webpage. Provide only the URLs of the relevant links (such as About page, Company page, Careers/Jobs pages) to include in a brochure about the company without any additional commentary.
You should return the links in a JSON object format as shown below:
{
    "relevant_links": [
        {"type": "About page", "url": "https://example.com/about"},
        {"type": "Careers page", "url": "https://example.com/careers"}
    ]
}
"""

def get_links_user_prompt(url):
    """
    Build the user message for link selection: the fixed instructions naming
    the page, followed by every link found on it, one per line.
    """
    page_links = fetch_website_links(url)
    instructions = (
        f"\nHere is the URL of the webpage: {url}\n"
        "Your task is to analyze the links on this page and identify those that are most relevant to the main topic of the page for inclusion in a company brochure.\n"
        "Please return the relevant links in the specified JSON format.\n"
        "Do not include Terms of Service, Privacy Policy, email links, or any other unrelated links.\n"
        "Links:\n"
    )
    return instructions + "\n".join(page_links)

In [None]:
def select_relevant_links(url, model):
    """
    Ask the chat model which links on the page at `url` belong in a brochure.
    Args:
        url (str): The url to select relevant links for.
        model (str): Name of the chat model to query.
    Returns:
        dict: The parsed JSON reply; callers read its "relevant_links" list.
        When the reply is missing, is not valid JSON, or is not a JSON object,
        {"relevant_links": []} is returned instead.
    """
    # Both backends are driven through the same chat-completions call; only the client differs.
    client = openai if USE_API else ollama
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": link_system_prompt},
            {"role": "user", "content": get_links_user_prompt(url)}
        ],
        response_format={"type": "json_object"}
    )
    content = response.choices[0].message.content
    try:
        relevant_links = json.loads(content)
    except (json.JSONDecodeError, TypeError):
        # TypeError covers a reply whose content is None
        print(f"Failed to parse JSON response: {content}")
        return {"relevant_links": []}
    if not isinstance(relevant_links, dict):
        # Valid JSON but not an object (e.g. a bare array): callers use .get(), so normalise
        print(f"Failed to parse JSON response: {content}")
        return {"relevant_links": []}
    return relevant_links

In [None]:
def fetch_page_and_relevant_links(url, model):
    """
    Fetch the page content and relevant links for the given URL.
    Args:
        url (str): The URL of the webpage.
        model (str): Name of the chat model used to pick the relevant links.
    Returns:
        str: The content of the webpage followed by the content of each relevant
        link, under "## Webpage Content:" and "## Relevant Links:" headings.
    """
    from urllib.parse import urljoin

    page_content = fetch_website_contents(url)
    relevant_links = select_relevant_links(url, model)

    # The selection comes from model output, so its shape is not guaranteed:
    # accept only a dict holding a list, and treat anything else as "no links".
    link_entries = relevant_links.get("relevant_links", []) if isinstance(relevant_links, dict) else []
    if not isinstance(link_entries, list):
        link_entries = []

    sections = [f"## Webpage Content:\n\n{page_content}\n\n## Relevant Links:\n\n"]
    for link in link_entries:
        # Skip entries that cannot be fetched (not a dict, or no usable url)
        if not isinstance(link, dict) or not isinstance(link.get("url"), str) or not link["url"]:
            continue
        sections.append(f"Link: {link.get('type', 'Unknown')}\n")
        # Resolve relative hrefs such as "/about" against the page; absolute URLs pass through unchanged
        sections.append(fetch_website_contents(urljoin(url, link["url"])))
        sections.append("\n\n")
    return "".join(sections)

In [None]:
# System prompt for the brochure-writing model (sent as the "system" message)
brochure_system_prompt = (
    "\n"
    "You are a skilled brochure writer. Your task is to create a compelling brochure for a company based on the provided webpage content and relevant links. Use the information to highlight the company's strengths, values, and offerings in an engaging manner.\n"
    "The brochure should be well-structured, informative, and persuasive, aiming to attract potential customers or clients.\n"
    "Respond in markdown format without code blocks.\n"
    "Include sections such as Introduction, About Us, Services/Products, Careers, and Contact Information where applicable.\n"
)

In [None]:
def get_brochure_user_prompt(company_name, url, model):
    """
    Compose the user message for brochure generation: instructions naming the
    company, followed by the scraped page and relevant-link content, capped at
    5000 characters in total.
    """
    site_material = fetch_page_and_relevant_links(url, model)
    instructions = (
        f"\n    You are to create a brochure for {company_name}.\n"
        "    Using the following webpage content and relevant links, create a compelling brochure for the company.\n"
        "    Ensure the brochure is well-structured and highlights the company's strengths, values, and offerings.\n"
        "    Use this information to build a short brochure of the company in markdown format without code blocks.\n"
        "    "
    )
    # Keep only the first 5000 characters so the prompt fits the model context
    return (instructions + site_material)[:5000]

In [None]:
def create_brochure(company_name, url, get_link_relevant_model, create_brochure_model):
    """
    Create a brochure for the given company using its webpage content and relevant links.
    This is a generator: the brochure is streamed from the model chunk by chunk.
    Args:
        company_name (str): The name of the company.
        url (str): The URL of the company's webpage.
        get_link_relevant_model (str): Model used to pick the relevant links.
        create_brochure_model (str): Model used to write the brochure.
    Yields:
        str: The brochure text accumulated so far, in markdown format.
    """
    # Both backends are driven through the same chat-completions call; only the client differs.
    client = openai if USE_API else ollama
    response = client.chat.completions.create(
        model=create_brochure_model,
        messages=[
            {"role": "system", "content": brochure_system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url, get_link_relevant_model)}
        ],
        stream=True
    )
    brochure = ""
    for chunk in response:
        # A streamed chunk may carry an empty choices list; indexing it would raise IndexError
        if not chunk.choices:
            continue
        delta = chunk.choices[0].delta
        if hasattr(delta, "content"):
            # content may be None on some chunks; treat that as "no new text"
            brochure += delta.content or ""
            yield brochure

In [None]:
import gradio as gr

# Select the model catalogue of the active backend; the first entry of each
# task's list is what the UI pre-selects.
source = "ollama" if not USE_API else "api"
get_link_relevant_models_default, create_brochure_models_default = (
    models[source]["get_link_relevant"][0],
    models[source]["create_brochure"][0],
)

def start_loading():
    """Return a Gradio update whose value is the "generating, please wait" notice."""
    # The literal had been saved as mojibake (UTF-8 bytes decoded with the wrong
    # codec); the intended hourglass and ellipsis characters are restored.
    return gr.update(value="⏳ *Generating brochure… Please wait…*")

def stop_loading(result):
    """Return an update that blanks the loading notice, paired with `result` passed through unchanged."""
    cleared_notice = gr.update(value="")
    return cleared_notice, result


# Gradio UI: an input card (company, URL, model pickers, button) next to an output
# card that shows the streamed brochure. The emoji and ellipsis characters in the
# labels had been saved as mojibake and are restored here.
with gr.Blocks(theme=gr.themes.Soft(), css="""
#header {
    text-align:center;
    font-size:32px;
    font-weight:700;
    padding: 10px;
    margin-bottom: 15px;
}
#subheader {
    text-align:center;
    font-size:16px;
    margin-bottom: 30px;
    color: #555;
}
.card {
    border: 1px solid #ddd;
    padding: 20px;
    border-radius: 12px;
    background: white;
    box-shadow: 0px 2px 8px rgba(0,0,0,0.05);
}
""") as demo:

    # ----- Header -----
    gr.HTML("<div id='header'>📄 AI Brochure Generator</div>")
    gr.HTML("<div id='subheader'>Generate a clean, formatted brochure from any company website.</div>")

    # ----- Input Card -----
    with gr.Row():
        with gr.Column(scale=1):
            with gr.Group(elem_classes="card"):
                company_name_input = gr.Textbox(label="🏢 Company Name", placeholder="Enter the company name")
                url_input = gr.Textbox(label="🔗 Company URL", placeholder="Enter the company website URL")

                get_link_relevant_model_selector = gr.Dropdown(
                    label="🔍 Model for Extracting Relevant Links",
                    choices=models[source]["get_link_relevant"],
                    value=get_link_relevant_models_default,
                )

                create_brochure_model_selector = gr.Dropdown(
                    label="📝 Model for Creating Brochure",
                    choices=models[source]["create_brochure"],
                    value=create_brochure_models_default,
                )

                generate_button = gr.Button("✨ Generate Brochure", variant="primary")

                loading_indicator = gr.Markdown("")

        # ----- Output Card -----
        with gr.Column(scale=2):
            with gr.Group(elem_classes="card"):
                output_area = gr.Markdown(label="Generated Brochure")

    # ----- Examples Section -----
    gr.Examples(
        examples=[
            ["Hugging Face", "https://huggingface.co", get_link_relevant_models_default, create_brochure_models_default],
            ["Inria", "https://www.inria.fr/en/", get_link_relevant_models_default, create_brochure_models_default],
            ["OpenAI", "https://www.openai.com/", get_link_relevant_models_default, create_brochure_models_default]
        ],
        inputs=[company_name_input, url_input, get_link_relevant_model_selector, create_brochure_model_selector],
        label="Example Inputs"
    )

    # ----- Interactivity -----
    # Show "loading…" indicator
    generate_button.click(start_loading, outputs=loading_indicator, queue=False)

    # Run brochure generator (create_brochure is a generator, so the output area
    # is refreshed with each partial brochure it yields)
    run_event = generate_button.click(
        fn=create_brochure,
        inputs=[company_name_input, url_input, get_link_relevant_model_selector, create_brochure_model_selector],
        outputs=[output_area],
    )

    # Hide loading message after done
    run_event.then(
        stop_loading,
        inputs=output_area,
        outputs=[loading_indicator, output_area],
    )

# Login credentials are read from the environment so they are never stored in the
# notebook. Set BROCHURE_APP_USER and BROCHURE_APP_PASSWORD to enable the login
# prompt; when either is missing the app is launched without authentication.
_auth_user = os.getenv("BROCHURE_APP_USER")
_auth_password = os.getenv("BROCHURE_APP_PASSWORD")
demo.launch(
    inbrowser=True,
    auth=(_auth_user, _auth_password) if _auth_user and _auth_password else None,
)
