# Ollama Web Scraping Tool

This tool uses Playwright together with Ollama's function calling to give the LLM a web scraper that it can invoke on its own.

In [None]:
# Make sure Ollama is installed: download the install script from ollama.com and pipe it to sh
!curl -fsSL https://ollama.com/install.sh | sh

In [None]:
% pip install ollama playwright beautifulsoup4 playwright-stealth

# install required browsers
% playwright install

Once the browsers are installed, you can import the libraries and define the web scraper tool. The tool uses Playwright because it can render JavaScript-heavy pages and, combined with playwright-stealth, get past standard bot-detection systems.

In [2]:
# Import necessary libraries
import json
import ollama
import asyncio
from playwright.async_api import async_playwright
from playwright_stealth import stealth_async
from bs4 import BeautifulSoup
import traceback

# Web Scraper class definition
class WebScraper:
    """Fetch a rendered web page with Playwright and extract titled, linked sections.

    Attributes:
        headless: Whether the browser is launched without a visible window.
        browser_type: Name of the Playwright browser launcher to use
            (looked up as an attribute on the Playwright object).
        chunk_size: Stored on the instance; not read by any method of this class.
        max_tokens: Stored on the instance; not read by any method of this class.
    """

    def __init__(self, headless=True, browser_type="chromium", chunk_size=256, max_tokens=1000):
        """Store the scraper configuration; no browser is started here."""
        self.headless = headless
        self.browser_type = browser_type
        self.chunk_size = chunk_size
        self.max_tokens = max_tokens

    async def scrape_page(self, url: str) -> str:
        """Open *url* in a fresh browser and return the page's HTML.

        Args:
            url: Address of the page to load.

        Returns:
            The HTML of the page as reported by Playwright after navigation.

        Raises:
            Any exception raised by Playwright while launching the browser or
            loading the page; the browser is closed before it propagates.
        """
        async with async_playwright() as p:
            browser = await getattr(p, self.browser_type).launch(
                headless=self.headless,
                args=["--disable-gpu", "--no-sandbox"]
            )
            try:
                context = await browser.new_context()
                page = await context.new_page()

                # Apply the stealth patches before navigating so they are
                # in place when the page's own scripts run.
                await stealth_async(page)
                await page.goto(url)

                html_content = await page.content()
            finally:
                # Always release the browser, even when navigation fails.
                await browser.close()
        return html_content

    def extract_titles_articles_links(self, raw_html: str) -> list:
        """Collect title/link/content triples from container elements in *raw_html*.

        Every ``article``, ``section`` and ``div`` element is inspected. A
        container contributes an entry only when it contains a heading
        (``h1``-``h6``), an anchor with an ``href``, and non-empty text.

        NOTE(review): nested containers are each visited, so an inner block's
        text can appear again inside the entry of its enclosing block.

        Args:
            raw_html: HTML markup to parse.

        Returns:
            A list of dicts with the keys ``'title'``, ``'link'`` and
            ``'content'``, in document order of the matching containers.
        """
        soup = BeautifulSoup(raw_html, 'html.parser')
        extracted_data = []

        for article in soup.find_all(['article', 'section', 'div']):
            # First heading and first link found anywhere inside this container.
            title_tag = article.find(['h1', 'h2', 'h3', 'h4', 'h5', 'h6'])
            link_tag = article.find('a', href=True)
            content = article.get_text(separator="\n", strip=True)

            if title_tag and link_tag and content:
                extracted_data.append({
                    'title': title_tag.get_text(strip=True),
                    'link': link_tag['href'],
                    'content': content
                })

        return extracted_data

    async def query_page_content(self, url: str) -> dict:
        """Scrape *url* and return both the extracted entries and the raw HTML.

        Args:
            url: Address of the page to scrape.

        Returns:
            A dict with the keys ``'url'``, ``'extracted_data'`` (the result of
            ``extract_titles_articles_links``) and ``'raw_html'``.
        """
        raw_html = await self.scrape_page(url)
        structured_data = {
            "url": url,
            "extracted_data": self.extract_titles_articles_links(raw_html),
            "raw_html": raw_html
        }
        return structured_data


Once Playwright has retrieved all of the HTML on the given page, the HTML is parsed with Beautiful Soup to make the response a bit more workable. Some models with small context windows may struggle with this method. Llama 3.1 8B works very well, but command-r-plus, for example, cannot use this tool because the HTML response will always be too large.

In [3]:
async def query_web_scraper(url: str) -> dict:
    """Scrape *url* in a visible (non-headless) browser and return the structured result.

    This is the coroutine exposed to the model as a tool; it returns whatever
    ``WebScraper.query_page_content`` produces for the URL.
    """
    visible_scraper = WebScraper(headless=False)
    page_data = await visible_scraper.query_page_content(url)
    return page_data

async def write_raw_html_to_file(raw_html: str, filename: str = "scraped_content.html"):
    """Save *raw_html* to *filename* as UTF-8 text and print a confirmation.

    Any existing file at *filename* is overwritten. The write itself is a
    plain blocking call; the function is a coroutine only so that it can be
    awaited alongside the scraper.
    """
    with open(filename, mode="w", encoding="utf-8") as html_file:
        html_file.write(raw_html)

    confirmation = f"Raw HTML content has been written to {filename}"
    print(confirmation)


`query_web_scraper` runs the Playwright scrape and returns the structured result, and `write_raw_html_to_file` saves the raw HTML to disk so you can inspect it.

Now you can implement the multistep tool call to have the model initiate the scraping and use the results to create a structured JSON response:

In [4]:
# Name of the Ollama model that drives the tool call
model = 'llama3.1'

# Schema describing the scraper to the model: one function that takes a
# single required string argument, the URL to scrape.
scraper_tool_spec = {
    'type': 'function',
    'function': {
        'name': 'query_web_scraper',
        'description': 'Scrapes the content of a web page and returns the structured JSON object with titles, articles, and associated links.',
        'parameters': {
            'type': 'object',
            'properties': {
                'url': {
                    'type': 'string',
                    'description': 'The URL of the web page to scrape.',
                },
            },
            'required': ['url'],
        },
    },
}

# System prompt that steers the model towards JSON-only answers
system_message = {
    'role': 'system',
    'content': 'You are an AI assistant specialized in processing web content and returning structured JSON data. Always provide your response as valid, well-formatted JSON without any additional text or comments. Focus on extracting and organizing the most relevant information from websites, including main sections, key services or products, and primary navigation links.'
}

# The user's request, naming the page that should be scraped
user_message = {
    'role': 'user',
    'content': 'Please scrape the content of https://ollama.com/blog/tool-support and provide a structured JSON response of all the titles and links on the page. After scraping, focus on the most important and relevant information.'
}

# The conversation starts with the system prompt followed by the user query
messages = [system_message, user_message]

# First API call: send the conversation together with the tool description
# so the model can decide whether to call the scraper
response = ollama.chat(
    model=model,
    messages=messages,
    tools=[scraper_tool_spec],
)

# Append the model's response to the existing messages
messages.append(response['message'])

# Check if the model decided to use the provided function
if not response['message'].get('tool_calls'):
    print("The model didn't use the function. Its response was:")
    print(response['message']['content'])
else:
    # Process function calls made by the model
    scraped_data = None
    available_functions = {'query_web_scraper': query_web_scraper}

    for tool in response['message']['tool_calls']:
        function_name = tool['function']['name']
        function_to_call = available_functions[function_name]
        function_args = tool['function']['arguments']
        scraped_data = await function_to_call(function_args['url'])  # Use await for async function call
        
        print(f"Function '{function_name}' was called with the URL: {function_args['url']}")
        
        # Write raw HTML to file
        await write_raw_html_to_file(scraped_data['raw_html'])
        
        # Add function response to the conversation
        messages.append({
            'role': 'function',
            'name': function_name,
            'content': json.dumps(scraped_data),
        })

    if scraped_data:
        # Additional instruction to ensure proper use of scraped data
        additional_instruction = {
            'role': 'user',
            'content': f"""Here's the scraped data from the website:
            
            {json.dumps(scraped_data, indent=2)}
            
            Using this scraped data, create a structured JSON response that includes only the most relevant and important information from the website.
            Ignore head section. Focus on the main body section. Do not include HTML tags or unnecessary details.
            Ensure your response is in valid JSON format without any additional text or comments. Make sure you dont return empty JSON. The structure should match the information you are scraping."""
        }
        messages.append(additional_instruction)

        # Final API call: Get structured JSON response from the model
        final_response = ollama.chat(model=model, messages=messages)
        print(final_response['message']['content'])
    else:
        print("No data was scraped. Unable to proceed with creating a structured JSON response.")


Function 'query_web_scraper' was called with the URL: https://ollama.com/blog/tool-support
Raw HTML content has been written to scraped_content.html
{
  "title": "Tool Support",
  "description": "Bring your ideas to life with Ollama's tool support.",
  "section": {
    "h2": "What is a tool?",
    "p1": "A tool in Ollama allows you to take action directly on the model output. Each time a model processes an input, it can return one or more tools that can be used to further process the output.",
    "code": [
      {
        "lang": "python",
        "line": "ollama=ollama.Ollama('https://api.ollama.com/v1',\n\tmessages=messages,\n\ttools=tools,\n)"
      }
    ],
    "h3": "Full examples",
    "ul": [
      {
        "li": [
          {
            "href": "https://github.com/ollama/ollama-python/blob/main/examples/tools/main.py",
            "text": "Python"
          },
          {
            "href": "https://github.com/ollama/ollama-js/blob/main/examples/tools/tools.ts",
           