## Web Scrape Script

This Colab notebook scrapes a website and extracts its data:
- Scrapes the website using its sitemap.
- Extracts page content, removing HTML tags, JavaScript, and other markup.
- Creates text chunks from the page-content paragraphs.

In [None]:
%%shell
# Install chromium and chromium-driver.
# The %%shell cell magic has to be the very first line of the cell; anything
# above it (even a comment) makes IPython reject the cell.
# apt-get -qq update
apt-get -qq install chromium chromium-driver

Install the necessary Python libraries.

In [None]:
# Install Selenium, requests and Beautiful Soup; -q keeps pip's output quiet.
%pip install -q  selenium requests beautifulsoup4

In [None]:
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

Load the Chrome web driver used to crawl the website.

In [None]:
# Driver service pointing at the chromedriver binary on this machine.
service = Service(executable_path=r'/usr/bin/chromedriver')

# Command-line switches handed to Chrome when it is launched.
options = webdriver.ChromeOptions()
for _chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
  options.add_argument(_chrome_flag)

# Custom User-Agent string passed to Chrome as a launch argument.
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36'
options.add_argument(f'user-agent={user_agent}')

# Shared browser instance used by the crawling helpers in this notebook.
browser = webdriver.Chrome(service=service, options=options)

- Crawl [ElevenLabs Blog](https://elevenlabs.io/blog/).
- Extract the content of the latest 10 blog articles.

In [None]:
# Domain to crawl, and the blog / sitemap URLs derived from it.
website = 'elevenlabs.io'
websit_url = f'https://{website}/blog'
sitemap = f'{websit_url}/sitemap-posts.xml'

In [None]:
def find_linked_pages(soup, domain=None):
  """Return the unique on-site page URLs linked from a parsed page.

  Every `<a>` tag in `soup` is inspected. A link is kept only when it is an
  absolute http(s) URL whose host is `domain` or one of its subdomains.
  Matching on the parsed host (rather than on a substring of the whole link)
  keeps out off-site URLs that merely mention the domain in their path or
  query string, `mailto:` addresses, and look-alike hosts.

  Args:
    soup: Parsed page exposing `find_all`, as BeautifulSoup does.
    domain: Host name to keep links for. Defaults to the notebook-level
      `website` value.

  Returns:
    List of href strings in first-seen order, without duplicates.
  """
  target = (domain if domain is not None else website).lower()

  # A dict is used as an ordered set: it drops duplicates while keeping the
  # order in which links appear on the page.
  pages = {}
  for tag in soup.find_all(['a']):
    href = tag.get('href')
    if href in ['#', '/', '', None]:
      continue
    try:
      parts = urlparse(href)
      host = parts.hostname or ''
    except ValueError:
      # Malformed URL (e.g. a broken IPv6 literal): nothing to crawl.
      continue
    if parts.scheme not in ('http', 'https'):
      continue
    if host == target or host.endswith('.' + target):
      pages.setdefault(href, None)

  return list(pages)

In [None]:
def soup_by_url(url):
  """Open `url` in the shared Selenium browser and parse the loaded page.

  Args:
    url: Address to navigate the browser to.

  Returns:
    A BeautifulSoup object built from the browser's page source with the
    'html.parser' backend.
  """
  browser.get(url)
  # Selenium hands the page source to Beautiful Soup
  rendered_html = browser.page_source
  return BeautifulSoup(rendered_html, 'html.parser')

In [None]:
def extract_page_content(soup):
  """Collect the readable text of a parsed page as one space-joined string.

  Text is taken from `h1`, `p`, `ul`, `li`, `pre` and `button` elements. Only
  the outermost matching element is used: an `li` inside a `ul` (or a `p`
  inside an `li`) is already covered by its ancestor's text, so emitting it
  again would duplicate the content. Elements with no text are skipped.

  Args:
    soup: Parsed page exposing `find_all`, as BeautifulSoup does.

  Returns:
    The extracted text fragments joined with single spaces; an empty string
    when nothing matches.
  """
  content_tags = ['h1', 'p', 'ul', 'li', 'pre', 'button']

  fragments = []
  for element in soup.find_all(content_tags):
    # Nested matches are skipped; their text is part of the ancestor's text.
    if element.find_parent(content_tags) is not None:
      continue
    # A space separator keeps adjacent child texts (e.g. list items) apart.
    text = element.get_text(' ', strip=True)
    if text:
      fragments.append(text)
  return ' '.join(fragments)

In [None]:
# Load the sitemap and collect the page links found on it.
soup = soup_by_url(sitemap)
links = find_linked_pages(soup)
print(f"Linked pages: {links}")
documents = []
# Visit every linked page and extract its text content.
# NOTE(review): in the lines visible here, page_content is never added to
# `documents` — confirm it is stored further down in this cell.
for link in links:
  soup = soup_by_url(link)
  page_content = extract_page_content(soup)
