# Scrape websites to create document retrieval stores.

## Additional requirements:

```bash
pip install humanize pdfreader url_normalize tabulate unstructured langchain_community tqdm
```

In [None]:
# !pip install humanize pdfreader url_normalize tabulate unstructured langchain_community tqdm

In [None]:
from pathlib import Path
import sys

# Add the directory four levels above the current working directory to the
# import path (presumably the repository root when this notebook is run from
# its own folder -- confirm).
sys.path.append(str(Path().cwd().parent.parent.parent.parent.resolve()))

In [None]:
from trulens.core.utils.keys import check_keys

# NOTE(review): presumably verifies that OPENAI_API_KEY is available before the
# embedding cells below run -- confirm against trulens' `check_keys`.
check_keys("OPENAI_API_KEY")

In [None]:
import datetime
import io
from multiprocessing import Event
from pathlib import Path
from queue import Queue
import sqlite3
import tempfile
from threading import Thread
from time import sleep
from typing import Callable, Iterable, Sequence, Union
from urllib.parse import urljoin
from urllib.parse import urlparse

from bs4 import BeautifulSoup
import humanize
from langchain.document_loaders import PagedPDFSplitter
from langchain.document_loaders import UnstructuredHTMLLoader
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain_community.vectorstores import Pinecone
import numpy as np
import pdfreader
import pinecone
import requests
from tqdm.auto import tqdm
from trulens.core.utils.containers import first
from trulens.core.utils.text import UNICODE_CHECK
from url_normalize import url_normalize

# Individual entry points for the crawl.
TRUERA_BASE_URL = "https://truera.com/"
TRUERA_DOC_URL = "https://docs.truera.com/1.34/public/"
TRUERA_SUPPORT_URL = "https://support.truera.com/hc/en-us/"
TRUERA_BLOG_URL = "https://truera.com/ai-quality-blog/"
TRULENS_URL = "https://trulens.org/"

# All seed URLs; this list is what gets passed to `WebScrape.scrape` below.
TRUERA_URLS = [
    TRUERA_BASE_URL,
    TRUERA_DOC_URL,
    TRUERA_SUPPORT_URL,
    TRUERA_BLOG_URL,
    TRULENS_URL,
]

In [None]:
class WebScrape:
    """Small multi-threaded link crawler that caches pages in a SQLite file.

    Pages are downloaded with `requests`, stored in the `page` table keyed by
    URL, and links found in HTML anchors or PDF link annotations are queued
    for crawling when they pass the configured filter. `get_documents` turns
    the stored pages into langchain documents.

    NOTE: This is not a serious scraper for large crawls.
    """

    # Name of the single table holding downloaded pages.
    TABLE_PAGES = "page"

    # Size in bytes (100 MiB) used as the "large content" threshold.
    LARGE_CONTENT_BYTES = 100 * (1024**2)

    # Stored pages that `get_documents` never converts into documents.
    SKIPPED_DOCUMENT_URLS = frozenset({
        "https://truera.com/resources/",
        "https://truera.com/ai-quality-blog/",
        "https://truera.com/event/live-events/",
        "https://truera.com/ai-quality-research/ai-quality-education/",
        "https://medium.com/trulens/archive",
    })

    def __init__(
        self,
        filename: Path = Path("scrape.sqlite"),
        n_threads: int = 8,
        filters: Union[str, Iterable[str], Callable[[str], bool], None] = None,
    ):
        """
        Web document downloader. Walks over links, collecting documents.

        NOTE: This is not a serious scraper for large crawls.

        Args:
            filename: SQLite database file used to store downloaded pages.
            n_threads: Number of worker threads started by `scrape`.
            filters: Decides which discovered links are followed. A string
                keeps links containing it, an iterable of strings keeps links
                containing any of them, and a callable is used as the
                predicate directly. `None` follows every http(s) link, so
                pass a filter to bound a crawl.

        Raises:
            TypeError: If `filters` is none of the supported kinds.
        """

        self.filename = filename
        self.n_threads = n_threads
        self._create_tables()

        if filters is None:

            def filter_func(url):
                return True

        elif isinstance(filters, str):

            def filter_func(url):
                return filters in url

        elif isinstance(filters, Iterable):
            # Materialize once so one-shot iterables (generators) keep working
            # on every call.
            patterns = tuple(filters)

            def filter_func(url):
                return any(pattern in url for pattern in patterns)

        elif callable(filters):
            filter_func = filters
        else:
            raise TypeError(f"Unhandled filters type {type(filters)}")

        self.filter_func = filter_func

    @staticmethod
    def custom_normalize(url, base_url=None):
        """Normalize `url`, optionally resolving it against `base_url`.

        `tel:` URLs are returned untouched. Without a base, the query and
        fragment are stripped before normalization. With a base, the base has
        its query and fragment stripped and is given a trailing slash, so
        relative links resolve beneath it; the joined URL itself keeps its
        query and fragment.
        """
        if url.startswith("tel:"):
            return url

        if base_url is not None:
            base_url = url_normalize(
                urlparse(base_url)._replace(fragment="", query="").geturl()
            )
            if not base_url.endswith("/"):
                base_url += "/"
            url = urljoin(base_url, url)
        else:
            url = urlparse(url)._replace(fragment="", query="").geturl()

        return url_normalize(url)

    def cursor(self):
        """Open a new connection to the database.

        Returns:
            A `(cursor, connection)` pair. The caller owns both and should
            close the connection when done.
        """
        connection = sqlite3.connect(self.filename)
        cursor = connection.cursor()
        return cursor, connection

    def _create_tables(self):
        """Create the pages table if it does not exist yet."""
        c, conn = self.cursor()
        try:
            c.execute(
                f"""
                CREATE TABLE IF NOT EXISTS {WebScrape.TABLE_PAGES} (
                    url VARCHAR(128),
                    type VARCHAR(64),
                    retrieved INTEGER,
                    content BYTES,
                    PRIMARY KEY (url)
                )
            """
            )
            conn.commit()
        finally:
            c.close()
            conn.close()

    def get_urls(self) -> Iterable[str]:
        """Return the URL of every stored page."""
        c, conn = self.cursor()
        try:
            c.execute(
                f"""
                SELECT url
                FROM {WebScrape.TABLE_PAGES}
                """
            )
            rows = c.fetchall()
        finally:
            c.close()
            conn.close()

        return [row[0] for row in rows]

    def get_page(self, url: str) -> Union[tuple, None]:
        """Return the stored row `(url, type, retrieved, content)` for `url`.

        Returns `None` when the page has not been stored.
        """
        c, conn = self.cursor()
        try:
            c.execute(
                f"""
                SELECT * 
                FROM {WebScrape.TABLE_PAGES} 
                WHERE url=?""",
                (url,),
            )
            return c.fetchone()
        finally:
            c.close()
            conn.close()

    def request(self, url: str, timeout: float = 30.0):
        """Issue a streaming GET for `url`.

        Streaming lets the caller inspect headers before the body is read.
        `timeout` (seconds) keeps a stalled server from blocking a worker
        forever. The caller is responsible for closing the response.
        """
        return requests.get(url, stream=True, timeout=timeout)

    def delete_page(self, url: str):
        """Remove the stored page for `url`, if any, and report it."""
        c, conn = self.cursor()
        try:
            c.execute(
                f"""
                DELETE FROM {WebScrape.TABLE_PAGES}
                WHERE url=?
                """,
                (url,),
            )
            conn.commit()
        finally:
            c.close()
            conn.close()

        print(f"page {url} deleted")

    def insert_page(self, url: str, type: str, content: bytes):
        """Store (or overwrite) a page with the current time as `retrieved`.

        Args:
            url: Primary key of the page.
            type: Content type as reported by the server. The name shadows
                the builtin but is kept because callers pass it by keyword.
            content: Raw response body.
        """
        retrieved = datetime.datetime.now().timestamp()
        size = len(content)

        c, conn = self.cursor()
        try:
            c.execute(
                f"""
                INSERT OR REPLACE 
                INTO {WebScrape.TABLE_PAGES} 
                VALUES (?, ?, ?, ?)""",
                (url, type, retrieved, content),
            )
            conn.commit()
        finally:
            c.close()
            conn.close()

        print(
            f"{UNICODE_CHECK} page {type} {humanize.naturalsize(size)} {url} -> {self.filename}"
        )

    def scrape(self, url: Union[str, Sequence[str]], redownload: bool = False):
        """Crawl starting from `url` until no queued or in-flight work remains.

        Args:
            url: A seed URL or a sequence of seed URLs. Seeds are crawled
                without being checked against the filter.
            redownload: Fetch pages again even if they are already stored.
        """
        q = Queue(maxsize=1024 * 1024)

        if isinstance(url, str):
            q.put((url, None))
        elif isinstance(url, Sequence):
            for u in url:
                q.put((u, None))

        stopped = Event()
        stopped.clear()

        scraped = set()
        threads = []

        for _ in range(self.n_threads):
            thread = Thread(
                target=self._scrape,
                kwargs=dict(
                    queue=q,
                    redownload=redownload,
                    scraped=scraped,
                    stopped=stopped,
                ),
            )
            thread.start()
            threads.append(thread)

        # `Queue.join` returns only once every queued item has been marked
        # done, so it also waits for pages that are still being processed and
        # may enqueue more links. Waiting on it in a helper thread lets this
        # thread keep printing progress.
        drained = Thread(target=q.join, daemon=True)
        drained.start()

        try:
            while True:
                drained.join(timeout=1)
                if not drained.is_alive():
                    break
                print("queue size:", q.qsize())

            print("queue empty")
        finally:
            # Always release the workers, even if waiting was interrupted.
            stopped.set()
            for thread in threads:
                thread.join()

    def _scrape(
        self, queue: Queue, stopped: Event, redownload: bool, scraped: set
    ):
        """Worker loop: process queued `(url, from_url)` items until stopped."""
        # Local import: the surrounding file only imports `Queue`.
        from queue import Empty

        while not stopped.is_set():
            try:
                # A timed get avoids blocking forever when another worker
                # takes the last item first.
                url, from_url = queue.get(timeout=1)
            except Empty:
                continue

            try:
                self._process_url(
                    url=url,
                    from_url=from_url,
                    queue=queue,
                    redownload=redownload,
                    scraped=scraped,
                )
            except Exception as e:
                # One bad page must not kill the worker thread.
                print(f"WARNING: {url} from {from_url}: {e}")
            finally:
                queue.task_done()

    def _process_url(
        self, url: str, from_url, queue: Queue, redownload: bool, scraped: set
    ):
        """Fetch or load one page and enqueue the links found in it."""
        url = WebScrape.custom_normalize(url)

        if url in scraped:
            return

        scraped.add(url)

        page = self.get_page(url)

        if page is None or redownload:
            fetched = self._download(url, from_url)
            if fetched is None:
                return
            ctype, content = fetched
        else:
            ctype, content = page[1], page[3]

        size = len(content)
        if size > WebScrape.LARGE_CONTENT_BYTES:
            print(
                f"WARNING: {url} from {from_url}: is large: {humanize.naturalsize(size)}"
            )

        if ctype.startswith("text/html"):
            soup = BeautifulSoup(content, "html.parser")
            sub_urls = [a.get("href") for a in soup.find_all("a")]
        elif ctype.startswith("application/pdf"):
            sub_urls = WebScrape._pdf_links(content)
        else:
            print(
                f"WARNING: {url} from {from_url}: unknown content type {ctype}"
            )
            return

        for sub_url in sub_urls:
            if sub_url is None or sub_url in scraped:
                continue

            if sub_url.startswith("tel:"):
                scraped.add(sub_url)
                continue

            try:
                sub_url = WebScrape.custom_normalize(sub_url, base_url=url)
                scheme = urlparse(sub_url).scheme
            except Exception as e:
                print(f"WARNING: {sub_url} from {url}: {e}")
                scraped.add(sub_url)
                continue

            if sub_url in scraped:
                continue

            # `urlparse` reports a missing scheme as "", never None.
            if not scheme:
                print(f"WARNING: {sub_url} from {url}: no scheme")
                scraped.add(sub_url)
                continue

            if scheme not in ("http", "https"):
                scraped.add(sub_url)
                continue

            if self.filter_func(sub_url):
                queue.put((sub_url, url))
            else:
                # Remember rejected links so they are not re-checked.
                scraped.add(sub_url)

    def _download(self, url: str, from_url):
        """Download `url` and store it.

        Returns:
            `(content_type, content)`, or `None` if the page was skipped
            (request error, bad status, missing content type, too large, or
            an image).
        """
        try:
            res = self.request(url)
        except Exception as e:
            print(f"WARNING: {url} from {from_url}: {e}")
            return None

        try:
            if not res.ok:
                print(f"WARNING: {url} from {from_url}: {res.status_code}")
                return None

            if "content-type" not in res.headers:
                print(
                    f"WARNING: {url} from {from_url} lacks needed headers:\n{list(res.headers.keys())}"
                )
                return None

            ctype = res.headers["content-type"]

            if "content-length" in res.headers:
                size = int(res.headers["content-length"])
                if size > WebScrape.LARGE_CONTENT_BYTES:
                    print(
                        f"WARNING: {url} from {from_url} is large {humanize.naturalsize(size)}"
                    )
                    return None  # skipping

            if ctype.startswith("image/"):
                return None  # skipping

            content = res.content
        finally:
            # The request is streamed, so close it explicitly, including on
            # the skip paths where the body is never read.
            res.close()

        self.insert_page(url=url, type=ctype, content=content)
        return ctype, content

    @staticmethod
    def _pdf_links(content: bytes) -> list:
        """Return the http(s) URIs of link annotations exposed by the viewer."""
        links = []

        with io.BytesIO(content) as fh:
            pdf = pdfreader.SimplePDFViewer(fh)

            if pdf.annotations is not None:
                for annot in pdf.annotations:
                    if annot.Subtype != "Link":
                        continue
                    sub_url = annot.A.URI
                    if sub_url is None:
                        continue
                    sub_url = sub_url.decode("ascii")
                    if sub_url.startswith("http"):
                        links.append(sub_url)

        return links

    @staticmethod
    def _is_excluded_document(url: str) -> bool:
        """Tell whether a stored page is deliberately left out of documents."""
        return (
            url in WebScrape.SKIPPED_DOCUMENT_URLS
            or "/page/" in url
            or "/category/" in url
            or ("Datasheet" in url and url.endswith(".pdf"))
            or url.startswith("https://pypi.org/project/")
            or "trulens" in url  # temporarily skipping anything with trulens
        )

    def get_documents(self):
        """Convert stored HTML and PDF pages into langchain documents.

        Pages stored under a non-canonical URL are deleted, excluded URLs and
        other content types are skipped, and documents whose text was already
        seen are dropped. Each document's `source` metadata is set to its URL.

        Returns:
            List of loaded documents.

        Raises:
            ValueError: If a stored page has missing or empty content.
        """
        docs = []

        # Maps document text to the first URL it was seen at.
        seen_texts = dict()

        for url in tqdm(list(self.get_urls())):
            canon_url = WebScrape.custom_normalize(url)
            if url != canon_url:
                self.delete_page(url=url)
                continue

            if WebScrape._is_excluded_document(url):
                print("skipping", url)
                continue

            row = self.get_page(url=url)
            ctype = row[1]

            if ctype.startswith("text/html"):
                loader = UnstructuredHTMLLoader
            elif ctype.startswith("application/pdf"):
                loader = PagedPDFSplitter
            else:
                # Images and any other content types are not loaded.
                continue

            content = row[3]
            if content is None:
                raise ValueError(url)
            size = len(content)
            if size == 0:
                raise ValueError(f"empty: {url}")
            if size >= WebScrape.LARGE_CONTENT_BYTES:
                print(
                    f"WARNING: big content {url} {humanize.naturalsize(size)}"
                )

            # The loaders read from a path, so spill the content to a
            # temporary file that is removed when the block exits.
            with tempfile.NamedTemporaryFile(mode="bw") as file:
                file.write(content)
                file.flush()

                try:
                    new_docs = loader(file.name).load()
                except Exception as e:
                    print(f"WARNING: {url} {ctype} {e}")
                    continue

            # Consider every loaded document (e.g. each PDF page), not just
            # the last one.
            for new_doc in new_docs:
                new_doc.metadata["source"] = url

                cont = new_doc.page_content
                if cont in seen_texts:
                    continue
                seen_texts[cont] = url

                docs.append(new_doc)

        return docs


# Follow only links that mention TruEra or TruLens, and never links that
# mention one of the listed hosts.
s = WebScrape(
    filters=lambda url: any(word in url for word in ("truera", "trulens"))
    and not any(
        blocked in url
        for blocked in (
            "github.com",
            "support.truera.com",
            "cbinsights.com",
            "files.pythonhosted.org",
            "libraries.io",
        )
    )
)

# Start the crawl from the seed URLs.
s.scrape(TRUERA_URLS)

In [None]:
# Convert the stored pages into documents.
docs = s.get_documents()

In [None]:
# Split the documents into chunks with a configured size of 512 and no
# overlap. The NLTK-based splitter below is kept as an alternative.
# text_splitter = NLTKTextSplitter(chunk_size=1024, chunk_overlap=0)
text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=0)
chunks = text_splitter.split_documents(docs)

In [None]:
# Keep only chunks whose text is at least 256 characters long.
big_chunks = [c for c in chunks if len(c.page_content) >= 256]

In [None]:
# Record the metadata last seen for each chunk text. The duplicate check below
# is commented out, so `unique_chunks` currently ends up holding every chunk
# from `big_chunks`, duplicates included.
seen_content = dict()
unique_chunks = []
for chunk in big_chunks:
    content = chunk.page_content
    # if content in seen_content:
    # print(f"{chunk.metadata} already seen in {seen_content[content]}")
    # continue
    seen_content[content] = chunk.metadata
    unique_chunks.append(chunk)

In [None]:
# Show the chunk with the shortest text (the first one on ties).
print(min(unique_chunks, key=lambda c: len(c.page_content)))

# Report how many chunks there are.
print(len(unique_chunks))

In [None]:
# Embedding model used for both vector stores below.
embedding = OpenAIEmbeddings(model="text-embedding-ada-002")  # 1536 dims

In [None]:
from langchain.vectorstores import DocArrayHnswSearch

In [None]:
# Which chunks to write to the vector stores. Switch between `big_chunks` and
# `unique_chunks` to compare their respective drawbacks.

output_chunks = big_chunks  # unique_chunks

# To DocArrayHnswSearch

This is a local document store and retriever that requires no additional API keys.

In [None]:
# Build the index from the selected chunks in the `hnswlib_trubot` working
# directory. `n_dim` matches the 1536-dimensional embedding configured above,
# and `max_elements` leaves 10% headroom over the current chunk count.
db = DocArrayHnswSearch.from_documents(
    output_chunks,
    embedding,
    work_dir="hnswlib_trubot",
    n_dim=1536,
    max_elements=int(len(output_chunks) * 1.1),
)
# Alternative that constructs the store from parameters only, without passing
# documents:
# db = DocArrayHnswSearch.from_params(
#    embedding=embedding,
#    work_dir='hnswlib_trubot',
#    n_dim=1536,
#    max_elements=int(len(output_chunks) * 1.1)
# )

In [None]:
# Smoke-test the store: print the metadata and text of each hit for a sample
# query, framed by separator lines.
for doc in db.similarity_search("Who is Shayak?"):
    print("====", doc.metadata, doc.page_content, "====", sep="\n")

# To Pinecone:

In [None]:
import os

# NOTE(review): presumably verifies that both Pinecone settings are available
# before they are read below -- confirm against trulens' `check_keys`.
check_keys("PINECONE_API_KEY", "PINECONE_ENV")

# Configure the Pinecone client from the environment.
pinecone.init(
    api_key=os.environ.get("PINECONE_API_KEY"),  # find at app.pinecone.io
    environment=os.environ.get("PINECONE_ENV"),  # next to api key in console
)

In [None]:
# Create / upload an index of the docs to Pinecone.

index_name = "llmdemo"
# To start from a fresh index, uncomment the two lines below. The dimension
# matches the 1536-dimensional embedding configured above.
# pinecone.delete_index(index_name)
# pinecone.create_index(index_name, dimension=1536)
Pinecone.from_documents(output_chunks, embedding, index_name=index_name)