In [None]:
from pathlib import Path
from urllib.parse import urljoin, urlsplit

import pinecone
import requests
from langchain.document_loaders import (PagedPDFSplitter, TextLoader,
                                        UnstructuredHTMLLoader,
                                        UnstructuredMarkdownLoader,
                                        UnstructuredPDFLoader)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone

# from urllib.parse import urlparse

# Entry points for the crawl: the main site and one pinned docs version.
TRUERA_BASE_URL = 'https://truera.com'
TRUERA_DOC_URL = 'https://docs.truera.com/1.34/public'

# Misspelled historical name, kept so existing cells that use it keep working.
TRUREA_DOC_URL = TRUERA_DOC_URL

In [None]:
from bs4 import BeautifulSoup

# Build a pinecone vector db from a handful of blog posts and docs pages.
# TODO: langchain ships html loaders that may yield better chunks.

# Splitters configured for 1000-character chunks with no overlap.
# NOTE(review): `text_splitter` is rebound with a different chunk size in a
# later cell and `pdf_splitter` is not referenced in the visible cells.
text_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)
pdf_splitter = CharacterTextSplitter(chunk_overlap=0, chunk_size=1000)

# Directory that holds the on-disk copy of every fetched URL.
scrape_path = Path("webscrape")

# Crawl state filled in by scrape(): the URLs already visited (url -> True)
# and the documents loaded from them. Re-running this cell resets both.
collected = {}
documents = list()


def url_to_path(url):
    """Return the file under ``scrape_path`` used to cache ``url``.

    Every occurrence of ``https://`` and ``http://`` is removed and every
    ``/`` is turned into ``:``, so the whole URL becomes a single file name.
    Names that already end in ``.png``, ``.pdf``, ``.jpg`` or ``.md`` are used
    unchanged; any other name gets ``.html`` appended.
    """
    flattened = url.replace("https://", "")
    flattened = flattened.replace("http://", "")
    flattened = flattened.replace("/", ":")

    # These suffixes already describe the content, so nothing is appended.
    keeps_own_suffix = flattened.endswith((".png", ".pdf", ".jpg", ".md"))
    suffix = "" if keeps_own_suffix else ".html"

    return scrape_path / (flattened + suffix)


# Hosts the crawler may follow links into (https only).
_SCRAPE_HOSTS = frozenset({
    "truera.com",
    "support.truera.com",
    "marketing.truera.com",
    "go.truera.com",
    "app.truera.net",
    "docs.truera.com",
})

# Seconds to wait for a server before giving up on a download.
_REQUEST_TIMEOUT = 30


def _pick_loader(url):
    """Return the langchain loader class for `url`, or None for images.

    Raises:
        RuntimeError: if `url` ends in what looks like a file extension that
            has no loader assigned here.
    """
    if url.endswith(".pdf"):
        # PagedPDFSplitter freezes for some pdfs.
        # UnstructuredPDFLoader: cannot get requirement installation figured
        # out.
        return PagedPDFSplitter

    if url.endswith((".png", ".jpg")):
        return None

    if url.endswith(".md"):
        return UnstructuredMarkdownLoader

    if url.endswith((".html", ".htm")):
        # Without this branch the extension check below rejects plain HTML
        # pages even though the default loader handles them.
        return UnstructuredHTMLLoader

    # A "." within the last five characters is taken as an unknown file
    # extension, unless it is just the end of a bare site root.
    if (not url.endswith(("truera.com", "truera.net"))) and "." in url[-5:]:
        raise RuntimeError(f"Unhandled source type {url}")

    return UnstructuredHTMLLoader


def _fetch(url, scrape_file):
    """Return the raw bytes of `url`, using `scrape_file` as an on-disk cache.

    Returns None, and caches nothing, when the server answers with an HTTP
    error status, so error pages never end up in the cache or the index.
    Network errors raised by `requests` propagate to the caller.
    """
    if scrape_file.exists():
        print("cached", end=" ")
        return scrape_file.read_bytes()

    print("downloading", end=" ")
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    if not response.ok:
        print(f"failed (HTTP {response.status_code})")
        return None

    # Store the bytes exactly as served: no dependence on the platform's
    # default text encoding, and cached/fresh content have the same type.
    content = response.content
    scrape_file.parent.mkdir(parents=True, exist_ok=True)
    scrape_file.write_bytes(content)
    return content


def _extract_links(url, content):
    """Return the crawlable links found in the ``<a>`` tags of `content`.

    Links are resolved against `url`, limited to https URLs on
    `_SCRAPE_HOSTS`, stripped of fragments and trailing slashes, and returned
    in document order (duplicates included). Links with a query string or a
    ``..`` component are dropped.
    """
    try:
        soup = BeautifulSoup(content, 'html.parser')
    except Exception as e:
        # Best effort: content that cannot be parsed simply yields no links.
        print(e)
        return []

    # Relative links are resolved as if `url` named a directory, because
    # trailing slashes are stripped from every URL before it is visited.
    base = url.rstrip("/") + "/"

    links = []
    for anchor in soup.find_all("a"):
        href = anchor.get('href')
        if href is None:
            continue
        href = str(href).strip()

        # Match the full scheme so that relative links such as "telemetry"
        # are not mistaken for phone links.
        if href.startswith(("mailto:", "tel:")):
            continue

        # Parent-relative links are skipped rather than resolved.
        if ".." in href.split("#")[0]:
            continue

        try:
            # urljoin handles root-relative ("/x"), "./x" and absolute links,
            # and returns non-web schemes (e.g. "javascript:") untouched so
            # the scheme/host check below rejects them.
            sub = urljoin(base, href)
            parts = urlsplit(sub)
        except ValueError:
            continue

        # Compare the parsed host instead of a string prefix so look-alike
        # hosts such as "truera.com.example.org" are not crawled.
        if parts.scheme != "https" or parts.hostname not in _SCRAPE_HOSTS:
            continue

        if "?" in sub:
            continue

        sub = sub.split("#")[0].rstrip("/")

        if sub.endswith("/."):
            continue

        links.append(sub)

    return links


def scrape(url):
    """Crawl `url` and every allowed page reachable from it.

    Each page not yet in `collected` is fetched (or read from the cache under
    `scrape_path`), loaded with the loader matching its type, and the
    resulting documents are appended to `documents` with
    ``metadata['source']`` set to the page URL. Images are skipped without
    being downloaded, and PDFs are loaded but not searched for links.

    Pages are visited in depth-first order using an explicit stack instead of
    recursion, so the size of the site cannot exhaust the interpreter's
    recursion limit.

    Raises:
        RuntimeError: if a URL with an unrecognised file extension is reached.
    """
    pending = [url]

    while pending:
        current = pending.pop()
        if current in collected:
            continue

        collected[current] = True

        print(current)

        loader = _pick_loader(current)
        if loader is None:
            print("skipped (image)")
            continue

        scrape_file = url_to_path(current)
        content = _fetch(current, scrape_file)
        if content is None:
            continue

        docs = loader(str(scrape_file)).load()
        print(f"got {len(docs)} document(s)")
        for doc in docs:
            doc.metadata['source'] = current
            documents.append(doc)

        if current.endswith(".pdf"):
            # Binary content; there are no HTML anchors to follow.
            continue

        # Pushed in reverse so links are visited in document order, the same
        # order a recursive depth-first crawl would use.
        pending.extend(reversed(_extract_links(current, content)))

In [None]:
# Crawl the main site, starting at TRUERA_BASE_URL.
scrape(TRUERA_BASE_URL)

In [None]:
# Crawl the docs site. URLs already recorded in `collected` are skipped;
# uncomment the next line to clear that record first.
#collected = dict()
scrape(TRUREA_DOC_URL)

In [None]:
# Crawl the blog index explicitly.
# NOTE(review): scrape() strips trailing slashes from the links it discovers
# but not from its argument, so if this page was already reached through a
# link it is fetched again under this slash-terminated URL -- confirm that
# is intended.
scrape("https://truera.com/ai-quality-blog/")

In [None]:
# Number of documents collected by the scrape() calls so far.
len(documents)

In [None]:
# split scraped documents into chunks

# Rebinds `text_splitter` with a 512-character chunk size and no overlap;
# `docs` holds the resulting chunks of everything in `documents`.
text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

In [None]:
# keep only big ones

# Prints the number of chunks before and after dropping every chunk whose
# text is 256 characters or shorter.
print(len(docs))
bigdocs = list(filter(lambda chunk: len(chunk.page_content) > 256, docs))
print(len(bigdocs))

In [None]:
# Connect the pinecone client using credentials kept in the local `keys`
# module.
# NOTE(review): the star import hides which names come from `keys`; this cell
# uses PINECONE_API_KEY and PINECONE_ENV -- confirm nothing else relies on it
# before narrowing the import.
from keys import *
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)

In [None]:
# create / upload an index of the docs to pinecone

index_name = "llmdemo"
embedding = OpenAIEmbeddings(model='text-embedding-ada-002')  # 1536 dims

# Rebuild the index from scratch. Only delete it when it already exists so
# that the first run against a fresh pinecone project does not fail on the
# missing index.
if index_name in pinecone.list_indexes():
    pinecone.delete_index(index_name)

# The dimension must match the embedding model above.
pinecone.create_index(index_name, dimension=1536)
Pinecone.from_documents(bigdocs, embedding, index_name=index_name)