In [None]:
from pathlib import Path

import pinecone
import requests
from langchain.document_loaders import (PagedPDFSplitter, TextLoader,
                                        UnstructuredHTMLLoader,
                                        UnstructuredMarkdownLoader,
                                        UnstructuredPDFLoader)
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Pinecone

# from urllib.parse import urlparse

TRUERA_BASE_URL = 'https://truera.com'
TRUREA_DOC_URL = 'https://docs.truera.com/1.34/public'

In [9]:
from bs4 import BeautifulSoup

# Create a pinecone vector db from a few blogs and docs.
# TODO: langchain includes html loaders which may produce better chunks.

text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
pdf_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)

scrape_path = Path("webscrape")

collected = dict()
documents = []


def url_to_path(url):
    url_esc = url.replace("https://", "").replace("http://",
                                                  "").replace("/", ":")

    ext = ".html"

    if url_esc.endswith(".png"):
        ext = ""
    elif url_esc.endswith(".pdf"):
        ext = ""
    elif url_esc.endswith(".jpg"):
        ext = ""
    elif url_esc.endswith(".md"):
        ext = ""

    return scrape_path / (url_esc + ext)


def scrape(url):
    if url in collected:
        return

    collected[url] = True

    print(url)

    scrape_file = url_to_path(url)

    if str(url).endswith(".pdf"):
        # skipping for now since issues with the content extractors noted below
        # return
        pass

    if scrape_file.exists():
        print("cached", end=" ")
        content = bytes()
        with scrape_file.open("rb") as fh:
            for line in fh.readlines():
                content += line
    else:
        print("downloading", end=" ")
        response = requests.get(url)

        if response.encoding is None:
            content = response.content

            with scrape_file.open("wb") as fh:
                fh.write(content)

        else:
            content = response.text

            with scrape_file.open("w") as fh:
                fh.write(content)

    loader = UnstructuredHTMLLoader
    if url.endswith(".pdf"):
        #return
        loader = PagedPDFSplitter # freezes for some pdfs
        # loader = UnstructuredPDFLoader # cannot get requirement installation figured out

    elif url.endswith(".png"):
        return
    
    elif url.endswith(".jpg"):
        return
    
    elif url.endswith(".md"):
        loader = UnstructuredMarkdownLoader

    elif (not url.endswith("truera.com")) and (
            not url.endswith("truera.net")) and "." in url[-5:]:
        
        raise RuntimeError(f"Unhandled source type {url}")

    docs = loader(str(scrape_file)).load()
    print(f"got {len(docs)} document(s)")
    for doc in docs:
        doc.metadata['source'] = url
        documents.append(doc)

    try:
        soup = BeautifulSoup(content, 'html.parser')

    except Exception as e:
        print(e)
        return

    for surl in soup.findAll("a"):
        # print(url)
        sub = surl.get('href')
        if sub is not None:
            sub = str(sub)
            # print("\t", sub)

            if sub.startswith("mailto") or sub.startswith("tel"):
                continue

            if not (sub.startswith("http") or sub.startswith("//")):
                sub = url + "/" + sub

            # print("sub=", sub)

            if not (sub.startswith("https://truera.com")
                    or sub.startswith("https://support.truera.com")
                    or sub.startswith("https://marketing.truera.com")
                    or sub.startswith("https://go.truera.com")
                    or sub.startswith("https://app.truera.net")
                    or sub.startswith("https://docs.truera.com")):
                continue

            if "?" in sub:
                continue

            if "#" in sub:
                sub = sub.split("#")[0]

            while "/" == sub[-1]:
                sub = sub[0:-1]

            if sub.endswith("/."):
                continue

            if sub.endswith("/.."):
                continue

            if ".." in sub:
                continue

            if sub.endswith("//"):
                continue

            scrape(sub)

In [10]:
scrape(TRUERA_BASE_URL)

https://truera.com
cached got 1 document(s)
https://truera.com/ml-testing-and-debugging-the-missing-piece-in-ai-development
cached got 1 document(s)
https://truera.com/diagnostics
cached got 1 document(s)
https://truera.com/monitoring
cached got 1 document(s)
https://truera.com/platform
cached got 1 document(s)
https://truera.com/industries
cached got 1 document(s)
https://truera.com/ai-quality-management-for-banking
cached got 1 document(s)
https://truera.com/ai-quality-for-government
cached got 1 document(s)
https://truera.com/ai-quality-management-for-human-resources
cached got 1 document(s)
https://truera.com/ai-quality-insurance
cached got 1 document(s)
https://truera.com/manufacturing
cached got 1 document(s)
https://truera.com/retail-and-brands
cached got 1 document(s)
https://truera.com/why-truera
cached got 1 document(s)
https://truera.com/resources
cached got 1 document(s)
https://truera.com/events
cached got 1 document(s)
https://marketing.truera.com/trustworthy-ai-podcast
c

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/tag/influence-sensitivity-plots
cached got 1 document(s)
https://truera.com/tag/qii
cached got 1 document(s)
https://truera.com/tag/overfitting
cached got 1 document(s)
https://truera.com/cfpb-circular-on-black-box-credit-models-five-considerations-for-lenders
cached got 1 document(s)
https://truera.com/can-your-approach-to-machine-learning-explainability-pass-these-4-key-tests
cached got 1 document(s)
https://truera.com/machine-learning-explainability-is-just-the-beginning
cached got 1 document(s)
https://truera.com/machine-learning-models-require-the-right-explanation-framework-and-its-easy-to-get-wrong
cached got 1 document(s)
https://truera.com/tag/ai-in-banking
cached got 1 document(s)
https://truera.com/tag/anti-discrimination-laws-and-ai
cached got 1 document(s)
https://truera.com/tag/cfpb
cached got 1 document(s)
https://truera.com/ai-quality-management-key-to-driving-business-value
cached got 1 document(s)
https://truera.com/building-a-powerful-ai-quality-pl

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/how-to-diagnose-and-treat-machine-learning-models-afflicted-by-covid-19
cached got 1 document(s)
https://truera.com/ai-quality-management-key-processes-and-tools-part-2
cached got 1 document(s)
https://truera.com/ai-quality-management-key-processes-and-tools-part-1
cached got 1 document(s)
https://truera.com/what-is-ai-quality
cached got 1 document(s)
https://truera.com/tag/data-quality
cached got 1 document(s)
https://truera.com/tag/operational-compatibility
cached got 1 document(s)
https://truera.com/tag/ai-quality-process
cached got 1 document(s)
https://truera.com/tag/ai-quality-structure
cached got 1 document(s)
https://truera.com/tag/ai-quality-workflow
cached got 1 document(s)
https://truera.com/tag/managing-ai-quality
cached got 1 document(s)
https://truera.com/tag/ml-model-approval
cached got 1 document(s)
https://truera.com/tag/ml-model-evaluation
cached got 1 document(s)
https://truera.com/tag/ml-model-review
cached got 1 document(s)
https://truera.com/exp

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/resources/trulens-explainability-for-neural-networks
cached got 1 document(s)
https://truera.com/tag/afwerx
cached got 1 document(s)
https://truera.com/tag/air-force
cached got 1 document(s)
https://truera.com/tag/sbir
cached got 1 document(s)
https://truera.com/tag/truera-customers
cached got 1 document(s)
https://truera.com/truera-selected-at-techchallenge-showcase
cached got 1 document(s)
https://truera.com/mixing-theory-and-practice-my-journey-to-truera
cached got 1 document(s)
https://truera.com/category/business/page/2
cached got 1 document(s)
https://truera.com/hot-takes-emea-executive-roundtable-on-trustworthy-ai
cached got 1 document(s)
https://marketing.truera.com/model-risk-management/ai-ml
cached got 1 document(s)
https://truera.com/tag/ai-adoption
cached got 1 document(s)
https://truera.com/tag/ai-investment
cached got 1 document(s)
https://truera.com/tag/ai-trends
cached got 1 document(s)
https://truera.com/us-financial-regulators-views-on-ai
cached got

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/tag/financial-services-ai
cached got 1 document(s)
https://truera.com/tag/request-for-information
cached got 1 document(s)
https://truera.com/tag/us-financial-regulators
cached got 1 document(s)
https://truera.com/how-to-interpret-and-use-feature-importance-in-ml-models
cached got 1 document(s)
https://truera.com/tag/explainable-ai
cached got 1 document(s)
https://truera.com/tag/feature-importance
cached got 1 document(s)
https://truera.com/tag/feature-influence
cached got 1 document(s)
https://truera.com/tag/shapley-values
cached got 1 document(s)
https://truera.com/how-do-you-debug-and-address-model-drift
cached got 1 document(s)
https://truera.com/tag/ai-drift
cached got 1 document(s)
https://truera.com/tag/debug-model-drift
cached got 1 document(s)
https://truera.com/tag/mitigate-model-drift
cached got 1 document(s)
https://truera.com/tag/ml-drift
cached got 1 document(s)
https://truera.com/category/data-science/page/2
cached got 1 document(s)
https://truera.com/

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2020/08/Dtd-pets15.pdf
downloading got 21 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2021/10/TruEra-Standard-Chartered-Case-Study-2021.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2023/04/TruEra-Monitoring-Datasheet-2023.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2023/04/TruEra-Retail-and-Brands-Datasheet.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2023/01/TruEra-MRM-Datasheet.pdf
downloading got 1 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2022/12/TruEra-Company-Product-Datasheet-2022.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


The markup you provided was rejected by the parser. Trying a different parser or a different encoding may help.

Original exception(s) from parser:
 AssertionError: expected name token at '<![����@\x19*� �\x1f����*A'
https://truera.com/wp-content/uploads/2022/10/TruEra-Datasheet-Diagnostics-2-0.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2022/10/TruEra-HR-Datasheet-20221012.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2022/05/TruEra-for-Manufacturing-Datasheet.pdf
downloading got 4 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2022/03/AI-Regulation-in-Finance_TruEra-Datasheet.pdf
downloading got 4 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2021/09/TruEra-Government-Datasheet.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2021/07/TruEra-for-Life-Insurance.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2021/07/TruEra-for-PC-Insurance.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/wp-content/uploads/2021/05/TruEra-Banking-Datasheet-19-May-2021-121909.pdf
downloading got 4 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://go.truera.com/ai-quality-whitepaper
cached got 1 document(s)
https://go.truera.com/model-risk-management-for-the-ml-era
cached got 1 document(s)
https://marketing.truera.com/fairness-in-machine-learning-white-paper
cached got 1 document(s)
https://truera.com/resources/pwc-uk-picks-ai-quality-leader-truera-to-collaborate-on-ai-risk-management
cached got 1 document(s)
https://truera.com/resources/truera-launches-first-automated-test-harness-for-ml-models-with-truera-diagnostics-2-0-release
cached got 1 document(s)
https://truera.com/resources/truera-named-to-fintech-power-50-list-of-the-worlds-top-fintech-trailblazers
cached got 1 document(s)
https://truera.com/resources/hewlett-packard-enterprise-invests-in-ai-quality-leader-truera
cached got 1 document(s)
https://truera.com/resource/in-the-news/page/2
cached got 1 document(s)
https://truera.com/resources/truera-honored-with-fast-companys-2022-world-changing-ideas-award-for-reducing-bias-and-improving-accuracy-in-ai-based-credit

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/research/explainable-ai-research
cached got 1 document(s)
https://truera.com/ai-quality-blog/page/2
cached got 1 document(s)
https://truera.com/ai-quality-blog/page/3
cached got 1 document(s)
https://truera.com/ai-quality-blog/page/4
cached got 1 document(s)
https://truera.com/ai-quality-blog/page/5
cached got 1 document(s)
https://truera.com/ai-quality-blog/page/6
cached got 1 document(s)
https://truera.com/ai-quality-blog/page/7
cached got 1 document(s)
https://truera.com/tag/financial-crime
cached got 1 document(s)
https://truera.com/tag/trustworthy-ai-podcast
cached got 1 document(s)
https://truera.com/event/podcasts
cached got 1 document(s)
https://truera.com/event/webinars
cached got 1 document(s)
https://truera.com/event/live-events
cached got 1 document(s)
https://truera.com/events/plug-and-play-alberta-2022-featuring-ai-quality
cached got 1 document(s)
https://truera.com/event/live-events/page/2
cached got 1 document(s)
https://go.truera.com/webinar-ai-gover

Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/banking
cached got 1 document(s)
https://truera.com/government
cached got 1 document(s)
https://truera.com/wp-content/uploads/2021/05/TruEra-Monitoring-Datasheet-19-May-2021-172808.pdf
downloading got 3 document(s)


Some characters could not be decoded, and were replaced with REPLACEMENT CHARACTER.


https://truera.com/tag/ai-testing
cached got 1 document(s)
https://truera.com/tag/qa-for-ai
cached got 1 document(s)
https://truera.com/tag/qa-for-ml
cached got 1 document(s)
https://truera.com/tag/truera-diagnostics
cached got 1 document(s)


In [11]:
#collected = dict()
scrape(TRUREA_DOC_URL)

https://docs.truera.com/1.34/public
cached got 1 document(s)
https://docs.truera.com/1.34/public/client-installation
cached got 1 document(s)
https://support.truera.com/hc/en-us/categories/4419133979291-Announcements
cached got 1 document(s)
https://support.truera.com/hc/en-us
cached got 1 document(s)
https://docs.truera.com/1.34/public/project_structure
cached got 1 document(s)
https://docs.truera.com/1.34/public/diagnostics-quickstart
cached got 1 document(s)
https://docs.truera.com/1.34/public/performance
cached got 1 document(s)
https://docs.truera.com/1.34/public/data-ingestion
cached got 1 document(s)
https://docs.truera.com/1.34/public/local_data_ingestion
cached got 1 document(s)
https://docs.truera.com/1.34/public/remote_data_ingestion
cached got 1 document(s)
https://docs.truera.com/1.34/public/model_ingestion_local_upload
cached got 1 document(s)
https://docs.truera.com/1.34/public/notebook_quickstart_with_feature_transform
cached got 1 document(s)
https://docs.truera.com/1.

In [13]:
scrape("https://truera.com/ai-quality-blog/")

https://truera.com/ai-quality-blog/
downloading got 1 document(s)


In [14]:
len(documents)

521

In [15]:
# split scraped documents into chunks

text_splitter = CharacterTextSplitter(chunk_size=512, chunk_overlap=0)
docs = text_splitter.split_documents(documents)

Created a chunk of size 797, which is longer than the specified 512
Created a chunk of size 966, which is longer than the specified 512
Created a chunk of size 852, which is longer than the specified 512
Created a chunk of size 1118, which is longer than the specified 512
Created a chunk of size 960, which is longer than the specified 512
Created a chunk of size 887, which is longer than the specified 512
Created a chunk of size 740, which is longer than the specified 512
Created a chunk of size 872, which is longer than the specified 512
Created a chunk of size 1308, which is longer than the specified 512
Created a chunk of size 1115, which is longer than the specified 512
Created a chunk of size 952, which is longer than the specified 512
Created a chunk of size 727, which is longer than the specified 512
Created a chunk of size 615, which is longer than the specified 512
Created a chunk of size 624, which is longer than the specified 512
Created a chunk of size 615, which is longer 

In [16]:
# keep only big ones

print(len(docs))
bigdocs = [doc for doc in docs if len(doc.page_content) > 256]
print(len(bigdocs))

1910
1601


In [20]:
from keys import *
pinecone.init(
    api_key=PINECONE_API_KEY,  # find at app.pinecone.io
    environment=PINECONE_ENV  # next to api key in console
)

In [22]:
# create / upload an index of the docs to pinecone

index_name = "llmdemo"
embedding = OpenAIEmbeddings(model='text-embedding-ada-002')  # 1536 dims
pinecone.delete_index(index_name)
pinecone.create_index(index_name, dimension=1536)
Pinecone.from_documents(bigdocs, embedding, index_name=index_name)

<langchain.vectorstores.pinecone.Pinecone at 0x7fb67c378610>