In [None]:
# Text splitter functionality is provided by LangChain framework
from langchain_text_splitters import HTMLHeaderTextSplitter, RecursiveCharacterTextSplitter

# Use BeautifulSoup (BS) for handling the web content
import requests
from bs4 import BeautifulSoup

import lancedb

# Sentence transformers to use the embedding models locally
from sentence_transformers import SentenceTransformer, util
import pandas as pd

#### Utilities

> Library functions

**Custom Meta Data**  
From the parsed and split content, this function builds custom metadata that can be used when creating the knowledge DB

In [None]:
def meta_data_from_headings (heading: dict, n: int = 1, from_end: bool = True, sep: str = " : ") -> str:
    """
    Build a metadata label by joining some of the values of a heading dictionary.

    Param:
        heading (dict): Heading dictionary; its values are used in insertion order.
        n (int): How many values to use. Values <= 0 are treated as 1, and values
            larger than the number of headings are capped at that number.
        from_end (bool): Use the last n values when True, the first n otherwise.
        sep (str): Text placed between the joined values.

    Returns: The chosen values converted to str and joined with sep
        (an empty string when heading has no values).
    """
    headings = [str(value) for value in heading.values()]

    # Clamp the requested count to at least 1 and at most the number of headings
    count = min(max(n, 1), len(headings))

    # Pick the slice; the order of the headings is never reversed
    if from_end:
        chosen = headings[len(headings) - count:]
    else:
        chosen = headings[:count]

    return sep.join(chosen)

**Get Main Content**

In [None]:
def get_main_content (url, type, timeout=30):
    """
    Download a web page and return its main content area.

    Layout elements (nav, header, footer, aside, script, style) are removed
    first. The <main> element is used when present; otherwise the <div> with
    the most text is used, falling back to <body>.

    Param:
        url (str): Address of the page to download.
        type (str): 'html' to get the content with its tags,
            'text' to get only the text, one text fragment per line.
        timeout (float): Seconds to wait for the server before giving up.

    Returns: The main content as a string; an empty string when the page has
        no <main>, <div> or <body> element.

    Raises:
        ValueError: If type is neither 'html' nor 'text'.
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Reject an unsupported output type before doing any network work,
    # instead of silently returning None at the end
    if type not in ('html', 'text'):
        raise ValueError(f"type must be 'html' or 'text', got {type!r}")

    # requests waits indefinitely unless a timeout is given
    response = requests.get(url, timeout=timeout)

    # Do not parse an HTTP error page as if it were the article
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")

    # Remove layout elements
    for tag in soup(["nav", "header", "footer", "aside", "script", "style"]):
        tag.decompose()

    # Check and get main section of the page
    main = soup.find("main")

    if main is None:

        # Fallback method, if there is no 'main' section in the html page:
        # take the div holding the most text, or the body if there is no div
        candidates = soup.find_all("div", recursive=True)
        main = max(candidates, key=lambda c: len(c.get_text(strip=True)), default=soup.body)

    # Nothing usable in the page; avoid returning the literal string "None"
    if main is None:
        return ""

    # Get cleaned HTML content. Tags retained
    main_html = str(main)

    # If HTML content is required, provide it with the tags
    if type == 'html':
        return main_html

    # If text is required, provide only the text content.
    # NOTE(review): the serialized HTML is parsed again, as in the original
    # code, rather than calling main.get_text directly - presumably the two
    # can differ after decompose(); confirm before simplifying.
    text_soup = BeautifulSoup (main_html, "html.parser")
    return text_soup.get_text(separator="\n", strip=True)

**Multi-Pass Chunking**
> Often the scenario could be to incorporate multiple ways of chunking to have better granularity and meaning in the chunks  
> The Sentence and content aware chunkers are used in Combination to retain the context and be granular as well  
> The context is captured by the Meta data

In [None]:
# Sentence-ending punctuation to split on (given explicitly instead of the library default)
seperators = [".", "?", "!"]

# Sentence-level splitter: cuts on the separators above, measures length with len
text_splitter = RecursiveCharacterTextSplitter(
    separators=seperators,
    is_separator_regex=False,
    keep_separator=False,
    chunk_size=300,
    chunk_overlap=0,
    length_function=len,
)

# Header tags in the html to split on (h1..h4), each paired with the
# label "Header 1".."Header 4"
header_levels = [(f"h{level}", f"Header {level}") for level in range(1, 5)]

# Splitter object for HTML content from the lib
# (the library also gives splitters for Markdown, JSON etc)
html_splitter = HTMLHeaderTextSplitter(header_levels)

**Combine 2 methods**  
Get the content and split it based on document structure first  
Some of the chunks can be big, because of the way the text is present  
Pass those blocks for one more level of splitting by sentences  
Capture the meta data from the headings and use it along with the text

In [None]:
def Build_Chunks (url, source, chunk_size_limit):
    """
    Download a page and turn it into a list of chunk records.

    The page is first split by its heading structure. Any section longer than
    chunk_size_limit characters is split again by sentences. Every resulting
    piece becomes one record carrying the heading-based topic of its section.
    Pieces whose text is identical to the topic are left out.

    Side effects: overwrites 'chunks.txt' in the working directory with a dump
    of the records, and prints the number of records.

    Param:
        url (str): Address of the page to chunk.
        source (str): Label stored in the 'source' field of every record.
        chunk_size_limit (int): Maximum section length (in characters) that is
            kept as a single chunk.

    Returns: List of dicts with the keys 'source', 'topic' and 'text'.
    """
    # Local import: logging is only needed here, to report skipped sections
    import logging

    # Get the main content
    html_content = get_main_content (url, "html")

    # Chunk based on document structure
    docs = html_splitter.split_text (html_content)

    # Start with empty list
    Chunks = []

    # utf-8 so that non-ASCII web text can always be written to the dump
    with open ('chunks.txt', mode='w', encoding='utf-8') as f:

        for doc in docs:

            try:
                # Topic comes from the headings; sections without any get 'Generic'
                meta_data = meta_data_from_headings (doc.metadata) or 'Generic'

                # If the section is too long, split it by sentence(s) into
                # shorter pieces; otherwise keep it as a single piece
                if len(doc.page_content) > chunk_size_limit:
                    pieces = text_splitter.split_text(doc.page_content)
                else:
                    pieces = [doc.page_content]

            except Exception:
                # Keep going with the remaining sections, but leave a trace
                # instead of dropping the section silently
                logging.getLogger(__name__).warning(
                    "Skipping a section of %s that could not be chunked", url, exc_info=True)
                continue

            # Make each piece an individual chunk with the same meta data
            for piece in pieces:

                # A piece that only repeats the topic adds nothing
                if piece == meta_data:
                    continue

                Chunk = {'source': source, 'topic': meta_data, 'text': piece}
                Chunks.append(Chunk)
                print (Chunk, "\n----", file=f)

    print (len(Chunks))

    return Chunks

In [None]:
# Article used for the single-page walkthrough
url = "https://www.ibm.com/think/topics/cloud-computing"

# Chunk the page: records are labelled "IBM", sections over 500 characters are split further
Chunks = Build_Chunks (url, source="IBM", chunk_size_limit=500)

**Vectorise the data**  
> Once the chunks are created along with the supporting data, use an embedding model to create vectors  
> Use a HF model which is suitable for general purpose 

In [None]:
# Load the all-MiniLM-L6-v2 sentence-transformers model used to embed the chunks
Embedder = SentenceTransformer ("sentence-transformers/all-MiniLM-L6-v2")

In [None]:
# Create vectors and store them in the Chunks.
# All texts are encoded in one batched call instead of one model call per chunk.
if Chunks:

    vectors = Embedder.encode ([Chunk['text'] for Chunk in Chunks])

    for Chunk, vector in zip (Chunks, vectors):

        # Store the vector as a plain Python list next to the text it was built from
        Chunk['vector'] = vector.tolist ()
  

In [None]:
# Bare expression: presumably shows the chunk records as the notebook cell output
Chunks

In [None]:
# Check the distinct topics existing in all Chunks.
# Sorted so the list comes out in the same order on every run
# (a list built directly from a set has no guaranteed order).
Topics = sorted({c["topic"] for c in Chunks})
Topics

In [None]:
# Connect to the LanceDB vector database at 'Vector_DB'
DB = lancedb.connect('Vector_DB')

# Create the "article" table from the chunk records (mode="overwrite") and show its schema
table = DB.create_table(
    "article",
    data=Chunks,
    mode="overwrite",
)
print(table.schema)

In [None]:
# Text to look up in the table
Query = "Platforms used as business in today's world"

# Embed the query with the same model that produced the stored vectors
Query_Vector = Embedder.encode(Query).tolist()

# Fetch the 5 best matches for the query vector
Results = (
    table.search(Query_Vector)
    .limit(5)
    .to_list()
)

# Print the distance and the text of every match
for Rs in Results:
    print(Rs['_distance'], " ## ", Rs['text'])

**Create a Tech Repo**  
> From various sources on the internet, create a knowledge repo with all the information chunked and vectorised

In [None]:
# Reference pages for the technology knowledge repo, each with the label of its publisher
References = [
    {'Source': 'Microsoft', 'url': "https://azure.microsoft.com/en-us/resources/cloud-computing-dictionary/what-is-cloud-computing"},
    {'Source': 'IBM', 'url': "https://www.ibm.com/think/topics/cloud-computing"},
    {'Source': 'Oracle', 'url': "https://www.oracle.com/in/cloud/what-is-cloud-computing/"},
    {'Source': 'AWS', 'url': "https://aws.amazon.com/what-is/iot/"},
    {'Source': 'IBM', 'url': "https://www.ibm.com/think/topics/edge-ai"},
    {'Source': 'Microsoft', 'url': "https://azure.microsoft.com/en-us/resources/cloud-computing-dictionary/what-is-edge-computing"},
    {'Source': 'IBM', 'url': "https://www.ibm.com/think/topics/edge-computing"},
    {'Source': 'Fortinet', 'url': "https://www.fortinet.com/resources/cyberglossary/edge-computing"},
    {'Source': 'NVIDIA', 'url': "https://blogs.nvidia.com/blog/what-is-edge-ai/"},
    {'Source': 'MIT', 'url': "https://mitsloan.mit.edu/ideas-made-to-matter/machine-learning-explained"},
    {'Source': 'AWS', 'url': "https://aws.amazon.com/what-is/machine-learning/"},
]

# Chunk every reference page and collect all records in one list
Chunks = []
for Ref in References:
    Parts = Build_Chunks(Ref['url'], Ref['Source'], 500)
    Chunks.extend(Parts)

print(len(Chunks))

In [None]:
# Load the all-mpnet-base-v2 sentence-transformers model used to embed the tech repo chunks
Embedder_1 = SentenceTransformer ("sentence-transformers/all-mpnet-base-v2")

In [None]:
# Create vectors and store them in the Chunks.
# All texts are encoded in one batched call instead of one model call per chunk.
if Chunks:

    vectors = Embedder_1.encode ([Chunk['text'] for Chunk in Chunks])

    for Chunk, vector in zip (Chunks, vectors):

        # Store the vector as a plain Python list next to the text it was built from
        Chunk['vector'] = vector.tolist ()

In [None]:
# Create the "tech_ref" table from the chunk records (mode="overwrite") and show its schema
table = DB.create_table(
    "tech_ref",
    data=Chunks,
    mode="overwrite",
)
print(table.schema)

In [None]:
# Query the "tech_ref" table
Query = "There are many service providers"

# Embed the query with Embedder_1.
# NOTE: Query_Vector is only used by the commented-out vector search below;
# the active query does not use it.
Query_Vector = Embedder_1.encode (Query).tolist ()

# Alternative (disabled): vector search with cosine distance, top 5 matches
# Results = table.search(Query_Vector).distance_type("cosine").limit(5).to_list ()
# Active query: no query vector, only a filter on the 'topic' column
Results = table.search().where("topic IN ('What is edge computing?')").to_list ()

# Print the source and the text of every returned row
for Rs in Results :

    # Variant for the vector search above, which also prints the distance
    # print (Rs['_distance'],Rs['source']," ## ",Rs ['text'])
    print (Rs['source']," ## ",Rs ['text'])