In [11]:
from vertexai.generative_models import (
    Content,
    FunctionDeclaration,
    GenerationConfig,
    GenerativeModel,
    Part,
    Tool
)
import google.generativeai as genai
import os
import json
import vertexai
from vertexai.language_models import TextEmbeddingModel
from google.oauth2 import service_account

  from .autonotebook import tqdm as notebook_tqdm


In [53]:
os.environ["GOOGLE_API_KEY"] = "AIzaSyBFOkj3Cla3JGGYS1xDTEF6Uol3Mv-Jugc"

In [119]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings

embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

In [60]:
total_docs= []

In [62]:
from langchain_community.document_loaders import PyPDFLoader
import os


folder_path = 'books'  # Replace with the actual folder path

for filename in os.listdir(folder_path):
    file_path = os.path.join(folder_path, filename)
    if os.path.isfile(file_path):
        # Process the file
        loader = PyPDFLoader(file_path)
        pages = loader.load_and_split()
        total_docs.extend(pages)

In [63]:
len(pages)

1225

In [67]:
from langchain_community.vectorstores import Chroma
from chromadb.config import Settings
import os

ROOT_DIRECTORY = os.path.dirname(os.path.realpath('__file__'))

CHROMA_SETTINGS = Settings(
    anonymized_telemetry=False,
    is_persistent=True,
)

db = Chroma.from_documents(pages, 
                           embeddings, 
                           collection_metadata={'hnsw:space': 'cosine'},
                           persist_directory = f"{ROOT_DIRECTORY}/DB")

In [None]:
db1 = Chroma(
    persist_directory=f"{ROOT_DIRECTORY}/DB",
    embedding_function=embeddings,
    client_settings=CHROMA_SETTINGS
)

In [81]:
hypo_params = {
	    'baseline_k': 20,
	    'exploration_multiplier': 5
	}

In [82]:
text = "How load balancer works in cloud computing?"

In [83]:

k = hypo_params['baseline_k'] * hypo_params['exploration_multiplier']
if hypo_params['verbose']:
    print(f"\n>>> Performing Initial Retrieval of {k} documents...\n")
docs = db.similarity_search_with_score(
    text, 
    k=k
)


>>> Performing Initial Retrieval of 100 documents...



In [102]:
from langchain_core.prompts import PromptTemplate, ChatPromptTemplate
from langchain_core.runnables import RunnableParallel, RunnablePassthrough
from langchain_core.pydantic_v1 import BaseModel, Field
from langchain_google_genai import GoogleGenerativeAI
from langchain_core.output_parsers import JsonOutputParser

KEYWORD_EXTRACTION_PROMPT = """
Your goal is to extract a list of keywords from an input phrase, sentence, or several sentences.

- You can only generate 1 to 5 keywords.
- Keywords should be nouns, issues, concepts
- Keywords should not include verbs, prepositions, pronouns
- Each keyword can only be one word long.
- If the input is just a single word, return that word as the only keyword.

{format_instructions}

The input is:
{input}
"""

class KeywordListSchema(BaseModel):
    keywordList: list[str] = Field(description="list of one-word keywords based on a given phrase")

parser = JsonOutputParser(pydantic_object=KeywordListSchema)

prompt = ChatPromptTemplate.from_template(
    template=KEYWORD_EXTRACTION_PROMPT,
    intput_variables = ["input"],
    partial_variables = {
        'format_instructions': parser.get_format_instructions()
    }
)

llm = GoogleGenerativeAI(model="models/text-bison-001")

keyword_extraction_chain = (
    {'input': RunnablePassthrough()}
    | prompt
    | llm
    | parser
)

keywords = keyword_extraction_chain.invoke(text)['keywordList']

In [104]:
remaining_docs_with_keywords = list()
    
print(f"""\n>>> Checking {len(docs[hypo_params['baseline_k']:])} 
        Docs ranked after {hypo_params['baseline_k']} for presence of keyword...""")

for r in docs[hypo_params['baseline_k']:]:
    page_content = r[0].page_content.lower()
    for keyword in keywords:
        if keyword.lower() in page_content:
            remaining_docs_with_keywords.append(r)
            continue
            
print(f">>> ...{len(remaining_docs_with_keywords)} neglected Docs identified\n")


>>> Checking 80 
            Docs ranked after 20 for presence of keyword...
>>> ...111 neglected Docs identified



In [126]:
from hdbscan import HDBSCAN

print(f"\n>>> Clustering neglected Docs...")

data_embeddings = embeddings.embed_documents([
    r[0].page_content 
    for r in remaining_docs_with_keywords])
hdb = HDBSCAN(min_samples=1, min_cluster_size=3).fit(data_embeddings)
remaining_docs_with_cat = filter(lambda x: x[1] != -1, zip([r[0].page_content for r in remaining_docs_with_keywords], hdb.labels_))

cat_dict = {}

for page_content, cat in remaining_docs_with_cat:
    if cat not in cat_dict:
        cat_dict[cat] = [page_content]
    else:
        cat_dict[cat].append(page_content)
        
print(f">>> ...{len(cat_dict)} Clusters identified\n")
        


>>> Clustering neglected Docs...
>>> ...20 Clusters identified



In [133]:
hypo_docs = list()
    
if hypo_params['verbose']:
    print(f"\n>>> Generating Hypothetical Documents for each Doc Cluster...\n")

HYPOTHETICAL_DOCUMENT_PROMPT = """
Your instruction is to generate a single hypothetical document from an input.
- This hypothetical document must be similar in style, tone and voice as examples you are provided with.
- This hypothetical document must appear like it was written by the same author as the examples you are provided with.
- This hypothetical document must also be similar in length with the examples you are provided with.

{format_instructions}

### EXAMPLES ###
Below are some examples of hypothetical documents, all written by the same author, in pairs of <Input> and <Hypothetical Document>:

{ref_documents}

### INSTRUCTION ###
Now generate a new hypothetical document. 

<Input>
{input}
<Hypothetical Document>

"""

class HypotheticalDocumentSchema(BaseModel):
    hypotheticalDocument: str = Field(description="a hypothetical document given an input word, phrase or question")

parser = JsonOutputParser(pydantic_object=HypotheticalDocumentSchema)

prompt = ChatPromptTemplate.from_template(
    template=HYPOTHETICAL_DOCUMENT_PROMPT,
    intput_variables = ["input", "ref_documents"],
    partial_variables = {
        'format_instructions': parser.get_format_instructions()
    }
)

hypothetical_document_chain = (
    {'input': RunnablePassthrough(), 'ref_documents': RunnablePassthrough()}
    | prompt
    | llm
    | parser
)

cat_ii = 1
for cat in cat_dict.keys():

    ref_doc_string = ""
    doc_ii = 1
    hypo_doc = hypothetical_document_chain.invoke(
        {'input': text, 'ref_documents': ref_doc_string}
    )['hypotheticalDocument']
    
    hypo_docs.append(hypo_doc)
    
    cat_ii += 1


>>> Generating Hypothetical Documents for each Doc Cluster...



In [134]:
len(hypo_docs)

20

In [135]:
hypo_docs

["A load balancer is a software or hardware device that distributes incoming network or application traffic across a number of servers. Load balancing can improve performance and reliability by distributing the load across multiple servers. In cloud computing, load balancing is used to distribute traffic across multiple virtual machines (VMs) or containers. There are a number of different load balancing algorithms that can be used, each with their own advantages and disadvantages. The most common load balancing algorithms are:\n\n* Round robin: This is the simplest load balancing algorithm. It simply distributes traffic across the available servers in a round-robin fashion.\n* Weighted round robin: This is a variation of round robin that assigns weights to each server. Traffic is then distributed to the servers in proportion to their weights.\n* Least connections: This algorithm distributes traffic to the server with the fewest connections.\n* Weighted least connections: This is a vari