In [None]:
from langchain_community.document_loaders import PyPDFLoader

loader = PyPDFLoader("Employee_Handbook.pdf")
pages = loader.load_and_split()

In [15]:
pages=pages[4:]

In [16]:
text = "\n".join([doc.page_content for doc in pages])

In [17]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    # Set a really small chunk size, just to show.
    chunk_size=500,
    chunk_overlap=150,
    length_function=len,
    is_separator_regex=False,
)
docs=text_splitter.create_documents([text])

In [18]:
for i, d in enumerate(docs):
    d.metadata = {"doc_id": i}

In [19]:
import os
import google.generativeai as genai
import pandas as pd
os.environ["GEMINI_API_KEY"]="API-HERE"
def get_embeddings(text):
    model = 'models/embedding-001'
    embedding = genai.embed_content(model=model,
                                    content=text,
                                    task_type="retrieval_document")
    return embedding['embedding']
content_list = [doc.page_content for doc in docs]

# Get embeddings for each content
embeddings = [get_embeddings(content) for content in content_list]

# Create DataFrame with page_content and embeddings
dataframe = pd.DataFrame({
    'page_content': content_list,
    'embeddings': embeddings
})

In [7]:

import clickhouse_connect

client = clickhouse_connect.get_client(
    host='host-here',
    port=443,
    username='user-name-here',
    password='password-here'
)

In [9]:
client.command("""
    CREATE TABLE default.movies (
        id Int64,
        page_content String,
        embeddings Array(Float32),
        CONSTRAINT check_data_length CHECK length(embeddings) = 768
    ) ENGINE = MergeTree()
    ORDER BY id
    """) 

['0', 'chi-msc-5332e745-msc-5332e745-0-0', 'OK', '0', '0']

In [20]:

batch_size = 10  # Adjust based on your needs

num_batches = len(dataframe) // batch_size

for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = start_idx + batch_size
    batch_data = dataframe[start_idx:end_idx]

    client.insert("default.movies", batch_data.to_records(index=False).tolist(), column_names=batch_data.columns.tolist())
    print(f"Batch {i+1}/{num_batches} inserted.")
    
client.command("""
ALTER TABLE default.movies
    ADD VECTOR INDEX vector_index embeddings
    TYPE MSTG
""")

Batch 1/17 inserted.
Batch 2/17 inserted.
Batch 3/17 inserted.
Batch 4/17 inserted.
Batch 5/17 inserted.
Batch 6/17 inserted.
Batch 7/17 inserted.
Batch 8/17 inserted.
Batch 9/17 inserted.
Batch 10/17 inserted.
Batch 11/17 inserted.
Batch 12/17 inserted.
Batch 13/17 inserted.
Batch 14/17 inserted.
Batch 15/17 inserted.
Batch 16/17 inserted.
Batch 17/17 inserted.


['0', 'chi-msc-5332e745-msc-5332e745-0-0', 'OK', '0', '0']

In [21]:
def get_relevant_docs(user_query):
    query_embeddings=get_embeddings(user_query)
    results = client.query(f"""
        SELECT page_content,
        distance(embeddings, {query_embeddings}) as dist FROM default.movies ORDER BY dist LIMIT 3
    """)
    relevant_docs = []
    for row in results.named_results():
        relevant_docs.append(row['page_content'])
    return relevant_docs

In [22]:
def make_rag_prompt(query, relevant_passage):
    relevant_passage=' '.join(relevant_passage)
    prompt = (
        f"You are an helpful and informative chatbot that answers questions using text from the reference passage included below. "
        f"Respond in a complete sentence and make sure that your response is easy to understand for everyone."
        f"Maintain a friendly and conversational tone. If the passage is irrelevant, feel free to ignore it.\n\n"
        f"QUESTION: '{query}'\n"
        f"PASSAGE: '{relevant_passage}'\n\n"
        f"ANSWER:"
    )
    return prompt

In [23]:
import google.generativeai as genai
def generate_response(user_prompt):
    model = genai.GenerativeModel('gemini-pro')
    answer = model.generate_content(user_prompt)
    return answer.text

In [24]:
def generate_answer(query):
    #retrieve top 3 relevant text chunks
    relevant_text = get_relevant_docs(query)
    text=" ".join(relevant_text)
    prompt = make_rag_prompt(query, 
                             relevant_passage=relevant_text) # joining the relevant chunks to create a single passage
    answer = generate_response(prompt)

    return answer

In [31]:
answer = generate_answer(query="what are the office working hours")
print(answer)

The office working hours are as follows: 9:00 am to 5:45 pm or 9:30 am to 6:15 pm.


In [27]:
answer = generate_answer(query="what is the lunch time")
print(answer)

Lunch is between 1:00 PM and 1:30 PM.
