In [1]:

# Banner so the cell output shows which chunking strategy this notebook demonstrates.
print("#### Proposition-Based Chunking ####")

from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from typing import List
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub
import textract
import time
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if any) into the process environment.
load_dotenv()

# Never hardcode credentials in a notebook: read the key from the environment.
# The key that was previously committed in this cell must be treated as
# compromised and rotated.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("API key not found in environment variables.")

# Chat model used by the proposition-extraction chain.
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=api_key)



#### Proposition-Based Chunking ####


  from .autonotebook import tqdm as notebook_tqdm


In [104]:
# Extract the raw bytes of test.txt and decode them to a str in one step.
text = textract.process("test.txt").decode('utf-8')
print(text)
print(type(text))

One day, you are walking through a prehistoric forest, you hear a twig snap behind you. You turn, slowly, hoping that it isn't what you think it is. Standing behind you is a Stegosaurus, chomping away on some leafy ferns. You breathe a sigh of relief. You were afraid it was a predator with its eye on you as a tasty snack. Of course, that was millions of years ago, so it could only happen in your imagination. In fact it was so long ago that there were no people around at all; there weren't even any apes or monkeys yet!

The prehistoric world of the dinosaurs is both strange and exciting. Many things were very different from what you see today. Some dinosaurs stood taller than buildings, and others weighed as much as your entire family put together. Some dinosaurs were small enough to fit in your backpack. Some would not hesitate to eat you on the spot if they were hungry. Others spent all day chomping on ferns and other plants. The one kind of dinosaur that never existed was a dull dino

In [105]:
# Pulls the "wfh/proposal-indexing" prompt from the LangChain hub.
# NOTE(review): `obj` is not referenced by any later cell visible here (the
# prompt used below is hand-written) — confirm whether this pull is still needed.
obj = hub.pull("wfh/proposal-indexing")


class Sentences(BaseModel):
    """Schema for the extraction result: a list of proposition strings."""
    # Matches the 'sentences' key the prompt asks the model to return.
    sentences: List[str]

# Parser that turns the model's reply into a `Sentences` instance.
parser = PydanticOutputParser(pydantic_object=Sentences)

# Chat prompt: a system role message, the extraction request carrying the
# {text} placeholder, and a final instruction describing the JSON shape.
prompt = ChatPromptTemplate.from_messages(
    [
        ("system", "You are an AI assistant that extracts key propositions from text."),
        ("human", "Extract key propositions from the following text:\n\n{text}"),
        ("human", "Format your response as a JSON object with a 'sentences' key containing a list of proposition strings."),
    ]
)

# Wire prompt -> model -> parser into a single callable chain.
chain = LLMChain(prompt=prompt, llm=llm, output_parser=parser)

def get_propositions(text):
    """Extract proposition sentences from ``text`` using the module-level ``chain``.

    Args:
        text: The passage to decompose into propositions.

    Returns:
        The list of proposition strings found on the parsed result
        (``result.sentences``). An empty list is returned when ``text`` is
        empty/whitespace-only or when the extraction fails.
    """
    # Blank input (e.g. an empty piece produced by splitting on "\n\n") has
    # nothing to extract; skip the LLM round-trip entirely.
    if not text or not text.strip():
        return []

    try:
        result = chain.run(text=text)
        return result.sentences
    except Exception as e:
        # Deliberately best-effort: report the failure and let the caller
        # continue with the remaining paragraphs.
        print(f"Error in extraction: {e}")
        return []

# Pause between consecutive LLM calls, in seconds.
# NOTE(review): presumably chosen to stay under the API rate limit — confirm.
REQUEST_PAUSE_SECONDS = 8

# Split on blank lines and drop empty pieces so no LLM call (and no pause)
# is wasted on paragraphs that contain no text.
paragraphs = [para for para in text.split("\n\n") if para.strip()]
text_propositions = []

for i, para in enumerate(paragraphs):
    propositions = get_propositions(para)
    text_propositions.extend(propositions)
    print(f"Done with {i}")
    # No further request follows the last paragraph, so skip the final pause.
    if i < len(paragraphs) - 1:
        time.sleep(REQUEST_PAUSE_SECONDS)

print(f"You have {len(text_propositions)} propositions")



Done with 0
Done with 1
You have 20 propositions


In [108]:
from sentence_transformers import SentenceTransformer
import numpy


# Sentence-embedding model used for both the propositions and the query.
embedding_model = SentenceTransformer('Snowflake/snowflake-arctic-embed-l')

# Encode every extracted proposition in one batch call.
embeddings = embedding_model.encode(text_propositions, show_progress_bar=True)

# float32 copy of the embeddings; this is the array added to the FAISS index.
embeddings_np = numpy.asarray(embeddings).astype(numpy.float32)

Batches: 100%|██████████| 1/1 [00:45<00:00, 45.68s/it]


In [109]:
import faiss #facebook ai similarity search 

# Query used by the retrieval and answer cells below.
# NOTE(review): "safasf" looks like a placeholder — replace with a real question.
query = "safasf"


def retrieve(query, nearest_neighbours=5):
    """Return the propositions closest to ``query`` as one formatted string.

    Builds a flat L2 FAISS index over the module-level ``embeddings_np``,
    embeds ``query`` with ``embedding_model`` and looks up the nearest
    entries of ``text_propositions``.

    Args:
        query: The question text to embed and search for.
        nearest_neighbours: Maximum number of propositions to return.

    Returns:
        Lines of the form ``"Rank N: <proposition> | Similarity: <score>"``
        joined by blank lines, where score is ``1 / (1 + distance)`` for the
        distance reported by the index. Returns an empty string when
        ``nearest_neighbours`` is not positive or there is nothing to search.
    """
    if nearest_neighbours <= 0:
        return ""
    # An empty corpus yields an embeddings array with no second dimension.
    if len(text_propositions) == 0:
        return ""

    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)

    # FAISS pads the result with label -1 when asked for more neighbours than
    # it holds; a -1 would silently index the LAST proposition in Python, so
    # never request more than the index contains.
    k = min(nearest_neighbours, index.ntotal)

    query_embedding = embedding_model.encode(query, show_progress_bar=True)
    query_embedding_np = numpy.array([query_embedding]).astype('float32')
    distances, indices = index.search(query_embedding_np, k)

    contexts = []
    for rank, (chunk_index, dist) in enumerate(zip(indices[0], distances[0]), start=1):
        if chunk_index < 0:
            # Defensive: skip "no result" padding entries.
            continue
        similarity = 1 / (1 + dist)
        chunk_text = text_propositions[chunk_index]
        contexts.append(f"Rank {rank}: {chunk_text} | Similarity: {similarity:.4f}")

    return "\n\n".join(contexts)





In [110]:
# Look up the propositions nearest to `query` (default of 5 neighbours) as a
# single formatted string for use in the answer prompt.
retrieved_context = retrieve(query)

Batches: 100%|██████████| 1/1 [00:13<00:00, 13.86s/it]


In [111]:
# Answer prompt: the question and the retrieved context are substituted into
# the {question} and {context} placeholders.
prompt_template = ChatPromptTemplate.from_template("""
Answer the following question based on the provided context:

Question: {question}

Context: {context}

Provide relevant answers to the question based on the context.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
""")

# Render the template to text; the printed result begins with a "Human: " prefix.
# NOTE(review): this rebinds `prompt`, which an earlier cell used for the
# extraction ChatPromptTemplate — re-run that cell before rebuilding the chain.
prompt = prompt_template.format(context=retrieved_context, question=query)
print(prompt)



Human: 
Answer the following question based on the provided context:

Question: safasf

Context: Rank 1: You hope it isn't a predator. | Similarity: 0.6786

Rank 2: The event happened millions of years ago. | Similarity: 0.6724

Rank 3: There were no people around millions of years ago. | Similarity: 0.6665

Rank 4: Many things were very different from what you see today. | Similarity: 0.6662

Rank 5: There were no apes or monkeys millions of years ago. | Similarity: 0.6652

Provide relevant answers to the question based on the context.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.



In [112]:


import os
import google.generativeai as genai


# Read the key from the environment instead of embedding it in the notebook.
# The key that was previously committed in this cell must be treated as
# compromised and rotated.
api_key = os.getenv("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("API key not found in environment variables.")

genai.configure(api_key=api_key)

# NOTE: this rebinds `llm`, which an earlier cell set to the LangChain chat
# model; re-run that cell before rebuilding the extraction chain.
llm = genai.GenerativeModel('gemini-1.5-pro')

# Send the fully rendered answer prompt and show the model's reply.
response = llm.generate_content(prompt)

print(response.text)


I'm sorry, but I cannot answer the question. The provided context does not contain any relevant information to answer "safasf". 



In [18]:
# Refinement prompt: shows the model the original query, the answer produced so
# far, and the retrieved context, and asks for an improved answer.
refined_prompt_template = ChatPromptTemplate.from_template("""The original query is as follows: {query}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Don't mention Refined Answer
""")

# The existing answer is the text of the current `response`.
refined_prompt = refined_prompt_template.format(query=query, existing_answer=response.text, context=retrieved_context)

# NOTE(review): `response` is overwritten here, so re-running this cell feeds the
# previously refined answer back in as the "existing answer".
# NOTE(review): the saved output of this cell mentions a different query than
# the one defined in this notebook — it looks stale; re-run on a fresh kernel.
response = llm.generate_content(refined_prompt)
print(response.text)


The provided context does not contain the answer to the question "Relations between India and Singapore". 

