In [74]:

print("#### Proposition-Based Chunking ####")

from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from typing import List
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub
import textract
import time
from dotenv import load_dotenv
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Load variables from a local .env file (if one exists) into os.environ.
load_dotenv()

# Never embed credentials in a notebook: read the key from the environment
# (populated by load_dotenv above or by the shell). The key that used to be
# hard-coded here has been exposed and should be revoked.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError(
        "GOOGLE_API_KEY is not set; add it to your environment or .env file."
    )

# LangChain chat-model wrapper used by the LLMChain objects in this notebook.
llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=api_key)



#### Proposition-Based Chunking ####


In [81]:
# Extract the contents of test.txt with textract (returned as bytes),
# decode them as UTF-8 and echo the resulting string.
text = textract.process("test.txt").decode('utf-8')
print(text)

Dinosaur, (clade Dinosauria), the common name given to a group of reptiles, often very large, that first appeared roughly 245 million years ago (near the beginning of the Middle Triassic Epoch) and thrived worldwide for nearly 180 million years. Most died out by the end of the Cretaceous Period, about 66 million years ago, but many lines of evidence now show that one lineage evolved into birds about 155 million years ago.

The name dinosaur comes from the Greek words deinos (“terrible” or “fearfully great”) and sauros (“reptile” or “lizard”). The English anatomist Richard Owen proposed the formal term Dinosauria in 1842 to include three giant extinct animals (Megalosaurus, Iguanodon, and Hylaeosaurus) represented by large fossilized bones that had been unearthed at several locations in southern England during the early part of the 19th century. Owen recognized that these reptiles were far different from other known reptiles of the present and the past for three reasons: they were large

In [75]:
# Schema and parser for the proposition chain's structured output. They are
# defined in this cell, before any chain is built, so the cell runs on a fresh
# kernel instead of relying on `parser` leaking in from a cell executed earlier.
class Sentences(BaseModel):
    sentences: List[str]

parser = PydanticOutputParser(pydantic_object=Sentences)

# --- Summary chain -----------------------------------------------------------
summary_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an AI assistant that extracts summaries from text."),
    ("human", "Summarize the following text:\n\n{text}"),
    ("human", "Format your response as a concise summary.")
])

# No output_parser here: the prompt asks for free text, which can never be
# validated against the `Sentences` schema.
chain_summary = LLMChain(llm=llm, prompt=summary_prompt)

# --- Sentiment chain ---------------------------------------------------------
sentiment_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an AI assistant that analyzes the sentiment of text."),
    ("human", "Analyze the sentiment of the following text:\n\n{text}"),
    ("human", "Format your response as a JSON object with a 'sentiment' key containing the sentiment analysis.")
])

# No output_parser here either: the requested JSON has a 'sentiment' key, not
# the 'sentences' list that `parser` requires.
chain_sentiment = LLMChain(llm=llm, prompt=sentiment_prompt)

# --- Key-proposition chain ---------------------------------------------------
key_proposition_prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an AI assistant that extracts key propositions from text."),
    ("human", "Extract key propositions from the following text:\n\n{text}"),
    ("human", "Format your response as a JSON object with a 'sentences' key containing a list of proposition strings.")
])

# The requested JSON shape matches `Sentences`, so the parsed result exposes a
# `.sentences` list.
chain_key_proposition = LLMChain(llm=llm, prompt=key_proposition_prompt, output_parser=parser)



In [82]:
# Pull the "wfh/proposal-indexing" object from the LangChain hub.
# NOTE(review): `obj` is not referenced anywhere else in the visible notebook;
# confirm whether this network call is still needed.
obj = hub.pull("wfh/proposal-indexing")


# Pydantic schema for the structured LLM output: one list of proposition strings.
class Sentences(BaseModel):
    sentences: List[str]

# Output parser that validates LLM output against the Sentences schema.
parser = PydanticOutputParser(pydantic_object=Sentences)

# Alias read by get_propositions below. chain_key_proposition is built in a
# different cell and must already exist when this cell runs.
chain = chain_key_proposition

def get_propositions(text):
    """Extract a list of proposition strings from ``text`` via the LLM chain.

    Args:
        text: Passage to break down into propositions.

    Returns:
        The ``sentences`` list of the chain's parsed result, or an empty list
        when ``text`` is blank or the extraction fails.
    """
    # Blank input cannot yield propositions, so skip the LLM round-trip.
    if not text or not text.strip():
        return []
    try:
        result = chain.run(text=text)
        return result.sentences
    except Exception as e:
        # Best-effort by design: report the failure and let the caller carry
        # on with the remaining paragraphs.
        print(f"Error in extraction: {e}")
        return []

# Pause between consecutive LLM calls, in seconds (presumably to stay under
# the API rate limit).
REQUEST_DELAY_SECONDS = 8

# Paragraphs are the blank-line-separated chunks of the source text.
paragraphs = text.split("\n\n")
text_propositions = []

for i, para in enumerate(paragraphs):
    # Blank chunks (e.g. from three or more consecutive newlines) carry no
    # content, so don't spend an LLM call and a delay on them.
    if not para.strip():
        continue
    propositions = get_propositions(para)
    text_propositions.extend(propositions)
    print(f"Done with {i}")
    # No need to wait once the final paragraph has been processed.
    if i < len(paragraphs) - 1:
        time.sleep(REQUEST_DELAY_SECONDS)

print(f"You have {len(text_propositions)} propositions")
#print(text_propositions)
#print(len(text_propositions))



Done with 0
Done with 1
Done with 2
Done with 3
You have 31 propositions


In [83]:
from sentence_transformers import SentenceTransformer
import numpy


# Sentence-embedding model; retrieve() reuses it to embed the query.
embedding_model = SentenceTransformer(
    'Snowflake/snowflake-arctic-embed-l'
)

# Encode every extracted proposition, showing a progress bar while it runs.
embeddings = embedding_model.encode(
    text_propositions,
    show_progress_bar=True,
)

# float32 copy of the embeddings; this is the array added to the FAISS index.
embeddings_np = numpy.array(embeddings).astype(numpy.float32)

Batches: 100%|██████████| 1/1 [00:04<00:00,  4.20s/it]


In [84]:
import faiss #facebook ai similarity search 

query = "What are the earliest evidences of dinosaurs?"


def retrieve(query, nearest_neighbours=5):
    """Return the propositions closest to ``query`` as a ranked, formatted string.

    Builds an exact L2 FAISS index over the module-level ``embeddings_np``,
    embeds ``query`` with ``embedding_model`` and looks the hits up in
    ``text_propositions``.

    Args:
        query: Natural-language question to search for.
        nearest_neighbours: Maximum number of propositions to return.

    Returns:
        One ``"Rank N: <proposition> | Similarity: <score>"`` entry per hit,
        joined by blank lines. An empty string when nothing is indexed or
        ``nearest_neighbours`` is not positive.
    """
    # Nothing indexed -> nothing to retrieve. This also avoids reading
    # shape[1] from an empty or 1-D array.
    if embeddings_np.ndim != 2 or embeddings_np.shape[0] == 0:
        return ""

    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)

    # FAISS pads missing neighbours with label -1, which Python indexing would
    # silently resolve to the *last* proposition. Never ask for more
    # neighbours than there are indexed vectors.
    k = min(nearest_neighbours, index.ntotal)
    if k <= 0:
        return ""

    query_embedding = embedding_model.encode(query, show_progress_bar=True)
    query_embedding_np = numpy.array([query_embedding]).astype('float32')
    distances, indices = index.search(query_embedding_np, k)

    contexts = []
    for rank, (chunk_index, dist) in enumerate(zip(indices[0], distances[0]), start=1):
        if chunk_index < 0:
            # Defensive: skip any padding label that still slipped through.
            continue
        # Map the non-negative distance to a (0, 1] score; smaller distance
        # means a higher score.
        similarity = 1 / (1 + dist)
        chunk_text = text_propositions[chunk_index]
        contexts.append(f"Rank {rank}: {chunk_text} | Similarity: {similarity:.4f}")

    return "\n\n".join(contexts)





In [86]:
retrieved_context = retrieve(query)

Batches: 100%|██████████| 1/1 [00:00<00:00,  1.20it/s]


In [63]:
# Generic question-answering prompt: the model is told to answer strictly
# from the retrieved context, without justification or meta-phrases.
prompt_template = ChatPromptTemplate.from_template(
    """
Answer the following question based on the provided context:

Question: {question}

Context: {context}

Provide relevant answers to the question based on the context.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
"""
)

# Fill the template with the current query and its retrieved context, then
# show the final prompt text.
prompt = prompt_template.format(
    question=query,
    context=retrieved_context,
)
print(prompt)



Human: 
Answer the following question based on the provided context:

Question: What are the basic eligibility criteria for an individual to apply for Indian citizenship ?

Context: Rank 1: Any person born in India as defined in the Government of India Act, 1935 (as originally enacted), or whose parents or grandparents were born in India, and who is ordinarily residing in a country outside India, can be deemed a citizen of India. | Similarity: 0.7431

Rank 2: A person cannot be a citizen of India under Article 5, 6, or 8 if they voluntarily acquired citizenship of a foreign state. | Similarity: 0.7258

Rank 3: A person is considered an Overseas Citizen of India if either of their parents was born in the territory of India. | Similarity: 0.7229

Rank 4: A person is considered an Overseas Citizen of India if they were born in the territory of India. | Similarity: 0.7211

Rank 5: A person who has migrated to India from Pakistan shall be deemed a citizen of India at the commencement of thi

In [64]:
# Legal-assistant prompt. Every instruction restricts the answer to the
# retrieved context: the original third bullet told the model to *add* facts
# that are not in the context, contradicting the bullets after it and
# inviting hallucinated legal information.
prompt_template_legal = ChatPromptTemplate.from_template("""
### Legal Assistance Query

You are a legal assistant. Answer the following query based on the provided context.                                                     

**Query:**
{question}

**Context:**
{context}

### Instructions:
- Provide a clear and concise answer to the query.
- Add relevant information based on the context.
- Do not mention additional facts or details that are not included in the context. 
- Do not include irrelevant information that is out of context of the query.
- Do not go beyond the scope of the context.
- Do not provide output that does not contain the context.
- Avoid phrases like "according to the context" or similar.
                                                         
""")


# Fill the template with the current query and its retrieved context, then
# show the final prompt text.
prompt = prompt_template_legal.format(context=retrieved_context, question=query)
print(prompt)



Human: 
### Legal Assistance Query

You are a legal assistant. Answer the following query based on the provided context.                                                     

**Query:**
What are the basic eligibility criteria for an individual to apply for Indian citizenship ?

**Context:**
Rank 1: Any person born in India as defined in the Government of India Act, 1935 (as originally enacted), or whose parents or grandparents were born in India, and who is ordinarily residing in a country outside India, can be deemed a citizen of India. | Similarity: 0.7431

Rank 2: A person cannot be a citizen of India under Article 5, 6, or 8 if they voluntarily acquired citizenship of a foreign state. | Similarity: 0.7258

Rank 3: A person is considered an Overseas Citizen of India if either of their parents was born in the territory of India. | Similarity: 0.7229

Rank 4: A person is considered an Overseas Citizen of India if they were born in the territory of India. | Similarity: 0.7211

Rank 5: 

In [87]:
# Tutor-style prompt: answer the student's query from the retrieved context,
# in plain language, with examples and study tips.
prompt_template_quiz = ChatPromptTemplate.from_template(
    """
### Student Doubt Solver

You are a knowledgeable tutor. Answer the following query based on the provided context.

**Query:**
{question}

**Context:**
{context}

### Instructions:
- Provide a clear and concise answer to the query.
- Explain concepts in an easy-to-understand manner.
- Include relevant examples or additional information based on the context.
- Offer tips or additional resources that could help the student.
- Avoid including information that is out of the context of the query.
- Do not go beyond the scope of the context.
- Avoid using overly technical language unless necessary, and provide explanations for any technical terms used.

"""
)

# Render the template for the current query and context and display it.
prompt = prompt_template_quiz.format(
    question=query,
    context=retrieved_context,
)
print(prompt)


Human: 
### Student Doubt Solver

You are a knowledgeable tutor. Answer the following query based on the provided context.

**Query:**
What are the earliest evidences of dinosaurs?

**Context:**
Rank 1: Dinosaurs first appeared near the beginning of the Middle Triassic Epoch. | Similarity: 0.7490

Rank 2: Dinosaurs first appeared roughly 245 million years ago. | Similarity: 0.7412

Rank 3: The earliest verifiable published record of dinosaur remains is a note in the 1820 American Journal of Science and Arts by Nathan Smith. | Similarity: 0.7085

Rank 4: One lineage of dinosaurs evolved into birds about 155 million years ago. | Similarity: 0.7066

Rank 5: Most dinosaurs died out about 66 million years ago. | Similarity: 0.6996

### Instructions:
- Provide a clear and concise answer to the query.
- Explain concepts in an easy-to-understand manner.
- Include relevant examples or additional information based on the context.
- Offer tips or additional resources that could help the student.


In [88]:


import os
import google.generativeai as genai


# Read the key from the environment instead of embedding it in the notebook.
# The key that used to be hard-coded here has been exposed and should be
# revoked. With a literal in place the check below could never fire.
api_key = os.environ.get("GOOGLE_API_KEY")
if not api_key:
    raise ValueError("API key not found in environment variables.")

genai.configure(api_key=api_key)

# NOTE: this rebinds `llm`. Earlier in the notebook it refers to the LangChain
# ChatGoogleGenerativeAI wrapper; from here on it is the google.generativeai
# model, so re-running the chain-building cell after this one would pick up
# the wrong object.
llm = genai.GenerativeModel('gemini-1.5-pro')

# Send the most recently rendered prompt string to the model.
response = llm.generate_content(prompt)

print(response.text)


The earliest dinosaurs appeared around **245 million years ago**. This was near the beginning of the **Middle Triassic Epoch**, a period of time on the geological timescale.  

Want to learn more about the Triassic period and the rise of dinosaurs?  Check out your textbook or search online for resources using keywords like "Triassic Period" and "Early Dinosaurs"! 



In [89]:
# Refinement prompt: shows the model the query, the current answer and the
# retrieved context, and asks it to improve the answer only if the context helps.
refined_prompt_template = ChatPromptTemplate.from_template(
    """The original query is as follows: {query}
We have provided an existing answer: {existing_answer}
                                                           
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context}
------------
                                                           
Instructions:                                                           
- Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
- Directly mention the refined answer without saying "Here is the refined answer" or similar.
                                                                            
"""
)

# The existing answer is the text of the current `response`.
refined_prompt = refined_prompt_template.format(
    context=retrieved_context,
    existing_answer=response.text,
    query=query,
)

# NOTE(review): `response` is overwritten here, so re-running this cell feeds
# the already-refined answer back in as the "existing answer".
response = llm.generate_content(refined_prompt)
print(response.text)



The earliest dinosaurs appeared around **245 million years ago** near the beginning of the **Middle Triassic Epoch**.  Some of the earliest dinosaurs include **Herrerasaurus**, **Eoraptor**, and **Coelophysis**. These early dinosaurs were relatively small, many being bipedal carnivores, though some, like **Plateosaurus**, were larger herbivores.  

Want to learn more about the Triassic period and the rise of dinosaurs?  Check out your textbook or search online for resources using keywords like "Triassic Period" and "Early Dinosaurs"! 

