In [86]:

print("#### Proposition-Based Chunking ####")

from langchain.output_parsers import PydanticOutputParser
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.prompts import ChatPromptTemplate
from langchain.chains import LLMChain
from typing import List
from langchain_core.pydantic_v1 import BaseModel
from langchain import hub
import textract
import time
from dotenv import load_dotenv
import os


load_dotenv()

api_key = os.getenv('GENAI_2ND_KEY')

llm = ChatGoogleGenerativeAI(model='gemini-1.5-flash', google_api_key=api_key)



#### Proposition-Based Chunking ####


In [84]:
text = textract.process("Review Essay.docx")
text = text.decode('utf-8')
print(text)
print(type(text))

Review Essay (GES1006/GESS1004)
(A0284714Y)



Singapore is a place where people from different cultures live together. This essay talks about how Indian communities have changed over time, from when Singapore was ruled by other countries to now. By analyzing key works by Amrith, Bhattacharya, and Chacko, it aims to answer: how have Indian migrants shaped and been shaped by Singapore's socio-political landscape?

The essay looks at Amrith and Bhattacharya's stories about history to understand why Indian people moved during colonial times and what they faced. Then it talks about Chacko's ideas about what happened after colonialism ended, especially focusing on how Indian professionals fit into society. By comparing these different views, the essay shows how Indian communities have stayed strong and made their own choices over time. It also talks about how the effects of colonialism are still felt today, even as the world changes with globalization and cities growing.

Jayati Bhattachary

In [87]:
obj = hub.pull("wfh/proposal-indexing")


class Sentences(BaseModel):
    sentences: List[str]

parser = PydanticOutputParser(pydantic_object=Sentences)

prompt = ChatPromptTemplate.from_messages([
    ("system", "You are an AI assistant that extracts key propositions from text."),
    ("human", "Extract key propositions from the following text:\n\n{text}"),
    ("human", "Format your response as a JSON object with a 'sentences' key containing a list of proposition strings.")
])

chain = LLMChain(llm=llm, prompt=prompt, output_parser=parser)

def get_propositions(text):
    try:
        result = chain.run(text=text)
        return result.sentences
    except Exception as e:
        print(f"Error in extraction: {e}")
        return []

paragraphs = text.split("\n\n")
text_propositions = []

for i, para in enumerate(paragraphs):
    propositions = get_propositions(para)
    text_propositions.extend(propositions)
    print(f"Done with {i}")
    time.sleep(8)   

print(f"You have {len(text_propositions)} propositions")
#print(text_propositions)
#print(len(text_propositions))



Done with 0
Error in extraction: Invalid json output: Please provide the text you would like me to analyze. I need the text to extract key propositions from.
Done with 1
Done with 2
Done with 3
Done with 4
Done with 5
Error in extraction: Invalid json output: Please provide the text you want me to analyze. I need the text to extract key propositions from.
Done with 6
Done with 7
Done with 8
Done with 9
Done with 10
Done with 11
Done with 12
Done with 13
You have 75 propositions


In [88]:
from sentence_transformers import SentenceTransformer
import numpy


embedding_model = SentenceTransformer('Snowflake/snowflake-arctic-embed-l')


embeddings = embedding_model.encode(text_propositions, show_progress_bar=True)

embeddings_np = numpy.array(embeddings).astype('float32')

Batches: 100%|██████████| 3/3 [00:08<00:00,  2.80s/it]


In [89]:
import faiss #facebook ai similarity search 

query = "Relations between India and Singapore"


def retrieve(query, nearest_neighbours=5):

    dimension = embeddings_np.shape[1]
    index = faiss.IndexFlatL2(dimension)
    index.add(embeddings_np)

    query_embedding = embedding_model.encode(query, show_progress_bar=True)
    query_embedding_np = numpy.array([query_embedding]).astype('float32')
    distance, indices = index.search(query_embedding_np, nearest_neighbours)

    contexts = []
    for i in range(len(indices[0])):
        chunk_index = indices[0][i]
        similarity = 1 / (1 + distance[0][i])
        chunk_text = text_propositions[chunk_index]
        contexts.append(f"Rank {i+1}: {chunk_text} | Similarity: {similarity:.4f}")

    return "\n\n".join(contexts)





In [90]:
retrieved_context = retrieve(query)

Batches: 100%|██████████| 1/1 [00:01<00:00,  1.04s/it]


In [91]:
prompt_template = ChatPromptTemplate.from_template("""
Answer the following question based on the provided context:

Question: {question}

Context: {context}

Provide relevant answers to the question based on the context.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.
""")

prompt = prompt_template.format(context=retrieved_context, question=query)
print(prompt)



Human: 
Answer the following question based on the provided context:

Question: Relations between India and Singapore

Context: Rank 1: Indian communities in Singapore have undergone significant transformations throughout history. | Similarity: 0.7563

Rank 2: Indian immigration to Singapore is complex and fascinating. | Similarity: 0.7560

Rank 3: Connections between Singapore and India have shifted from labor-related aspects to entrepreneurial initiatives and economic collaboration. | Similarity: 0.7548

Rank 4: Indian communities in Singapore will continue to evolve. | Similarity: 0.7467

Rank 5: Indian communities contribute to Singapore's diverse cultural landscape. | Similarity: 0.7393

Provide relevant answers to the question based on the context.
Don’t justify your answers.
Don’t give information not mentioned in the CONTEXT INFORMATION.
Do not say "according to the context" or "mentioned in the context" or similar.



In [99]:


import os
import google.generativeai as genai


api_key = os.getenv("GENAI_API_KEY")
if not api_key:
    raise ValueError("API key not found in environment variables.")

genai.configure(api_key=api_key)

llm = genai.GenerativeModel('gemini-1.5-pro')

response = llm.generate_content(prompt)

print(response.text)


Connections between Singapore and India have shifted from labor-related aspects to entrepreneurial initiatives and economic collaboration. 
Indian immigration to Singapore is complex and fascinating. 



In [102]:
refined_prompt_template = ChatPromptTemplate.from_template("""The original query is as follows: {query}
We have provided an existing answer: {existing_answer}
We have the opportunity to refine the existing answer (only if needed) with some more context below.
------------
{context}
------------
Given the new context, refine the original answer to better answer the query. If the context isn't useful, return the original answer.
Don't mention Refined Answer
""")

refined_prompt = refined_prompt_template.format(query=query, existing_answer=response.text, context=retrieved_context)

response = llm.generate_content(refined_prompt)
print(response.text)


Connections between Singapore and India extend beyond just economic collaboration. Indian communities have been an integral part of Singapore's history, their immigration a complex and fascinating story of transformation. While initially defined by labor-related contributions,  Indian communities in Singapore have evolved to spearhead entrepreneurial initiatives, reflecting a shift in the two nations' relationship. Despite these changes, the historical and cultural ties between India and Singapore remain deeply intertwined, with Indian communities continuing to shape Singapore's diverse cultural landscape. 

