In [1]:
#pip install langchain==0.1.6
#pip install sqlalchemy==2.0.1

#pip install langchain_aws streamlit streamlit_chat

import boto3  # AWS SDK for Python(integrate AWS services for Python)
from langchain.prompts import PromptTemplate
from langchain_community.embeddings import BedrockEmbeddings # To access Bedrock's Titan Text embedding model.BedrockEmbeddings uses AWS Bedrock API to generate embeddings for given text for saving to vectordb 
from langchain_community.vectorstores.faiss import FAISS
from langchain_aws import ChatBedrock
from langchain.chains import ConversationalRetrievalChain  # Langchain fuction to create chatbot that can remember past interactions
import streamlit as st
from streamlit_chat import message

In [2]:
## Add Contexual compressor
from langchain.retrievers import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import LLMChainExtractor

In [2]:
pip show sqlalchemy
pip show langchain_aws
pip list | grep langchain

SyntaxError: invalid syntax (2448159784.py, line 1)

In [15]:
def pretty_print_docs(docs):
    """Print every document's page_content under a numbered heading.

    Each entry is rendered as "Document<N>:" followed by a blank line and the
    document's ``page_content``; consecutive entries are separated by a line
    of 100 dashes. Everything is written with a single ``print`` call.

    Args:
        docs: iterable of objects exposing a string ``page_content`` attribute.
    """
    divider = "\n" + "-" * 100 + "\n"
    sections = []
    for position, doc in enumerate(docs, start=1):
        sections.append(f"Document{position}:\n\n" + doc.page_content)
    print(divider.join(sections))

In [7]:
# Relative path of the persisted FAISS index that is loaded further down.
DB_FAISS_PATH = 'vectorstore/db_faiss'

# Initialise a boto3 session with no explicit arguments, so credentials and
# region are whatever the ambient AWS configuration provides.
boto_session = boto3.Session()
# NOTE(review): presumably None when no default region is configured —
# confirm the runtime environment sets one before relying on it.
aws_region = boto_session.region_name
# bedrock-runtime client used to call Bedrock models; the same client is
# handed to both the chat model (load_llm) and the embeddings model.
br_runtime = boto_session.client(service_name='bedrock-runtime', region_name=aws_region)

#Loading the model
def load_llm(model_id='anthropic.claude-3-sonnet-20240229-v1:0',
             temperature=0.12,
             max_tokens=2000,
             client=None):
    """Create the Bedrock chat model used by the retrieval chains.

    Called with no arguments it builds exactly the model the notebook has
    always used; the keyword parameters allow trying another model or other
    generation settings without editing this function.

    Args:
        model_id: Bedrock model identifier passed to ChatBedrock.
        temperature: value placed under "temperature" in model_kwargs.
        max_tokens: value placed under "max_tokens" in model_kwargs.
        client: bedrock-runtime client to use; when None the module-level
            ``br_runtime`` client is used.

    Returns:
        The configured ChatBedrock instance.
    """
    llm = ChatBedrock(
        model_id=model_id,
        model_kwargs={"temperature": temperature, "max_tokens": max_tokens},
        # Fall back to the shared notebook client unless the caller supplies one.
        client=br_runtime if client is None else client,
    )
    return llm


# Prompt text for the Data Discovery assistant. It carries three placeholders,
# {context}, {chat_history} and {question}, which are the input variables
# declared in set_custom_prompt().
# NOTE(review): the text contains typos ("assistance", "charater") and missing
# spaces after some periods. They are left untouched here because this string
# is sent to the model verbatim; correct them deliberately and re-check answers.
custom_prompt_template = """
    You are a Data Discovery AI assistance created by Fannie Mae.Your goal is to help users with data related questions. 
    You should maintain a friendly customer service tone.

    Here are some important rules for the interaction:
    - Use English language for answering questions.
    - Always stay in charater
    - Look across all datasets before answering the question.
    - Provide top 20 records on questions with multiple records
    - If someone asks irrelevant question, say, I am Data Discovery AI assistant and can't help with above question.
    - Be specific and don't add extra information in the response that is not asked for.

    How do you respond to user's question?

    Think about your answer first before you respond.You will look around all datasets and will use the context to answer the user's question.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    
    Context: {context}
    Chat History: {chat_history}
    Question: {question}

    Only return the helpful answer below and nothing else.
    Helpful answer:
"""


def set_custom_prompt():
    """
    Build the QA-retrieval prompt template used for each vectorstore.

    Wraps the module-level ``custom_prompt_template`` text in a PromptTemplate
    and declares the three variables it expects: context, question and
    chat_history.

    Returns:
        The constructed PromptTemplate.
    """
    expected_variables = ['context', 'question', 'chat_history']
    return PromptTemplate(
        input_variables=expected_variables,
        template=custom_prompt_template,
    )


# Titan text-embedding model, called through the shared bedrock-runtime client.
embeddings = BedrockEmbeddings(model_id='amazon.titan-embed-text-v1', client=br_runtime)
# Disabled: in this environment FAISS rejected the
# allow_dangerous_deserialization keyword with a TypeError (see the recorded
# cell output), so the index is loaded without it in a later cell.
#db = FAISS.load_local(DB_FAISS_PATH, embeddings, allow_dangerous_deserialization=True)
#llm = load_llm()


TypeError: FAISS.__init__() got an unexpected keyword argument 'allow_dangerous_deserialization'

In [9]:
# Load the persisted FAISS index from DB_FAISS_PATH, pairing it with the
# Titan embeddings object.
# NOTE(review): load_local presumably deserializes pickled data from disk —
# only point DB_FAISS_PATH at index files you created yourself; confirm
# against the FAISS vector store documentation for the installed version.
db = FAISS.load_local(DB_FAISS_PATH, embeddings)

In [16]:
# Base retriever over the FAISS index, requesting k=30 hits per query.
retriever = db.as_retriever(search_kwargs={'k':30})
# Sample retrieval to inspect what the base retriever returns for a broad question.
docs = retriever.get_relevant_documents(query="list all tables") 
# Uncomment to print the retrieved documents with pretty_print_docs.
#pretty_print_docs(docs)

Document1:

﻿Dataset Reg ID: DSET000111
Dataset Name: Participant Demographic Data
Dataset Description: This table provides comparison of demographic group prevalence in Orgs. Volunteer population to that of greater US population
Dataset Type: EDL
Cloud ID: BPO2233
Data Source System: Demographics
Dataset Layer: S3
Dataset Classification Type: External
Topic: US Census Bureau
Topic Category: Demographic Analysis
Business Owner: toolika@gmail.com
Technical POC: abc@gmail.com
S3 Bucket/AWS Service: s3-us-east-1
Structured Data: Structured
Cloud Database Name: cloud_participant_demographic
Table Name: ameri_part_demo_grp
File Format: PARQUET
Refresh Type: Snapshot
Refresh Frequency: Yearly
Dataset Data Retention Period: 6 Years
----------------------------------------------------------------------------------------------------
Document2:

﻿Attribute Reg ID: 
Physical Attribute Name: curr_list_strt_date
Logical Attribute Name: curr list strt date
Description: The agreed upon date, between 

In [18]:
# Instantiate the Bedrock chat model through load_llm().
llm = load_llm()
# Build the LLM-backed document compressor from that chat model.
compressor = LLMChainExtractor.from_llm(llm=llm)
# Compression retriever = base retriever + compressor: documents fetched by
# `retriever` are handed to `compressor` before being returned.
compression_retriever = ContextualCompressionRetriever(base_retriever=retriever,
                                                       base_compressor=compressor)

In [19]:
#default compressor prompt
#print(compressor.llm_chain.prompt.template)

Given the following question and context, extract any part of the context *AS IS* that is relevant to answer the question. If none of the context is relevant return NO_OUTPUT. 

Remember, *DO NOT* edit the extracted parts of the context.

> Question: {question}
> Context:
>>>
{context}
>>>
Extracted relevant parts:


In [20]:
from langchain.retrievers.document_compressors import EmbeddingsFilter

# Alternative compressor configured with the Titan embeddings object rather
# than the chat model. (The variable name is misspelled; it is kept as-is
# because it is module-level state.)
embdeddings_filter = EmbeddingsFilter(embeddings=embeddings)
# Same base retriever as before, now paired with the embeddings-based filter.
compression_retriever_filter = ContextualCompressionRetriever(base_retriever=retriever,
                                                       base_compressor=embdeddings_filter)
# Sample query through the filtering retriever for comparison with `docs`.
compressed_docs = compression_retriever_filter.get_relevant_documents(query="List all tables")
# Uncomment to print the filtered documents with pretty_print_docs.
#pretty_print_docs(compressed_docs)


Document1:

﻿Dataset Reg ID: DSET000113
Dataset Name: Loan Performance Data
Dataset Description: This table provides monthly loan-level credit performance data through the Performance Cutoff Date up to property disposition is reported for all loans included in the performance files
Dataset Type: EDL
Cloud ID: BPO2235
Data Source System: Loan Performance
Dataset Layer: S3
Dataset Classification Type: External
Topic: Freddie Loan Population
Topic Category: Loan Performance Data
Business Owner: toolika@gmail.com
Technical POC: ghi@gmail.com
S3 Bucket/AWS Service: s3-us-east-1
Structured Data: Structured
Cloud Database Name: freddie_performance_data
Table Name: fre_performance_data
File Format: PARQUET
Refresh Type: Snapshot
Refresh Frequency: Quarterly
Dataset Data Retention Period: Not defined
----------------------------------------------------------------------------------------------------
Document2:

﻿Dataset Reg ID: DSET000111
Dataset Name: Participant Demographic Data
Dataset Descr

In [25]:
# Build the custom prompt. It is NOT passed to the RetrievalQA chain below,
# which therefore runs with its default prompt; `prompt` is only referenced by
# the disabled ConversationalRetrievalChain draft in the final cell.
# NOTE(review): the custom template also expects {chat_history}, which this
# chain is not given here — confirm before wiring the prompt into RetrievalQA.
prompt = set_custom_prompt()
from langchain.chains import RetrievalQA
# "stuff" chain over the embeddings-filtered retriever; verbose=True prints
# the chain entry/exit trace.
qa = RetrievalQA.from_chain_type(llm=llm,
                                 chain_type="stuff",
                                 retriever=compression_retriever_filter,
                                 verbose=True)
# Ask a question. invoke() replaces the direct call qa("..."), which goes
# through the deprecated Chain.__call__; the result is still a dict with the
# 'query' and 'result' keys.
qa.invoke({"query": "List all tables"})



[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m


{'query': 'List all tables',
 'result': 'Based on the context provided, here are the table names mentioned:\n\n1. fre_performance_data\n2. ameri_part_demo_grp\n3. fre_origination_data'}

In [None]:
# Disabled draft: the ConversationalRetrievalChain setup below is wrapped in a
# string literal, so it is never executed. It is the only place where the
# custom `prompt` is passed to a chain (via combine_docs_chain_kwargs).
"""
qa_chain = ConversationalRetrievalChain.from_llm(llm=llm,
                                                 chain_type='stuff',
                                                 retriever=compression_retriever_filter,
                                                 verbose=True,
                                                 combine_docs_chain_kwargs={'prompt': prompt},
                                                 return_source_documents=True,
                                                 output_key='answer'
                                                )
                                                """