In [None]:
# !pip install groq qdrant-client
# !pip install langchain-groq
# !pip install langchainhub

# Imports

In [1]:
import os
from PyPDF2 import PdfReader
import numpy as np
from langchain_community.vectorstores import Qdrant
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import CharacterTextSplitter
import numpy as np
from langchain.tools.retriever import create_retriever_tool
from langchain.agents import Tool
from langchain import hub
from langchain.agents import AgentExecutor, create_react_agent
from langchain_community.llms import OpenAI
from langchain_groq import ChatGroq
import gradio as gr
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

# Extracting Text from PDFs

In [2]:
def extract_text_from_pdf(pdf_path):
    """
    Extract text content from a PDF file.

    Args:
    - pdf_path (str): The path to the PDF file.

    Returns:
    - str: The extracted text content.
    """
    reader = PdfReader(pdf_path)
    extracted_text = ""
    for page in reader.pages:
        extracted_text += page.extract_text()
    return extracted_text

def extract_text_from_pdfs_in_directory(directory):
    """
    Extract text content from all PDF files in a directory and save as text files.

    Args:
    - directory (str): The path to the directory containing PDF files.
    """
    for filename in os.listdir(directory):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(directory, filename)
            extracted_text = extract_text_from_pdf(pdf_path)
            txt_filename = os.path.splitext(filename)[0] + ".txt"
            txt_filepath = os.path.join(directory, txt_filename)
            with open(txt_filepath, "w") as txt_file:
                txt_file.write(extracted_text)

# Specify the directory containing PDF files
directory_path = "Docs/"

# Extract text from PDFs in the directory and save as text files
extract_text_from_pdfs_in_directory(directory_path)


# Generating Embedding 

In [3]:
GROQ_API_KEY = "<GROQ_API_KEY>"

In [4]:
# Step 1: List all .txt files in the directory
directory_path = "Docs"
txt_files = [file for file in os.listdir(directory_path) if file.endswith('.txt')]
print(txt_files)

['Facebook Terms of Usage.txt', 'SLSHandbook Contract Law.txt', 'Google Terms of Service_en_in.txt', 'OPENAI Terms of use.txt']


In [5]:
all_documents = {}
for txt_file in txt_files:
    loader = TextLoader(os.path.join(directory_path, txt_file))
    documents = loader.load()

    # Step 2: Split documents into chunks and add metadata
    text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=100, separator="\n")
    docs = text_splitter.split_documents(documents)
    for doc in docs:
        doc.metadata["source"] = txt_file  # Add source metadata

    all_documents[txt_file] = docs

Created a chunk of size 1028, which is longer than the specified 1000


In [6]:
# Initialize the TextEmbedding model
from langchain.embeddings import HuggingFaceEmbeddings

# Step 3: Initialize the TextEmbedding model
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

In [7]:
# Step 4: Create Qdrant vector store collections for each document
qdrant_collections = {}
for txt_file in txt_files:
    qdrant_collections[txt_file] = Qdrant.from_documents(
        all_documents[txt_file],
        embeddings,
        location=":memory:", 
        collection_name=txt_file,
    )

In [8]:
for txt_file in txt_files:
    print(f"Collection: {qdrant_collections[txt_file].collection_name}")

Collection: Facebook Terms of Usage.txt
Collection: SLSHandbook Contract Law.txt
Collection: Google Terms of Service_en_in.txt
Collection: OPENAI Terms of use.txt


In [9]:
retriever = {}
for txt_file in txt_files:
    retriever[txt_file] = qdrant_collections[txt_file].as_retriever()

# Setting ReAct Agents

In [32]:
def get_age(name: str, person_database: dict) -> int:
    """
    Get the age of a person from the database.

    Args:
    - name (str): The name of the person.
    - person_database (dict): A dictionary containing person information.

    Returns:
    - int: The age of the person if found, otherwise None.
    """
    if name in person_database:
        return person_database[name]["Age"]
    else:
        return None


def get_age_info(name: str) -> str:
    """
    Get age and health information for a person.

    Args:
    - name (str): The name of the person.

    Returns:
    - str: A string containing age and health information for the person.
    """
    person_database = {
        "Sam": {"Age": 21, "Nationality": "US"},
        "Alice": {"Age": 25, "Nationality": "UK"},
        "Bob": {"Age": 11, "Nationality": "US"}
    }
    age = get_age(name, person_database)
    if age is not None:
        return f"\nAge: {age}\n"
    else:
        return f"\nAge Information for {name} not found.\n"
    


def get_today_date(input : str) -> str:
    import datetime
    today = datetime.date.today()
    return f"\n {today} \n"



def get_relevant_document(name : str) -> str:
    # String name for fuzzy search
    search_name = name

    # Find the best match using fuzzy search
    best_match = process.extractOne(search_name, txt_files, scorer=fuzz.ratio)

    # Get the selected file name
    selected_file = best_match[0]
    
    selected_retriever = retriever[selected_file]

    global query
    results = selected_retriever.get_relevant_documents(query)
    global retrieved_text
    
    total_content = "\n\nBelow are the related document's content: \n\n"
    chunk_count = 0
    for result in results:
        chunk_count += 1
        if chunk_count > 4:
            break
        total_content += result.page_content + "\n"
    retrieved_text = total_content
    return total_content


def get_summarized_text(name : str) -> str:
    from transformers import pipeline
    summarizer = pipeline("summarization", model="Falconsai/text_summarization")
    global retrieved_text
    article = retrieved_text
    return summarizer(article, max_length=1000, min_length=30, do_sample=False)[0]['summary_text']


# Define the Tool
get_age_info_tool = Tool(
    name="Get Age",
    func=get_age_info,
    description="Useful for getting age information for any person. Input should be the name of the person."
)

get_today_date_tool = Tool(
    name="Get Todays Date",
    func=get_today_date,
    description="Useful for getting today's date"
)

get_relevant_document_tool = Tool(
    name="Get Relevant document",
    func=get_relevant_document,
    description="Useful for getting relevant document that we need."
)

get_summarized_text_tool = Tool(
    name="Get Summarized Text",
    func=get_summarized_text,
    description="Useful for getting summarized text for any document."
)

# Common Agent prompts

In [33]:
prompt_react = hub.pull("hwchase17/react")
print(prompt_react.template) 

Answer the following questions as best you can. You have access to the following tools:

{tools}

Use the following format:

Question: the input question you must answer
Thought: you should always think about what to do
Action: the action to take, should be one of [{tool_names}]
Action Input: the input to the action
Observation: the result of the action
... (this Thought/Action/Action Input/Observation can repeat N times)
Thought: I now know the final answer
Final Answer: the final answer to the original input question

Begin!

Question: {input}
Thought:{agent_scratchpad}


# Running the ReAct agent

In [34]:
retrieved_text = ""
tools = [get_relevant_document_tool, get_summarized_text_tool, get_today_date_tool, get_age_info_tool]

model = ChatGroq(model_name="llama3-70b-8192", groq_api_key=GROQ_API_KEY, temperature=0)
# model = OpenAI(openai_api_key="<YOUR_OPENAI>")

react_agent = create_react_agent(model, tools=tools, prompt=prompt_react)
react_agent_executor = AgentExecutor(
    agent=react_agent, tools=tools, verbose=True, handle_parsing_errors=True
)

In [41]:
query = "Give me the summary for the question : What age requirement is specified for using the OpenAI Services, and what provision applies if the user is under 18?"
react_agent_executor.invoke({"input": query})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find a relevant document that mentions the age requirement for using OpenAI Services.

Action: Get Relevant document
Action Input: OpenAI Terms of Service[0m[36;1m[1;3m

Below are the related document's content: 

Who we are
OpenAI is an AI research and deployment company. Our mission is to ensure that artificial general intelligence benefits all of humanity. For more information about OpenAI, please visit https://openai.com/about.
Registration and access
Minimum age. You must be at least 13 years old or the minimum age required in your country to consent to use the Services. If you are under 18 you must have your parent or legal guardian’s permission to use the Services. 
Registration. You must provide accurate and complete information to register for an account to use our Services. You may not share your account credentials or make your account available to anyone else and are responsible for all activ

Token indices sequence length is longer than the specified maximum sequence length for this model (732 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 1000, but your input_length is only 732. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=366)


[33;1m[1;3mWho we are OpenAI is an AI research and deployment company . You must be at least 13 years old or the minimum age required in your country to consent to use the Services . If you are under 18, you must have your parent or legal guardian’s permission to use our Services. You may not share your account credentials or make your account available to anyone else . This Terms of Use apply to your use of ChatGPT Enterprise, our APIs, and our other services for businesses and developers .[0m[32;1m[1;3mThought: I now know the answer to the original question.

Final Answer: The age requirement for using OpenAI Services is at least 13 years old or the minimum age required in your country to consent to use the Services. If you are under 18, you must have your parent or legal guardian's permission to use the Services.[0m

[1m> Finished chain.[0m


{'input': 'Give me the summary for the question : What age requirement is specified for using the OpenAI Services, and what provision applies if the user is under 18?',
 'output': "The age requirement for using OpenAI Services is at least 13 years old or the minimum age required in your country to consent to use the Services. If you are under 18, you must have your parent or legal guardian's permission to use the Services."}

In [42]:
query = "Give me summary of What resources does Google offer to users for assistance and guidance in using its services?"
react_agent_executor.invoke({"input": query})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find a relevant document that provides information on the resources Google offers to users for assistance and guidance in using its services.

Action: Get Relevant document
Action Input: Google user resources[0m[36;1m[1;3m

Below are the related document's content: 

services — whether that content belongs to you, Google, or others
In case of problems or disagreements, which describes other legal rights you have, and what to expect
in case someone violates these terms
Understanding these terms is important because, by using our services, you’re agreeing to these terms.
Besides these terms, we also publish a Privacy Policy. Although it’s not part of these terms, we encourage
you to read it to better understand how you can update, manage, export, and delete your information.
Terms
Service providerGoogle services are provided by, and you’re contracting with:
Google LLC
organized under the laws of the State 

Token indices sequence length is longer than the specified maximum sequence length for this model (805 > 512). Running this sequence through the model will result in indexing errors
Your max_length is set to 1000, but your input_length is only 805. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=402)


[33;1m[1;3mThis Privacy Policy is important because, by using our services, you’re agreeing to these terms . You’re responsible for what you do with your Google Account . To use our services on behalf of an organization, we sometimes send you service announcements and other information . These Terms of Service help dene Google’s relationship with you as you interact with our services .[0m[32;1m[1;3mThought: I now know the final answer

Final Answer: Google offers various resources to users for assistance and guidance in using its services, including terms of service, privacy policy, security checkup, and service-related communications. These resources help establish what users can expect from Google and what Google expects from users, including rules for using services, intellectual property rights, and content management. Additionally, Google provides resources for organizations using its services, such as assigning Google Accounts and requiring additional rules.[0m

[1m> Finis

{'input': 'Give me summary of What resources does Google offer to users for assistance and guidance in using its services?',
 'output': 'Google offers various resources to users for assistance and guidance in using its services, including terms of service, privacy policy, security checkup, and service-related communications. These resources help establish what users can expect from Google and what Google expects from users, including rules for using services, intellectual property rights, and content management. Additionally, Google provides resources for organizations using its services, such as assigning Google Accounts and requiring additional rules.'}

In [36]:
query = "What are my rights to my data on Facebook?"
react_agent_executor.invoke({"input": query})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to find a relevant document that outlines Facebook's data policy.

Action: Get Relevant document
Action Input: Facebook data policy[0m[36;1m[1;3m

Below are the related document's content: 

viii. Processing friend lists from Facebook to establish social connections in your App unless each person in that connection has granted you access to that information for that purpose.
b. Additional Terms for Restricted Platform Data
i. You will not request Restricted Platform Data unless it is necessary to meaningfully improve the quality of the applicable User's experience in the specific product or service for which the User shared the data.
ii. It must be clear to the User why you are requesting their Restricted Platform Data in order to improve the quality of their experience.
iii. For clarity, your Processing of Restricted Platform Data must comply with the applicable Developer Docs and other provisions of these

2024-05-18 17:01:10.788620: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-18 17:01:10.788656: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-18 17:01:10.789733: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-18 17:01:10.795490: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Token indices sequence length is longer than the spec

[33;1m[1;3mYou will not request Restricted Platform Data unless it is necessary to meaningfully improve the quality of the applicable User's experience in the specific product or service for which the User shared the data . You must comply with the applicable Developer Docs and other provisions of these Terms (including the Prohibited Practices) You may only share Platform Data in compliance with these Terms . We may require that your Clients agree to these Terms or other applicable terms or policies .[0m[32;1m[1;3mThought: I have a summarized version of the document, now I need to understand what it means in terms of my rights to my data on Facebook.

Action: None
Action Input: None[0mNone is not a valid tool, try one of [Get Relevant document, Get Summarized Text, Get Todays Date, Get Age].[32;1m[1;3mThought: I understand that the summarized text outlines Facebook's data policy, but I need to interpret it in the context of my rights to my data on Facebook.

Action: None
Actio

KeyboardInterrupt: 

In [35]:
query = "I am Bob. Will i be eligible in 2027 for the age requirement specified for using the OpenAI Services by OpenAI Terms?"
react_agent_executor.invoke({"input": query})



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mThought: I need to know the age requirement specified in the OpenAI Terms and my current age to determine if I'll be eligible in 2027.

Action: Get Relevant document
Action Input: OpenAI Terms[0m[36;1m[1;3m

Below are the related document's content: 

Who we are
OpenAI is an AI research and deployment company. Our mission is to ensure that artificial general intelligence benefits all of humanity. For more information about OpenAI, please visit https://openai.com/about.
Registration and access
Minimum age. You must be at least 13 years old or the minimum age required in your country to consent to use the Services. If you are under 18 you must have your parent or legal guardian’s permission to use the Services. 
Registration. You must provide accurate and complete information to register for an account to use our Services. You may not share your account credentials or make your account available to anyone else and are respon

KeyboardInterrupt: 

In [None]:
import gradio as gr
from io import StringIO
import sys
import re

def generate_response(question):
    """
    Generate a response based on the provided question using ChatGroq.

    Args:
    - question (str): The question input by the user.

    Returns:
    - str: The generated response based on the question.
    """
    tools = [get_relevant_document_tool, get_summarized_text_tool, get_today_date_tool, get_age_info_tool]

    model = ChatGroq(model_name="llama3-70b-8192", groq_api_key=GROQ_API_KEY, temperature=0)
    # model = OpenAI(openai_api_key="<YOUR_OPENAI>")

    react_agent = create_react_agent(model, tools=tools, prompt=prompt_react)
    react_agent_executor = AgentExecutor(
        agent=react_agent, tools=tools, verbose=True, handle_parsing_errors=True
    )
    
    # Redirect stdout to capture text
    with StringIO() as text_output:
        sys.stdout = text_output
        completion = react_agent_executor.invoke({"input": question})
        sys.stdout = sys.__stdout__  # Reset stdout
        
        # Get the captured text
        text_output_str = text_output.getvalue()
    
    # Remove ANSI escape codes
    text_output_str = re.sub(r'\x1b\[[0-9;]*m', '', text_output_str)

    return text_output_str

# Set up the Gradio interface
iface = gr.Interface(
    fn=generate_response,
    inputs=[gr.Textbox(label="Question")],  # Pass input as a list
    outputs=[gr.Textbox(label="Generated Response")],  # Pass output as a list
    title="Intellegent RAG with Qdrant, LangChain ReAct and Llama3 from Groq Endpoint",
    description="Enter a question and get a generated response based on the retrieved text.",
)

iface.launch()
