## Importing Libraries

In [1]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import os
from dotenv import load_dotenv
from langchain.document_loaders import PyPDFLoader, TextLoader
from langchain.schema import Document
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser, JSONOutputParser

import re
_ = load_dotenv()

os.getenv("GOOGLE_API_KEY")
genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))

  from tqdm.autonotebook import tqdm, trange


## Document Loading

In [16]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Define a function to chunk documents
def chunk_documents(documents, chunk_size=1000, chunk_overlap=50):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    chunked_documents = []
    for doc in documents:
        chunks = text_splitter.split_text(doc.content)
        for chunk in chunks:
            chunked_doc = CustomDocument(content=chunk, metadata=doc.metadata)
            chunked_documents.append(chunked_doc)
    return chunked_documents

In [17]:
# Chunk the cleaned documents
chunked_documents = chunk_documents(documents)

In [19]:
chunked_documents[1].content

'NOTES START FROM HERE AND GROW UPCONTENT BELOW THIS LINE CONTENT BELOW THIS LINE SUBTITLE BELOW THIS LINE TITLE CAN NOT GO ABOVE THIS LINE Disclaimer THIS DOCUMENT IS NOT AN OFFER OR AN INVITATION TO BUY OR SELL SECURITIES. IMPORTANT: Please read the following before continuing. This document has been prepared by S.P.E.E.H. HIDROELECTRICA S.A. (th e “Company”) and relates to the Company and its subsidiary (together, the “Group”) and the following applies to the information in this document (the “Information”). The Information does not purport to contain full, accurate or complete information required to evaluate the Company or the Gr oup and/or its financial position. The Information does not constitute a recommendation regarding any loans or securities of the Company or any other member of the Group. By accepting to access the Information, you ( i) agree to be bound by the foregoing limitations; and (ii) have read, understood and agree to comply with the contents of thi s disclaimer.

In [20]:
# checking the top 10 chunked documents
for i in range(2):
    print(chunked_documents[i].content)
    print()

NOTES START FROM HERE AND GROW UPCONTENT BELOW THIS LINE CONTENT BELOW THIS LINE SUBTITLE BELOW THIS LINE TITLE CAN NOT GO ABOVE THIS LINE NOTES START FROM HERE AND GROW UPCONTENT BELOW THIS LINE CONTENT BELOW THIS LINE SUBTITLE BELOW THIS LINE TITLE CAN NOT GO ABOVE THIS LINE 15 May 2024#1 Power Producer in Romania and One of the Largest European Hydro Companies Q1 2024 Key Financials Update

NOTES START FROM HERE AND GROW UPCONTENT BELOW THIS LINE CONTENT BELOW THIS LINE SUBTITLE BELOW THIS LINE TITLE CAN NOT GO ABOVE THIS LINE Disclaimer THIS DOCUMENT IS NOT AN OFFER OR AN INVITATION TO BUY OR SELL SECURITIES. IMPORTANT: Please read the following before continuing. This document has been prepared by S.P.E.E.H. HIDROELECTRICA S.A. (th e “Company”) and relates to the Company and its subsidiary (together, the “Group”) and the following applies to the information in this document (the “Information”). The Information does not purport to contain full, accurate or complete information requ

In [49]:
# # Function to embed documents using GoogleGenerativeAIEmbeddings
# def embed_documents(documents):
#     embedding_model = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
#     texts = [doc.content for doc in documents]
#     embeddings = embedding_model.embed_documents(texts)
#     return embeddings



# # Embed the chunked documents
# embeddings = embed_documents(chunked_documents)

In [21]:

def get_vector_store(documents):
    embeddings = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")
    texts = [doc.content for doc in documents]
    vector_store = FAISS.from_texts(texts, embedding=embeddings)
    vector_store.save_local("faiss_index")

vec_db = get_vector_store(chunked_documents)

## Model initialization

In [22]:
model = ChatGroq(temperature=0, model_name="llama3-8b-8192")

In [23]:
model.invoke('interpret this: Hidroelectrica a modului de gestionare a situațiilor cu privire la cadouri sau avantaje Conducerea și angajatii Hidroelectrica -analiză anuală sau in funcț ie de necesitate și actualizare atunci când se constată necesitatea')

AIMessage(content='Here\'s an interpretation of the text:\n\n"Hidroelectrica" is a company or organization that is implementing a system for managing situations related to gifts or benefits.\n\nThe system is designed to ensure that the company\'s leadership and employees (Conducerea și angajatii Hidroelectrica) conduct an annual analysis or as needed, and update the system whenever necessary.\n\nIn other words, the company is establishing a process for managing situations related to gifts or benefits, such as:\n\n* Identifying potential conflicts of interest or ethical dilemmas\n* Ensuring compliance with relevant laws and regulations\n* Providing guidance and training to employees on the company\'s policies and procedures\n* Monitoring and reporting on the effectiveness of the system\n\nThe goal of this system is to ensure transparency, accountability, and ethical behavior within the company.', response_metadata={'token_usage': {'completion_tokens': 166, 'prompt_tokens': 80, 'total_to

In [24]:
from langchain.chains import ConversationChain
from langchain.memory import  ConversationBufferWindowMemory
from langchain_core.prompts.prompt import PromptTemplate

In [25]:
# question reviewer

def query_reviewer(question: str) -> str:
    template = """<|begin_of_text|><|start_header_id|>system<|end_header_id|>
    You are an expert at Reviewing Questions.
    Background: The user is interested in knowing some information about the company Hidroelectrica. \n
    The client loves the company so much and wants to know more about it. \n


    The client will send in a question, your job is to review the question and create text sub-questions. \n The sub-questions are -questions written in a better way. 
    The database content is in both English and Romanian. \n
    Your job is to create four sub-questions, the first two will be in english and the others will be in romanian \n
    Make sure the sub-questions are relevant to the client's question. \n

    Output format : [sub-questions1, sub-questions2, sub-questions3, sub-questions4] \n
    Return only the output without any additional information. \n

    <|eot_id|><|start_header_id|>user<|end_header_id|>
    Human: {question}
    <|eot_id|><|start_header_id|>assistant<|end_header_id|>
    AI Assistant:"""

    question_prompt = PromptTemplate(input_variables=["question"], template=template)
    initiator_router = question_prompt | model | StrOutputParser()
    output = initiator_router.invoke({"question":question})
    return output


In [26]:
sub_query = query_reviewer("Who is the founder of Hidroelectrica?")
print(sub_query)

["What is the name of the founder of Hidroelectrica?", "Who is the person behind the establishment of Hidroelectrica?", "Cine este fondatorul Hidroelectricei?", "Cine a înființat Hidroelectrica?"]


In [27]:
def user_input(user_questions: list) -> list:
    # Load embeddings
    embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")
    
    # Load the FAISS index
    new_db = FAISS.load_local("faiss_index", embeddings, allow_dangerous_deserialization=True)
    
    relevant_docs = []
    for question in user_questions:
        # Get top 3 similar documents for each question
        docs = new_db.similarity_search(question, k=3)
        
        # Extract page content from the documents
        for doc in docs:
            relevant_docs.append(doc.page_content)
    
    return relevant_docs

In [28]:
answer = user_input(sub_query[:2])

In [29]:
answer

['used in the Romanian valuation practice by the licensed valuers body in all instances where there is no rece nt information',
 'NOTES START FROM HERE AND GROW UPCONTENT BELOW THIS LINE CONTENT BELOW THIS LINE SUBTITLE BELOW THIS LINE TITLE CAN NOT GO ABOVE THIS LINE Disclaimer THIS DOCUMENT IS NOT AN OFFER OR AN INVITATION TO BUY OR SELL SECURITIES. IMPORTANT: Please read the following before continuing. This document has been prepared by S.P.E.E.H. HIDROELECTRICA S.A. (th e “Company”) and relates to the Company and its subsidiary (together, the “Group”) and the following applies to the information in this document (the “Information”). The Information does not purport to contain full, accurate or complete information required to evaluate the Company or the Gr oup and/or its financial position. The Information does not constitute a recommendation regarding any loans or securities of the Company or any other member of the Group. By accepting to access the Information, you (i) agree to 

In [71]:
def initiator(chat_history: list) -> str:
    initiator_prompt = PromptTemplate(
        template="""<|begin_of_text|><|start_header_id|>system<|end_header_id|>
        You are an expert at evaluating the conversation between a USER and an AI. \n
        Your focus is on the AI response. You're looking for a place where the AI response has "ALL REQUIREMENTS RECORDED" in it. \n
        
        If you see "ALL REQUIREMENTS RECORDED" in thre AI repsonse:
            Return 'START' as your response. \
        Else if it's not there:
            Return 'WAIT' as your response
        
        Do not make assumptions. Stick to the details in the chat. 
        You can only return "START" or "WAIT", no other explanation is needed.
        <|eot_id|><|start_header_id|>user<|end_header_id|>
        CHAT_HISTORY: {chat_history} \n
        <|eot_id|><|start_header_id|>assistant<|end_header_id|>""",
        input_variables=["chat_history"],
    )

    initiator_router = initiator_prompt | model | StrOutputParser()
    output = initiator_router.invoke({"chat_history":chat_history})
    return output