In [1]:
import os

# Prefer exporting GROQ_API_KEY outside the notebook. setdefault() only fills
# in the placeholder when the variable is absent, so a real key that is
# already present in the environment is never overwritten.
os.environ.setdefault("GROQ_API_KEY", "your_GROQ_key")

In [None]:
from langchain.document_loaders import PyPDFLoader, UnstructuredWordDocumentLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

def load_document(filepath):
    """Load a PDF or Word document using the loader that matches its extension.

    The extension is compared case-insensitively: ``.pdf`` is handled by
    ``PyPDFLoader`` and ``.docx`` by ``UnstructuredWordDocumentLoader``.

    Args:
        filepath: Path of the file to load.

    Returns:
        Whatever the selected loader's ``load()`` call returns.

    Raises:
        ValueError: If the extension is neither ``.pdf`` nor ``.docx``. The
            message carries the extension exactly as it appears in the path.
    """
    extension = os.path.splitext(filepath)[1]
    normalized = extension.lower()

    # Reject unknown formats first so the loader selection below stays flat.
    if normalized not in ('.pdf', '.docx'):
        raise ValueError("Unsupported file format: {}".format(extension))

    loader_cls = PyPDFLoader if normalized == '.pdf' else UnstructuredWordDocumentLoader
    return loader_cls(filepath).load()


# Document to index; replace the placeholder with a real .pdf or .docx path.
filepath = "Your_file_path"
docs = load_document(filepath)

# Chunking configuration, kept in one place so it is easy to tune.
splitter_options = {"chunk_size": 1000, "chunk_overlap": 200}
r_splitter = RecursiveCharacterTextSplitter(**splitter_options)

# Break the loaded document(s) into overlapping chunks for embedding.
splitted_docs = r_splitter.split_documents(docs)

In [None]:
from sentence_transformers import SentenceTransformer
from langchain_community.vectorstores import FAISS
from langchain.embeddings.base import Embeddings

# Create a custom embedding function
class SentenceTransformerEmbeddings(Embeddings):
    """Embeddings implementation that delegates to a ``SentenceTransformer`` model.

    Both embedding methods call the model's ``encode`` and convert the result
    with ``.tolist()`` before returning it.
    """

    def __init__(self, model_name='all-mpnet-base-v2'):
        """Instantiate the underlying model.

        Args:
            model_name: Identifier passed straight to ``SentenceTransformer``.
        """
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, texts):
        """Encode ``texts`` in a single ``encode`` call and return the result as a list."""
        encoded = self.model.encode(texts)
        return encoded.tolist()

    def embed_query(self, text):
        """Encode a single query ``text`` and return the result as a list."""
        encoded = self.model.encode(text)
        return encoded.tolist()

# Embedding backend, built with the class's default model name.
embeddings = SentenceTransformerEmbeddings()

# Build the FAISS vector store from the document chunks.
vector_store = FAISS.from_documents(
    documents=splitted_docs,
    embedding=embeddings,
)

In [4]:
from langchain_groq import ChatGroq

# Take the key from the GROQ_API_KEY environment variable (set in the first
# cell) instead of hard-coding a second, divergent copy here.
# NOTE(review): confirm the model name below is still offered by Groq.
llm = ChatGroq(
    groq_api_key=os.environ["GROQ_API_KEY"],
    model_name='mixtral-8x7b-32768',
)

In [5]:
from langchain_core.prompts import ChatPromptTemplate
from langchain.chains.combine_documents import create_stuff_documents_chain

# Prompt template for the answering step. It has two placeholders:
# {context} and {input}.
# The spelling fixes matter at runtime: the quoted fallback sentence is
# what the model is told to reply with verbatim, so typos in it would
# surface in user-visible answers.
prompt = ChatPromptTemplate.from_template(
    """
    Answer the following question based on the retrieved context.
    Think step by step. Keep your answers brief. Reply with "I'm not sure how to answer that. Please provide more context and try simplifying your question." when you don't have an answer for any question.
    Do not respond with wrong answers. All answers should come from the retrieved context.
    <context>
    {context}
    </context>

    Question: {input}
    """
)

# Combine the LLM and the prompt template into a documents chain.
llm_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

In [6]:
from langchain.chains import create_retrieval_chain

# Expose the vector store through the retriever interface (default settings).
retriever = vector_store.as_retriever()

# Wire the retriever to the documents chain built earlier.
retrieval_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=llm_chain,
)

In [None]:
import re
import textwrap

def format_response(response):
    """Re-wrap a plain-text answer so it reads well in a fixed-width display.

    The text is split into paragraphs on blank (or whitespace-only) lines.
    A paragraph that starts with a numbered-list marker such as ``1.`` is
    rendered one item per line, each item wrapped at 76 columns with a
    four-space hanging indent. Any other paragraph is wrapped at 80 columns.

    Args:
        response: The answer text to format.

    Returns:
        The formatted text, with paragraphs separated by one blank line.
    """
    formatted_paragraphs = []

    # Runs of blank or whitespace-only lines delimit paragraphs.
    for paragraph in re.split(r'\n\s*\n', response):
        # Strip first so stray leading whitespace cannot hide a list marker.
        paragraph = paragraph.strip()

        # "(?!\d)" keeps decimals such as "3.14" from being read as markers.
        if re.match(r'\d+\.(?!\d)', paragraph):
            # Anchor on "^" as well as "\n" so the FIRST marker is captured
            # too; splitting only on "\n" silently dropped item 1.
            # Result shape: ['', '1.', ' text', '2.', ' text', ...].
            pieces = re.split(r'(?:^|\n)\s*(\d+\.)(?!\d)', paragraph)
            formatted_items = []
            for number, text in zip(pieces[1::2], pieces[2::2]):
                # Width 76 leaves room for the "N. " prefix added afterwards.
                wrapped_text = textwrap.fill(
                    text.strip(), width=76, subsequent_indent='    '
                )
                formatted_items.append(f"{number} {wrapped_text}")
            formatted_paragraphs.append('\n'.join(formatted_items))
        else:
            # Regular paragraph: collapse internal line breaks and re-wrap.
            formatted_paragraphs.append(textwrap.fill(paragraph, width=80))

    # Join paragraphs with a blank line between them.
    return '\n\n'.join(formatted_paragraphs)

# Ask a sample question. The text is stored in `question` so that the `prompt`
# template used to build the chain is not overwritten by a plain string.
question = "Detailed steps to create a warehouse. Explain any required permissions too."
response = retrieval_chain.invoke({"input": question})
formatted_answer = format_response(response['answer'])
print(formatted_answer)
