Importing packages

In [24]:
!pip install --upgrade --quiet langchain_core langchain_chroma langchain_text_splitters langchain langchain_google_genai sentence_transformers langsmith gradio gdown

In [2]:
import pandas as pd
import numpy as np
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.documents import Document
# from kaggle_secrets import UserSecretsClient
from google.colab import userdata
import gdown
from langchain.load import dumps, loads
import gradio as gr

  from tqdm.autonotebook import tqdm, trange


In [3]:
# Enable LangChain tracing for this session (LANGCHAIN_TRACING_V2 is presumably
# consumed by the langsmith package installed above — confirm).
os.environ["LANGCHAIN_TRACING_V2"] = "true"
# The tracing API key is read from the Colab secret named "LANGCHAIN_KEY",
# so it never appears in the notebook itself.
os.environ["LANGCHAIN_API_KEY"] = userdata.get("LANGCHAIN_KEY")

# **Run This cell**

Embedding model used to measure the similarity between texts and find the relevant documents.

In [4]:
class embedding:
    """SentenceTransformer-backed embedder exposing `embed_documents` / `embed_query`.

    Instances are passed to Chroma as its `embedding_function`; both methods
    return plain Python lists of floats rather than numpy arrays.
    """

    def __init__(self, model_name='all-mpnet-base-v2'):
        """Load the sentence-transformers model.

        Args:
            model_name: Name or path of the model to load. Defaults to the
                model the notebook was built with.
        """
        self.model = SentenceTransformer(model_name)

    def embed_documents(self, docs):
        """Embed a batch of texts.

        Args:
            docs: Sequence of strings to embed.

        Returns:
            A list with one embedding (list of floats) per input text; an
            empty list when `docs` is an empty list or tuple.
        """
        # Short-circuit the empty batch so the model is not invoked for nothing.
        if isinstance(docs, (list, tuple)) and not docs:
            return []
        return self.model.encode(docs).tolist()

    def embed_query(self, query):
        """Embed a single query string and return its vector as a list of floats."""
        return self.model.encode(query).tolist()

In [21]:
class ragChain:
    """Multi-query retrieval-augmented QA chain over a persisted Chroma store.

    The user question is rewritten into several alternative queries by the
    LLM, each query is sent to the retriever, the retrieved documents are
    de-duplicated, and the LLM answers the original question from that
    combined context.
    """

    # Prompt asking the LLM for alternative (Arabic) phrasings of the question,
    # one per line; every non-blank line becomes a retrieval query.
    generate_queries_template = """You are an AI language model assistant. Your task is to generate five arabic
      different versions of the given user question to retrieve relevant documents from a vector
      database. By generating multiple perspectives on the user question, your goal is to help
      the user overcome some of the limitations of the distance-based similarity search.
      Provide these alternative questions separated by newlines without any additional thoughts outside the answers. Original question: {question}"""

    # Prompt for the final answer; `{context}` receives the formatted retrieved
    # documents and `{question}` the original user question.
    final_rag_chain_template="""
      you are an AI powered QA Assistant to provide accurate, contextually relevant answers to customer questions.
      at the end of the answer you have to thank the user.
      the answer in arabic and in details.
      you should answer based on the context provided:
      {context}

      what to do if answer is not included in the prompt for context:

      1.you should appoligize to the user and say that you dont have the answer in your informations

      for answer:
      1. the output must be in details based on the context provided

      Question: {question}

      answer:
      """
    # Google Drive folder fetched when no local `vector` directory exists.
    url = "https://drive.google.com/drive/u/0/folders/149jD_H7oQ3JEp6D1UBaMTNIyMOs8yz06"

    # Class-level default; `gettingDocuments` replaces it with a per-instance list.
    documents=[]

    def __init__(self):
        """Assemble the retrieval and answer chains.

        Downloads the Drive folder when `<cwd>/vector` is missing, opens the
        Chroma store persisted there, and stores the composed runnable in
        `self.rag_chain_with_source`.
        """
        persist_directory = os.path.join(os.getcwd(), "vector")

        if not os.path.exists(persist_directory):
            # No local store yet: fetch the shared Drive folder.
            # NOTE(review): presumably the downloaded folder is named "vector"
            # and lands in the working directory — confirm.
            gdown.download_folder(self.url)

        vector_database = Chroma(persist_directory=persist_directory, embedding_function=embedding())
        retriever=vector_database.as_retriever(search_type="similarity",search_kwargs={'k':3})

        google_api_key = userdata.get("GOOGLE_AI_STUDIO2")
        llm = ChatGoogleGenerativeAI(model="gemini-pro",google_api_key=google_api_key,temperature=0)

        # Multi Query: different perspectives on the same question.
        prompt_perspectives = ChatPromptTemplate.from_template(self.generate_queries_template)

        generate_queries = (
            prompt_perspectives
            | llm
            | StrOutputParser()
            | self._split_queries
        )

        # Run the retriever once per generated query, then merge the results.
        retrieval_chain = generate_queries | retriever.map() | self.get_unique_union

        final_rag_chain_prompt = ChatPromptTemplate.from_template(self.final_rag_chain_template)

        final_rag_chain = (
            RunnablePassthrough.assign(context=(lambda x: self.format_docs(x["context"])))
            | final_rag_chain_prompt
            | llm
            | StrOutputParser()
        )

        # Keep the retrieved documents next to the answer so callers can show sources.
        rag_chain_with_source = RunnableParallel(
            {"context": retrieval_chain, "question": RunnablePassthrough()}
        ).assign(answer=final_rag_chain)
        self.rag_chain_with_source=rag_chain_with_source

    @staticmethod
    def _split_queries(text):
        """Split the LLM output into one query per line, dropping blank lines.

        Blank or whitespace-only lines would otherwise be sent to the
        retriever as empty queries.
        """
        return [line.strip() for line in text.splitlines() if line.strip()]

    def invoke(self,question):
        """Run the full chain for `question`.

        Returns:
            A dict with the keys "context" (retrieved documents), "question"
            (the input as given) and "answer" (the generated text).
        """
        return self.rag_chain_with_source.invoke(question)

    def format_docs(self, docs):
        """Join the `page_content` of each document with a blank line in between."""
        return "\n\n".join(doc.page_content for doc in docs)

    # Get unique union of retrieved docs
    def get_unique_union(self, documents: list[list]):
        """Return the unique union of the retrieved docs, in first-seen order.

        Args:
            documents: One list of retrieved documents per generated query.

        Returns:
            A flat list in which each distinct document appears once.
        """
        # Serialize each document so that equal documents compare equal.
        serialized = (dumps(doc) for sublist in documents for doc in sublist)
        # dict.fromkeys de-duplicates while keeping insertion order, so the
        # context handed to the LLM is the same from run to run (a set is not).
        return [loads(doc) for doc in dict.fromkeys(serialized)]

    def gettingDocuments(self, data_directory, max_documents=40000):
        """Load text files from `data_directory/<folder>/<file>` into `self.documents`.

        Args:
            data_directory: Directory whose sub-directories contain UTF-8 text files.
            max_documents: Upper bound on the number of documents kept.

        Entries of `data_directory` that are not directories are skipped. The
        folder name is not stored on the resulting documents.
        """
        contents = []
        for folder in os.listdir(data_directory):
            folder_path = os.path.join(data_directory, folder)
            if not os.path.isdir(folder_path):
                continue
            for file_name in os.listdir(folder_path):
                # `with` closes the handle even if reading raises.
                with open(os.path.join(folder_path, file_name), "r", encoding="utf8") as handle:
                    contents.append(handle.read())
        self.documents = [Document(page_content=text) for text in contents[:max_documents]]


# INFERENCE


In [23]:
import pandas as pd
import numpy as np
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.documents import Document
# from kaggle_secrets import UserSecretsClient
from google.colab import userdata
import gdown
from langchain.load import dumps, loads
import gradio as gr

# Build the RAG pipeline once at module level so every request reuses it.
# Note: despite its name, this variable holds the ragChain wrapper object,
# not the inner runnable stored on its `rag_chain_with_source` attribute.
rag_chain_with_source = ragChain()

def getting_answers(question, history=None):
    """Answer `question` with the RAG chain and append the turn to the chat history.

    Args:
        question: The user's question.
        history: List of (question, answer) tuples from earlier turns, or
            None when there is no history yet (a fresh list is created).

    Returns:
        The updated history twice: once for the chatbot display and once for
        the state slot that carries the history into the next call.
    """
    # A mutable default would be shared by every call, and a None state would
    # fail on .append, so start each conversation with its own list.
    if history is None:
        history = []

    rag_answer = rag_chain_with_source.invoke(question)
    # Show the generated answer followed by the retrieved passages it was based on.
    answer = rag_answer["answer"] + "\n\n\n\n\nالنصوص القريبة للسوال المطروح: \n" + "\n".join([docs.page_content for docs in rag_answer["context"]])
    history.append((question, answer))

    return history,history


# Gradio UI wired to getting_answers:
#   inputs  -> "text" is the user's message, "state" carries the chat history in
#   outputs -> "chatbot" renders the conversation, "state" carries the history out
demo = gr.Interface(
    getting_answers,
    inputs=["text", "state"],
    outputs=["chatbot", "state"],
)

# Start the web app.
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://a77d32a74304b4bd84.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


