In [1]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

# Number of bytes requested from the HTTP response per read in the download loop.
CHUNK_SIZE = 40960
# Comma-separated "<directory>:<percent-encoded download URL>" entries (one
# entry here). The URL is a signed link: its query string carries
# X-Goog-Date=20240827T022714Z and X-Goog-Expires=259200, so it is presumably
# only valid for a limited time after that date -- regenerate it if the
# download below reports a failure.
DATA_SOURCE_MAPPING = 'sanad-dataset:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F819052%2F1401544%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240827%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240827T022714Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3Dc263897ca83b59b94a3a23d9db3cb25f5aecf6f9f8739d8401d3c891d38330872a45bfcc8cf65376f352aa8da7118ca5209aac5fc2c2159db8dddb657c427393b6630f9f35f0b0796f9b14d4d75ffd9611b5b71e1ccafe2c8beea57d365ea9f672066ebc592aaf8d97f828970fa843a76a5c40047140961fb407d30cf29bcc3ce67e47bc61c1ec4fd9d416b868d832871e2631ced4be443c7251bb3cb2df52e649190890040acbe4c1040b5a8273f3e5adb414f3a721bd449d53d350e46fe5211a6509f261e0b47a721954659aa479ed3b4808cace78de95606b7dcd619234a290352c039336fb6c1890a6ee9cbafa199644c475a882fb16d2fdff1ddb221184'

# Directories that reproduce the Kaggle notebook layout on this machine.
KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
# NOTE(review): not referenced anywhere in the visible part of this notebook.
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every "<directory>:<encoded url>" entry of DATA_SOURCE_MAPPING into a
# temporary file and unpack it under KAGGLE_INPUT_PATH/<directory>.
# HTTP and OS-level failures are reported and the entry is skipped.
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first ':' only; the URL part is percent-encoded.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # Content-Length is optional: without a usable value the progress
            # bar stays empty and only the running byte count is shown.
            content_length = fileres.headers['content-length']
            if content_length and content_length.strip().isdigit():
                total_length = int(content_length)
            else:
                total_length = 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            while True:
                data = fileres.read(CHUNK_SIZE)
                if not data:
                    break
                dl += len(data)
                tfile.write(data)
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
            # Push buffered bytes to disk before the archive is read back:
            # the tar branch reopens the file by name and would otherwise
            # risk seeing a truncated file.
            tfile.flush()
            tfile.seek(0)
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Bind the archive to its own name so the `tarfile` module is
                # not shadowed for later iterations.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


Downloading sanad-dataset, 68916418 bytes compressed
Downloaded and uncompressed: sanad-dataset
Data source import complete.


importing packages

In [2]:
!pip install --upgrade langchain_core langchain_chroma langchain_text_splitters langchain langchain_google_genai sentence_transformers langsmith gradio

Collecting langchain_core
  Downloading langchain_core-0.2.35-py3-none-any.whl.metadata (6.2 kB)
Collecting langchain_chroma
  Downloading langchain_chroma-0.1.3-py3-none-any.whl.metadata (1.5 kB)
Collecting langchain_text_splitters
  Downloading langchain_text_splitters-0.2.2-py3-none-any.whl.metadata (2.1 kB)
Collecting langchain
  Downloading langchain-0.2.14-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_google_genai
  Downloading langchain_google_genai-1.0.10-py3-none-any.whl.metadata (3.8 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting langsmith
  Downloading langsmith-0.1.104-py3-none-any.whl.metadata (13 kB)
Collecting gradio
  Downloading gradio-4.42.0-py3-none-any.whl.metadata (15 kB)
Collecting jsonpatch<2.0,>=1.33 (from langchain_core)
  Downloading jsonpatch-1.33-py2.py3-none-any.whl.metadata (3.0 kB)
Collecting tenacity!=8.4.0,<9.0.0,>=8.1.0 (from langchain_core)
  Downloading tenacity-8.

In [4]:
import pandas as pd
import numpy as np
import os
from langchain_text_splitters import RecursiveCharacterTextSplitter, CharacterTextSplitter
from langchain.schema import Document
from sentence_transformers import SentenceTransformer
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser, JsonOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from operator import itemgetter
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.documents import Document
# from kaggle_secrets import UserSecretsClient
from google.colab import userdata
from langchain.load import dumps, loads
import gradio as gr

In [5]:
# Kaggle alternative for reading secrets (disabled; this run uses Colab's `userdata`):
# user_secrets = UserSecretsClient()

# Tracing configuration is passed through environment variables; the API key
# comes from the Colab secret named "LANGCHAIN_KEY" rather than being hard-coded.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_API_KEY"] = userdata.get("LANGCHAIN_KEY")

In [6]:
# Read every article found under <data_directory>/<category>/ into a list of
# {'category': <folder name>, 'Content': <file text>} records.
data_directory='/kaggle/input/sanad-dataset'
data=[]
for folder in os.listdir(data_directory):
    folder_path = os.path.join(data_directory, folder)
    # Only sub-directories are categories; stray files at the top level are skipped.
    if not os.path.isdir(folder_path):
        continue
    for file_name in os.listdir(folder_path):
        # The context manager closes the handle even when read() raises, and a
        # separate name keeps the loop variable from being overwritten.
        with open(os.path.join(folder_path, file_name), "r", encoding="utf8") as article_file:
            content = article_file.read()

        data.append({'category': folder, 'Content': content})

In [7]:
# Peek at the first loaded record to check the 'category' / 'Content' structure.
data[0]

{'category': 'Sports',
 'Content': 'أعلن نادي باريس سان جيرمان رسميا، أمس الثلاثاء، تعيين الإسباني أوناي ايمري مدربا للفريق حتى 2018 خلفاً للوران بلان المقال من منصبه.وقال ايمري «أشعر بفخر لتولي الإشراف على تدريب باريس سان جيرمان، لقد أصبح هذا النادي أحد أكبر الأندية على الصعيد الأوروبي ، وأنا سعيد وفخور للفرصة المتاحة أمامي لمساعدته في تحقيق طموحاته ».أما رئيس النادي ناصر الخليفي فقال «سيضع أوناي كل خبرته وقدراته التدريبية وخصاله الإنسانية في تصرف النادي، إنه يملك موهبة قوية لاستخراج أفضل ما لدى اللاعبين».'}

In [8]:
# Wrap the text of the first 40000 loaded records in Document objects
# (the category is not carried over).
documents = [
    Document(page_content=record["Content"])
    for record in data[:40000]
]

Converting the data into documents

In [9]:
# Sanity check: number of documents that will be embedded and indexed.
len(documents)

40000

Embedding the texts so that similarity search can find the documents relevant to a question

In [10]:
class embedding:
    """Thin adapter around a SentenceTransformer model.

    Exposes ``embed_documents`` and ``embed_query``, both returning plain
    Python lists, so an instance can be handed to the vector store as its
    embedding function.

    NOTE(review): the default checkpoint is presumably trained mainly on
    English text while the indexed articles are Arabic; a multilingual
    checkpoint can be selected through ``model_name`` -- confirm against the
    model cards before switching.
    """
    def __init__(self, model_name='all-mpnet-base-v2'):
        """Load the sentence-transformers checkpoint named ``model_name``."""
        self.model_name = model_name
        self.model = SentenceTransformer(model_name)
    def embed_documents(self,docs):
        """Return one embedding (list of floats) per text in ``docs``."""
        docs = list(docs)
        # Nothing to encode: skip the model call and return an empty result.
        if not docs:
            return []
        embeddings= self.model.encode(docs)
        return embeddings.tolist()
    def embed_query(self,query):
        """Return the embedding of a single query string as a list of floats."""
        return self.model.encode(query).tolist()

In [11]:
# Instantiate the embedding wrapper; its constructor loads the SentenceTransformer model.
embed_model=embedding()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [12]:
# Smoke test: embed one sample Arabic question.
result=embed_model.embed_query("ما الجائزة التي حصدتها شعاع كابيتال ؟")

In [13]:
# Length of the query embedding vector.
len(result)

768

In [14]:
# Build the Chroma vector store from `documents`, using `embed_model` to embed them.
# NOTE(review): no persist directory is passed, so the index is presumably
# rebuilt on every run -- confirm before relying on Restart & Run All timing.
vector_database=Chroma.from_documents(documents=documents,embedding=embed_model)

In [15]:
# Similarity-search retriever configured with k=3 results per query.
retriever=vector_database.as_retriever(search_type="similarity",search_kwargs={'k':3})

In [41]:
# Smoke test of the retriever with a sample question (the result is not displayed).
# NOTE(review): the saved execution count of this cell (41) is out of sequence.
retrieved_docs=retriever.invoke("ماذا تعلم عن منطقة الفنون ؟")

construct template

In [20]:
# Kaggle alternative for reading secrets (disabled; this run uses Colab's `userdata`):
# from kaggle_secrets import UserSecretsClient
# user_secrets = UserSecretsClient()
# Read the API key from the Colab secret "GOOGLE_AI_STUDIO2" and build the chat
# model with temperature 0.
google_api_key = userdata.get("GOOGLE_AI_STUDIO2")
llm = ChatGoogleGenerativeAI(model="gemini-pro",google_api_key=google_api_key,temperature=0)

In [21]:

# Multi Query: Different Perspectives
template = """You are an AI language model assistant. Your task is to generate five arabic
different versions of the given user question to retrieve relevant documents from a vector
database. By generating multiple perspectives on the user question, your goal is to help
the user overcome some of the limitations of the distance-based similarity search.
Provide these alternative questions separated by newlines without any additional thoughts outside the answers. Original question: {question}"""
prompt_perspectives = ChatPromptTemplate.from_template(template)


def _split_queries(llm_output):
    """Split the newline-separated model output into a list of queries.

    Blank lines are dropped and surrounding whitespace is stripped, so no
    empty query string is forwarded to the retriever.
    """
    return [line.strip() for line in llm_output.splitlines() if line.strip()]


# prompt -> LLM -> plain string -> list of alternative questions
generate_queries = (
    prompt_perspectives
    | llm
    | StrOutputParser()
    | _split_queries
)


In [22]:


def get_unique_union(documents: list[list]):
    """Return the unique documents from several retrieval results.

    Args:
        documents: one list of retrieved documents per generated query.

    Returns:
        A flat list with each distinct document once, in order of first
        appearance, so the result is deterministic across runs.
    """
    # Serialize each document to a string so it can serve as a dict key;
    # dict.fromkeys removes duplicates while keeping insertion order.
    unique_serialized = dict.fromkeys(
        dumps(doc) for sublist in documents for doc in sublist
    )
    # Deserialize back into document objects.
    return [loads(serialized) for serialized in unique_serialized]

# Retrieve: generate the alternative queries, run the retriever over them via
# `retriever.map()`, then merge the per-query result lists with `get_unique_union`.
retrieval_chain = generate_queries | retriever.map() | get_unique_union




In [23]:
from langchain_core.pydantic_v1 import BaseModel, Field


# Schema holding an answer string together with a context list.
# NOTE(review): not referenced anywhere else in the visible part of this notebook.
class Structeredoutput(BaseModel):
    # Text of the answer.
    answer: str = Field(description="This is the answer to the question")
    # Context items associated with the answer (element type is not constrained).
    context: list = Field(description="this is the context")

In [53]:


# RAG: answer the question from the retrieved context.
template="""
you are an AI powered QA Assistant to provide accurate, contextually relevant answers to customer questions.
at the end of the answer you have to thank the user.
the answer in arabic and in details.
you should answer based on the context provided:
{context}

what to do if answer is not included in the prompt for context:

1.you should appoligize to the user and say that you dont have the answer in your informations

for answer:
1. the output must be in details based on the context provided

Question: {question}

answer:
"""

prompt = ChatPromptTemplate.from_template(template)


def _format_context(docs):
    """Join the text of the retrieved documents into one block for the prompt.

    Without this step the prompt would receive the printed form of a list of
    document objects instead of the article text itself.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The chain input is the raw question string: it is both passed through as
# "question" and fed to the retrieval chain to build "context".
final_rag_chain = (
    {"context": retrieval_chain | _format_context,
     "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)



In [54]:
# End-to-end check of the RAG chain with a sample question.
answer=final_rag_chain.invoke("ماذا تعلم عن منطقة الفنون ؟")
print(answer)

عذراً، لا تتوفر لدي معلومات حول منطقة الفنون في السياق المقدم. شكرًا لك.


In [55]:
import gradio as gr

def getting_answers(question, history=None):
    """Answer ``question`` and extend the chat history.

    Args:
        question: the user's question text.
        history: list of ``(question, answer)`` tuples from earlier turns, or
            ``None`` / empty for a new conversation.

    Returns:
        A 3-tuple ``(history, history, contents)``: the updated history twice
        (once for the chat display, once for the state) and the list of
        ``page_content`` strings of the retrieved documents.
    """
    # Work on a fresh list: a mutable default would be shared by every call,
    # and the state may arrive as None on the first turn.
    history = list(history) if history else []
    answer = final_rag_chain.invoke(question)
    history.append((question, answer))
    # NOTE(review): final_rag_chain already runs retrieval_chain internally, so
    # this is a second, separate retrieval pass used only for display.
    retrieved_docs = retrieval_chain.invoke(question)

    return history, history, [doc.page_content for doc in retrieved_docs]

# Gradio UI around `getting_answers`: the two inputs map to its
# (question, history) parameters and the three outputs to its returned tuple.
demo = gr.Interface(
    fn=getting_answers,
    inputs=["text", "state"],  # "text": the user's question, "state": chat history carried between calls
    outputs=["chatbot", "state","text"],  # "chatbot": chat display, "state": updated history, "text": retrieved document contents
)

# Start the web app; the recorded output shows it was served through a public share link.
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://6d823a768791f4d3e1.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


