# <i>SinAI</i> (The RAG Sefaria Project)

Note: Before running this notebook, be sure you've started Docker and are running a Qdrant instance. For more info, please visit [Qdrant's Website](https://qdrant.tech/)

#### Set OpenAI API Key

In [None]:
%env OPENAI_API_KEY=<openai_api_key> # Replace with your OpenAI API Key

#### Install Dependencies

In [None]:
# Install the Qdrant, LlamaIndex (vector store, file readers, OpenAI LLM and
# embeddings) and Redis client packages into the kernel's environment.
# NOTE(review): versions are not pinned here, so a fresh install may pull
# releases newer than the ones this notebook was written against.
%pip install -q llama-index-vector-stores-qdrant llama-index-readers-file llama-index-llms-openai llama-index-embeddings-openai qdrant_client redis

In [None]:
import logging
import sys
import os
import json
import pprint
import qdrant_client
from qdrant_client import models
from IPython.display import display, Markdown
from llama_index.core import VectorStoreIndex, Settings, Document
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline

# Embedding model shared by every ingestion pipeline and query engine below.
# NOTE(review): the Qdrant collections in this notebook are created with
# size=3072, so switching models presumably requires changing that size too.
embed_model = OpenAIEmbedding(model='text-embedding-3-large', embed_batch_size=100) #Replace with preferred embedding model
# Register the model as LlamaIndex's global default embedding model.
Settings.embed_model = embed_model

#### Instantiate Qdrant client

In [None]:
# Connect to the Qdrant instance on localhost:6333 (it must already be running,
# as described in the note at the top of the notebook).
client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333
)

In [None]:
# Create the "torah" collection with 3072-dimensional vectors compared by
# cosine distance.
# NOTE(review): size must equal the output dimension of embed_model — confirm
# if the embedding model is changed.
client.create_collection(
    collection_name="torah",
    vectors_config=models.VectorParams(size=3072, distance=models.Distance.COSINE),
)

#### Let's load the data

In [None]:
def process_files(base_dir):
    """Build one Document per verse from the merged Hebrew JSON files under ``base_dir``.

    The directory tree is walked recursively; every sub-directory that contains
    ``Hebrew/merged.json`` contributes the verses of that file.

    Args:
        base_dir: Root directory to search.

    Returns:
        A list of ``Document`` objects whose text is a single verse and whose
        metadata holds ``book``, ``language`` and the 1-based ``chapter`` and
        ``verse`` numbers.

    Assumes each JSON file has ``title``/``language`` keys and a ``text`` key
    holding a list of chapters, each a list of verse strings.
    """
    documents = []
    for root, dirs, _files in os.walk(base_dir):
        for dirname in dirs:
            json_file = os.path.join(root, dirname, "Hebrew", "merged.json")
            if not os.path.exists(json_file):
                continue
            # Decode explicitly as UTF-8: the files hold Hebrew text and the
            # platform default encoding is not UTF-8 everywhere.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            book = data.get('title', '')
            language = data.get('language', '')
            for chapter_number, chapter in enumerate(data.get('text', []), start=1):
                for verse_number, verse in enumerate(chapter, start=1):
                    documents.append(
                        Document(
                            text=verse,
                            metadata={
                                "book": book,
                                "language": language,
                                "chapter": chapter_number,
                                "verse": verse_number,
                            },
                        )
                    )
    return documents

In [None]:
# Load every Torah verse as a Document (one Document per verse).
torah_documents = process_files('/path/to/data/json/Tanakh/Torah') # Replace with your Torah directory

In [None]:
# Display the first Document as a quick sanity check of the text and metadata.
torah_documents[0]


#### Connect to Torah Vector Store

In [None]:
# Wrap the "torah" collection so LlamaIndex can write to and query it.
torah_vector_store = QdrantVectorStore(client=client, collection_name="torah")

In [None]:
# Embed every Torah verse and store the vectors in the "torah" collection.
# The documents are supplied once, by keyword, at run time: the first
# positional parameter of IngestionPipeline.run() is show_progress, so passing
# the list positionally only toggled the progress bar while the real input
# came from the constructor.
pipeline = IngestionPipeline(
    transformations=[
        embed_model,
    ],
    vector_store=torah_vector_store,
)

# The trailing semicolon keeps the returned nodes out of the cell output.
pipeline.run(documents=torah_documents, show_progress=True);

#### Let's test it out

In [None]:
# Build an index view over the vectors already stored in the "torah" collection.
torah_index = VectorStoreIndex.from_vector_store(torah_vector_store)

In [None]:
# Retrieve the 10 most similar verses for each query.
query_engine = torah_index.as_query_engine(similarity_top_k=10)

response = query_engine.query("Which pasukim tell me about the laws of Pesach?")

# Show the synthesized answer in bold, then each retrieved verse with its
# book/chapter/verse metadata.
display(Markdown(f"<b>{response.response}</b>"))
for pasuk in response.source_nodes:
    pprint.pp(pasuk.text)
    pprint.pp(pasuk.metadata)

# Nevi'im Ingestion

In [None]:
# Create the "neviim" collection (3072-dimensional vectors, cosine distance).
client.create_collection(
    collection_name="neviim",
    vectors_config=models.VectorParams(size=3072, distance=models.Distance.COSINE),
)

neviim_docs = process_files('/path/to/data/json/Tanakh/Prophets') # Replace with your Neviim directory

neviim_vector_store = QdrantVectorStore(client=client, collection_name="neviim")

# The documents are supplied once, by keyword, at run time: the first
# positional parameter of IngestionPipeline.run() is show_progress, so passing
# the list positionally only toggled the progress bar while the real input
# came from the constructor.
pipeline = IngestionPipeline(
    transformations=[
        embed_model,
    ],
    vector_store=neviim_vector_store,
)

# The trailing semicolon keeps the returned nodes out of the cell output.
pipeline.run(documents=neviim_docs, show_progress=True);

In [None]:
# Query the "neviim" collection, retrieving the 20 most similar verses.
neviim_index = VectorStoreIndex.from_vector_store(neviim_vector_store)
neviim_query_engine = neviim_index.as_query_engine(similarity_top_k=20)
response = neviim_query_engine.query("Can you detail the story of Isaiah with relevant verses?")
# Displays the response object itself (not only its text).
display(response)

# Ketuvim Ingestion

In [None]:
# Create the "ketuvim" collection (3072-dimensional vectors, cosine distance).
client.create_collection(
    collection_name="ketuvim",
    vectors_config=models.VectorParams(size=3072, distance=models.Distance.COSINE),
)

ketuvim_docs = process_files('/path/to/your/data/json/Tanakh/Writings') # Replace with your Ketuvim directory

ketuvim_vector_store = QdrantVectorStore(client=client, collection_name="ketuvim")

# The documents are supplied once, by keyword, at run time: the first
# positional parameter of IngestionPipeline.run() is show_progress, so passing
# the list positionally only toggled the progress bar while the real input
# came from the constructor.
pipeline = IngestionPipeline(
    transformations=[
        embed_model,
    ],
    vector_store=ketuvim_vector_store,
)

# The trailing semicolon keeps the returned nodes out of the cell output.
pipeline.run(documents=ketuvim_docs, show_progress=True);

# Ingest Mishnah

In [None]:
def process_mishna(base_dir):
    """Build one Document per mishna from the merged Hebrew JSON files under ``base_dir``.

    The directory tree is walked recursively; every sub-directory that contains
    ``Hebrew/merged.json`` contributes its text. The seder is taken from the
    name of the directory that holds the masechta folder (with a ``"Seder "``
    prefix removed) and the masechta from the JSON ``title`` (with a
    ``"Mishnah "`` prefix removed).

    Args:
        base_dir: Root directory to search.

    Returns:
        A list of ``Document`` objects whose metadata holds ``seder``,
        ``masechta``, ``language`` and the 1-based ``chapter`` and ``mishna``
        numbers.

    Assumes the JSON ``text`` key is a list of chapters, each a list of
    mishna strings.
    """
    documents = []
    for root, dirs, _files in os.walk(base_dir):
        # Last path component of the directory currently being walked.
        seder = os.path.normpath(root).split(os.sep)[-1].replace("Seder ", "")
        for dirname in dirs:
            json_file = os.path.join(root, dirname, "Hebrew", "merged.json")
            if not os.path.exists(json_file):
                continue
            # Decode explicitly as UTF-8: the files hold Hebrew text and the
            # platform default encoding is not UTF-8 everywhere.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            masechta = data.get('title', '').replace("Mishnah ", "")
            language = data.get('language', '')
            for chapter_number, chapter in enumerate(data.get('text', []), start=1):
                for mishna_number, mishna in enumerate(chapter, start=1):
                    documents.append(
                        Document(
                            text=mishna,
                            metadata={
                                "seder": seder,
                                "masechta": masechta,
                                "language": language,
                                "chapter": chapter_number,  # chapters are 1-indexed
                                "mishna": mishna_number,    # mishnayot are 1-indexed
                            },
                        )
                    )
    return documents

In [None]:
# Create the "mishna" collection (3072-dimensional vectors, cosine distance).
client.create_collection(
    collection_name="mishna",
    vectors_config=models.VectorParams(size=3072, distance=models.Distance.COSINE),
)

mishna_docs = process_mishna('/path/to/your/data/json/Mishnah/Mishna') # Replace with your Mishna directory

mishna_vector_store = QdrantVectorStore(client=client, collection_name="mishna")

# The documents are supplied once, by keyword, at run time: the first
# positional parameter of IngestionPipeline.run() is show_progress, so passing
# the list positionally only toggled the progress bar while the real input
# came from the constructor.
pipeline = IngestionPipeline(
    transformations=[
        embed_model,
    ],
    vector_store=mishna_vector_store,
)

# The trailing semicolon keeps the returned nodes out of the cell output.
pipeline.run(documents=mishna_docs, show_progress=True);

# Ingest Talmud Bavli

#### Start Redis

In [None]:
# Start a local Redis server as a background daemon. It is used in the Rishonim
# section, where redis_client connects to localhost:6379.
!redis-server --daemonize yes

#### Ingestion

In [None]:
def amud_to_daf(amud):
    """Convert a 1-based amud position into a daf label such as ``"2a"``.

    Odd positions are the ``a`` side and even positions the ``b`` side, so
    position 3 is ``"2a"`` and position 4 is ``"2b"``.
    """
    # (amud + 1) // 2 is the daf number; a zero remainder means amud is odd.
    daf_number, remainder = divmod(amud + 1, 2)
    side = 'a' if remainder == 0 else 'b'
    return f"{daf_number}{side}"

def process_talmud(base_dir):
    """Build one Document per sentence from the merged Hebrew Gemara JSON files.

    The directory tree under ``base_dir`` is walked recursively; every
    sub-directory that contains ``Hebrew/merged.json`` contributes its text.
    The first two entries of the JSON ``text`` list are skipped, and entry
    ``i`` (0-based) is labelled ``amud_to_daf(i + 1)``, so entry 2 is ``"2a"``.

    Args:
        base_dir: Root directory to search.

    Returns:
        A list of ``Document`` objects whose metadata holds ``seder``,
        ``masechta``, ``language``, ``daf`` and the 1-based ``sentence`` number.

    Assumes each entry of ``text`` is a list of sentence strings.
    """
    documents = []
    for root, dirs, _files in os.walk(base_dir):
        # Last path component of the directory currently being walked.
        seder = os.path.normpath(root).split(os.sep)[-1].replace("Seder ", "")
        for dirname in dirs:
            json_file = os.path.join(root, dirname, "Hebrew", "merged.json")
            if not os.path.exists(json_file):
                continue
            # Decode explicitly as UTF-8: the files hold Hebrew text and the
            # platform default encoding is not UTF-8 everywhere.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            masechta = data.get('title', '')
            language = data.get('language', '')
            for amud_index, amud_sentences in enumerate(data.get('text', [])):
                if amud_index < 2:
                    # Entries 0 and 1 are skipped; labelling starts at "2a".
                    continue
                daf = amud_to_daf(amud_index + 1)
                for sentence_number, sentence in enumerate(amud_sentences, start=1):
                    documents.append(
                        Document(
                            text=sentence,
                            metadata={
                                "seder": seder,
                                "masechta": masechta,
                                "language": language,
                                "daf": daf,
                                "sentence": sentence_number,
                            },
                        )
                    )
    return documents

def batch_documents(documents, batch_size):
    """Lazily yield consecutive slices of ``documents`` of at most ``batch_size`` items."""
    yield from (
        documents[start:start + batch_size]
        for start in range(0, len(documents), batch_size)
    )


In [None]:
# Create the collection that holds the Talmud Bavli sentences.
client.create_collection(
    collection_name="talmud_bavli",
    vectors_config=models.VectorParams(size=3072, distance=models.Distance.COSINE),
)

talmud_docs = process_talmud('/path/to/your/data/json/Talmud/Bavli/Gemara') # Replace with your Gemara directory
# Target the collection created above; the store previously pointed at
# "talmud", a name that does not match the "talmud_bavli" collection.
talmud_vector_store = QdrantVectorStore(client=client, collection_name="talmud_bavli")

pipeline = IngestionPipeline(
    transformations=[
        embed_model,
    ],
    vector_store=talmud_vector_store,
)

# Ingest in slices of 4000 documents rather than in a single run.
for batch in batch_documents(talmud_docs, 4000):
    pipeline.run(documents=batch)

# Ingest Rishonim on the Talmud

In [None]:
import hashlib
import redis
from llama_index.core.node_parser import SentenceSplitter

# Redis connection used by ingest_documents to remember which document hashes
# have already been ingested.
redis_client = redis.Redis(host='localhost', port=6379, db=0)

# Splitter placed ahead of the embedding step in the Rishonim pipeline.
splitter = SentenceSplitter(
    chunk_size=2048,
    chunk_overlap=20,
)

# Send INFO-level log records to rishonim.log.
logging.basicConfig(filename='rishonim.log', level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Same embedding model and Qdrant connection settings as at the top of the
# notebook, re-created here.
embed_model = OpenAIEmbedding(model='text-embedding-3-large', embed_batch_size=100)
Settings.embed_model = embed_model

client = qdrant_client.QdrantClient(
    host="localhost",
    port=6333
)

def amud_to_daf(amud):
    """Return the daf label (for example ``"2a"``) for a 1-based amud position.

    Odd positions map to side ``a``, even positions to side ``b``.
    """
    side = 'a' if amud % 2 else 'b'
    # Equivalent to (amud - 3) // 2 + 2 for integer positions.
    return str((amud + 1) // 2) + side

def generate_document_hash(document):
    """Return a SHA-256 hex digest identifying a document by its text and metadata.

    The digest covers the text plus the ``rishon``, ``seder``, ``masechta``,
    ``language``, page and ``sentence`` metadata values, concatenated in that
    order.

    Args:
        document: Object exposing ``text`` and a ``metadata`` mapping.

    Returns:
        The hexadecimal SHA-256 digest as a string.

    Raises:
        KeyError: If a required metadata key is missing.
    """
    meta = document.metadata
    # Documents built by process_talmud store the page under "daf"; the
    # original lookup of "amud" raised KeyError for them. "amud" is still
    # preferred when present so inputs that already worked hash the same.
    page = meta['amud'] if 'amud' in meta else meta['daf']
    hash_input = f"{document.text}{meta['rishon']}{meta['seder']}{meta['masechta']}{meta['language']}{page}{meta['sentence']}"
    return hashlib.sha256(hash_input.encode()).hexdigest()

def flatten_text(nested_list):
    """Yield every non-blank string found in an arbitrarily nested list, in order.

    Strings that are empty or whitespace-only are dropped, as are items that
    are neither strings nor lists.
    """
    # Depth-first traversal with an explicit stack of iterators.
    pending = [iter(nested_list)]
    while pending:
        for entry in pending[-1]:
            if isinstance(entry, list):
                # Descend now; the parent iterator resumes afterwards.
                pending.append(iter(entry))
                break
            if isinstance(entry, str) and entry.strip():
                yield entry
        else:
            # Current level exhausted: return to the parent.
            pending.pop()

def process_talmud(base_dir, rishon):
    """Yield one Document per non-blank sentence of a Rishon's commentary.

    The directory tree under ``base_dir`` is walked recursively; every
    sub-directory that contains ``Hebrew/merged.json`` contributes its text.
    The first two entries of the JSON ``text`` list are skipped, and entry
    ``i`` (0-based) is labelled ``amud_to_daf(i + 1)``. Each remaining entry
    is flattened with ``flatten_text`` and its sentences numbered from 1.

    This definition replaces the earlier one-argument ``process_talmud``
    defined in the Talmud Bavli section.

    Args:
        base_dir: Directory holding one Rishon's texts.
        rishon: Name of the Rishon; stored in the metadata and stripped from
            the JSON ``title`` (as ``"<rishon> on "`` or ``"<rishon> "``) to
            obtain the masechta.

    Yields:
        ``Document`` objects whose metadata holds ``rishon``, ``seder``,
        ``masechta``, ``language``, ``daf`` and ``sentence``.
    """
    for root, dirs, _files in os.walk(base_dir):
        # Last path component of the directory currently being walked.
        seder = os.path.normpath(root).split(os.sep)[-1].replace("Seder ", "")
        for dirname in dirs:
            json_file = os.path.join(root, dirname, "Hebrew", "merged.json")
            if not os.path.exists(json_file):
                continue
            # Decode explicitly as UTF-8: the files hold Hebrew text and the
            # platform default encoding is not UTF-8 everywhere.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
            masechta = data.get('title', '').replace(f"{rishon} on ", "").replace(f"{rishon} ", "")
            language = data.get('language', '')
            for amud_index, amud in enumerate(data.get('text', [])):
                if amud_index < 2:
                    # Entries 0 and 1 are skipped; labelling starts at "2a".
                    continue
                daf = amud_to_daf(amud_index + 1)
                for sentence_number, sentence in enumerate(flatten_text(amud), start=1):
                    yield Document(
                        text=sentence,
                        metadata={
                            "rishon": rishon,
                            "seder": seder,
                            "masechta": masechta,
                            "language": language,
                            "daf": daf,
                            "sentence": sentence_number,
                        },
                    )

def process_all_rishonim(directory):
    """Collect the documents of every Rishon sub-directory of ``directory`` into one list.

    Each immediate sub-directory name is passed to ``process_talmud`` as the
    Rishon's name; entries that are not directories are ignored.
    """
    collected = []
    for entry in os.listdir(directory):
        candidate = os.path.join(directory, entry)
        if not os.path.isdir(candidate):
            continue
        # Consume the generator returned by process_talmud.
        collected += process_talmud(candidate, entry)
    return collected

def batch_documents(documents, batch_size):
    """Generate successive chunks of ``documents`` holding up to ``batch_size`` items each."""
    offsets = range(0, len(documents), batch_size)
    for offset in offsets:
        stop = offset + batch_size
        yield documents[offset:stop]

def _ingest_unseen(pending, redis_pipeline):
    """Run the queued SISMEMBER checks and ingest the documents Redis has not seen."""
    if not pending:
        return
    # One result per queued sismember call, in the order the calls were queued.
    already_seen = redis_pipeline.execute()
    fresh = [pair for pair, seen in zip(pending, already_seen) if not seen]
    if not fresh:
        return
    new_documents = [doc for doc, _digest in fresh]
    pipeline.run(documents=new_documents)
    # Record the hashes only after the ingestion call returned, in one SADD.
    redis_client.sadd("processed_hashes", *(digest for _doc, digest in fresh))
    logging.info(f"Ingesting {len(new_documents)} documents")
    logging.info(f"{new_documents[0].metadata}")


def ingest_documents(directory, batch_size=4000):
    """Ingest every Rishon's documents under ``directory``, skipping ones already processed.

    Each immediate sub-directory of ``directory`` is treated as one Rishon and
    read with ``process_talmud``. Every document is hashed with
    ``generate_document_hash``; hashes present in the Redis set
    ``processed_hashes`` are skipped, and the rest are sent to the
    module-level ``pipeline`` and then added to that set.

    Args:
        directory: Directory whose sub-directories are named after Rishonim.
        batch_size: Number of documents checked and ingested per batch.

    Returns:
        None.
    """
    pending = []  # (document, hash) pairs awaiting the membership check
    redis_pipeline = redis_client.pipeline()

    for rishon in os.listdir(directory):
        rishon_dir = os.path.join(directory, rishon)
        if not os.path.isdir(rishon_dir):
            continue
        for document in process_talmud(rishon_dir, rishon):
            doc_hash = generate_document_hash(document)
            # Queue the membership check; it runs when the batch is flushed.
            redis_pipeline.sismember("processed_hashes", doc_hash)
            pending.append((document, doc_hash))

            if len(pending) >= batch_size:
                _ingest_unseen(pending, redis_pipeline)
                pending = []
                redis_pipeline = redis_client.pipeline()

    # Flush the final, partially filled batch (a no-op when nothing is pending).
    _ingest_unseen(pending, redis_pipeline)

In [None]:
# Create the "rishonim_bavli" collection (3072-dimensional vectors, cosine distance).
client.create_collection(
    collection_name="rishonim_bavli",
    vectors_config=models.VectorParams(size=3072, distance=models.Distance.COSINE),
)

rishonim_vector_store = QdrantVectorStore(client=client, collection_name="rishonim_bavli")

# ingest_documents reads this module-level `pipeline`, so it must be assigned
# before the call below. Unlike the earlier pipelines, this one runs the
# sentence splitter ahead of the embedding step.
pipeline = IngestionPipeline(
    transformations=[
        splitter,
        embed_model,
    ],
    vector_store=rishonim_vector_store,
)

ingest_documents('/path/to/json/Talmud/Bavli/Rishonim on Talmud') # Replace with your Rishonim directory


### Shut Down Redis

In [None]:
# Ask the local Redis server to shut down.
!redis-cli shutdown