# ![icon.svg](https://cocoindex.io/icon.svg) Welcome to [Cocoindex](https://cocoindex.io/)




# ![icon.svg](https://cocoindex.io/icon.svg) This example shows how to get started with Cocoindex by building embeddings for RAG

# Install Cocoindex and other required packages using pip

In [None]:
%pip install cocoindex python-dotenv

# Grab some markdown files for demo

In [None]:
# Download the three demo documents into ./markdown_files.
# -nc (no-clobber) makes this cell safe to re-run: without it wget saves
# duplicate copies such as "rfc8259.md.1" next to the originals, which the
# flow's LocalFile source would presumably pick up as extra documents.
!mkdir -p markdown_files && \
wget -nc -P markdown_files https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/examples/text_embedding/markdown_files/1706.03762v7.md && \
wget -nc -P markdown_files https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/examples/text_embedding/markdown_files/1810.04805v2.md && \
wget -nc -P markdown_files https://raw.githubusercontent.com/cocoindex-io/cocoindex/refs/heads/main/examples/text_embedding/markdown_files/rfc8259.md


# Create a Postgres Server

In [None]:
# Provision a local PostgreSQL 17 server with the pgvector extension, plus a
# "cocoindex" role (password "cocoindex") and a "cocoindex" database it owns.
# NOTE: only the CREATE EXTENSION step uses IF NOT EXISTS; re-running this cell
# will report errors from CREATE USER and createdb because they already exist.

# Update package lists
!sudo apt-get update

# Install PostgreSQL setup helper
!sudo apt install -y postgresql-common

# Feed empty lines (i.e. press Enter) to the setup script so it runs unattended
!yes "" | sudo /usr/share/postgresql-common/pgdg/apt.postgresql.org.sh

# Install PostgreSQL 17 and pgvector extension
!sudo apt install -y postgresql-17 postgresql-17-pgvector

# Start PostgreSQL service
!sudo service postgresql start

# Create user and database for cocoindex (the database is owned by that user)
!sudo -u postgres psql -c "CREATE USER cocoindex WITH PASSWORD 'cocoindex';"
!sudo -u postgres createdb cocoindex -O cocoindex

# Enable the pgvector extension inside the cocoindex database
!sudo -u postgres psql -d cocoindex -c "CREATE EXTENSION IF NOT EXISTS vector;"



# Write the Postgres connection URL to .env

In [None]:
%%writefile .env
COCOINDEX_DATABASE_URL="postgresql://cocoindex:cocoindex@localhost:5432/cocoindex"

# Create a new file and import modules

In [None]:
%%writefile main.py
from dotenv import load_dotenv
import cocoindex


# Define your embedding function

In [None]:
%%writefile -a main.py

def text_to_embedding(text: cocoindex.DataSlice) -> cocoindex.DataSlice:
    """
    Turn a text slice into an embedding slice.

    The embedding is produced by the SentenceTransformer model
    "sentence-transformers/all-MiniLM-L6-v2". Indexing and querying both
    call this function, so they share one embedding definition.
    """
    embedder = cocoindex.functions.SentenceTransformerEmbed(
        model="sentence-transformers/all-MiniLM-L6-v2")
    return text.transform(embedder)


# Define your flow

In [None]:
%%writefile -a main.py

@cocoindex.flow_def(name="TextEmbedding")
def text_embedding_flow(flow_builder: cocoindex.FlowBuilder, data_scope: cocoindex.DataScope):
    """
    Indexing flow: read local files, split them into chunks, embed each
    chunk, and export the collected rows to Postgres with a vector index.
    """
    # Source: the local "markdown_files" directory.
    markdown_source = cocoindex.sources.LocalFile(path="markdown_files")
    data_scope["documents"] = flow_builder.add_source(markdown_source)

    # Collector that gathers one row per chunk for the export below.
    doc_embeddings = data_scope.add_collector()

    with data_scope["documents"].row() as doc:
        # Split each document's content as markdown into overlapping chunks.
        splitter = cocoindex.functions.SplitRecursively()
        doc["chunks"] = doc["content"].transform(
            splitter, language="markdown", chunk_size=2000, chunk_overlap=500)

        with doc["chunks"].row() as chunk:
            chunk["embedding"] = text_to_embedding(chunk["text"])
            doc_embeddings.collect(
                filename=doc["filename"],
                location=chunk["location"],
                text=chunk["text"],
                embedding=chunk["embedding"],
            )

    # Export target and the cosine-similarity index on the "embedding" field.
    postgres_target = cocoindex.storages.Postgres()
    embedding_index = cocoindex.VectorIndexDef(
        field_name="embedding",
        metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)
    doc_embeddings.export(
        "doc_embeddings",
        postgres_target,
        primary_key_fields=["filename", "location"],
        vector_indexes=[embedding_index])



# Define query handler



In [None]:
%%writefile -a main.py

# Semantic-search handler over the "doc_embeddings" target exported by
# text_embedding_flow. Queries are passed through text_to_embedding, the same
# function the flow applies to chunks, and cosine similarity is the default
# metric (matching the vector index defined in the flow).
query_handler = cocoindex.query.SimpleSemanticsQueryHandler(
    name="SemanticsSearch",
    flow=text_embedding_flow,
    target_name="doc_embeddings",
    query_transform_flow=text_to_embedding,
    default_similarity_metric=cocoindex.VectorSimilarityMetric.COSINE_SIMILARITY)


# Define search function and main

In [None]:
%%writefile -a main.py

@cocoindex.main_fn()
def _run():
    """
    Interactive search loop.

    Repeatedly prompts for a query, asks query_handler for up to 10 matches,
    and prints each match's score, filename and text.

    The loop ends when the user submits an empty query, presses Ctrl-C
    (KeyboardInterrupt), or stdin reaches end-of-file (EOFError), e.g. on
    Ctrl-D or when the script runs without an interactive stdin.
    """
    while True:
        try:
            query = input("Enter search query (or Enter to quit): ")
            if not query:
                break
            # search() returns a pair; only the list of results is used here.
            results, _ = query_handler.search(query, 10)
            print("\nSearch results:")
            for result in results:
                print(f"[{result.score:.3f}] {result.data['filename']}")
                print(f"    {result.data['text']}")
                print("---")
            print()
        except (KeyboardInterrupt, EOFError):
            # input() raises EOFError when stdin is closed or exhausted;
            # treat it like Ctrl-C and quit instead of dying with a traceback.
            break

if __name__ == "__main__":
    # Load settings from .env before running. Per python-dotenv, override=True
    # lets values from .env replace variables already set in the environment.
    load_dotenv(override=True)
    _run()


# Setup

In [None]:
!yes yes | python main.py cocoindex setup

# Update

In [None]:
!python main.py cocoindex update

# Run query

In [None]:
!python main.py