In [1]:
# Upgrade the packaging toolchain first, then install the notebook's runtime dependencies.
# NOTE(review): no versions are pinned, so a fresh run may resolve different releases.
%pip install --upgrade --quiet pip setuptools wheel
%pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken crate 'crate[sqlalchemy]' pandas jq 
# python-dotenv is installed in its own step with --use-pep517.
%pip install --use-pep517 --quiet python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Use CrateDB as fulltext search retriever and Mistral-7B as language model

## Setup environment variables

In [2]:
import os

from dotenv import load_dotenv

# Load environment variables via python-dotenv (by default from a .env file) so that
# later cells can read credentials from os.environ. The call's return value is the cell output.
load_dotenv()

True

## Setup embeddings

In [3]:
from langchain_openai import OpenAIEmbeddings

# Embedding client created with library defaults; no model name or API key is passed
# explicitly (presumably the key comes from the environment loaded earlier — confirm).
embeddings = OpenAIEmbeddings()
# Embed a throwaway string as a smoke test; the cell output is the embedding vector length.
len(embeddings.embed_query("a"))

1536

In [4]:
# Assemble the CrateDB connection URL from credentials held in the environment.
# Each lookup uses os.environ[...], so a missing variable fails right here with
# a KeyError instead of yielding a malformed URL.
conn_url = "crate://{user}:{password}@{server}".format_map(
    {
        "user": os.environ["CRATEDB_USER"],
        "password": os.environ["CRATEDB_PASS"],
        "server": os.environ["CRATEDB_SERVER"],
    }
)
# NOTE: echoing the URL also writes the password into the saved cell output.
conn_url

'crate://crate:@localhost:4201'

In [5]:
# open file
from langchain_community.document_loaders import JSONLoader, DirectoryLoader


def metadata_func(record: dict, metadata: dict) -> dict:
    """Enrich a loader metadata dict with the record's URL and title.

    Args:
        record: One JSON record; its ``url`` and ``title`` keys are read
            (a missing key yields ``None``).
        metadata: Metadata dict handed in by the loader; it is updated in place.

    Returns:
        The same ``metadata`` object with ``source_url`` and ``source_title``
        set. If it already had a ``source`` entry, that entry is replaced by
        the record URL.
    """
    url = record.get("url")
    metadata.update(source_url=url, source_title=record.get("title"))

    # Only rewrite "source" when it is already present; never introduce it.
    if "source" in metadata:
        metadata["source"] = url

    return metadata


# Load every file matching "everything-*.json" in the working directory.
# Each file is handed to JSONLoader with the kwargs below: the jq expression ".[]"
# selects the records, the "html" key supplies the document text, and
# metadata_func attaches the URL/title metadata.
loader = DirectoryLoader(
    './',
    glob="everything-*.json",
    loader_cls=JSONLoader,
    loader_kwargs={
        "jq_schema": ".[]",
        "text_content": False,
        "content_key": "html",
        "metadata_func": metadata_func,
    }
)

# Read all matching files into memory as a list of documents.
data = loader.load()
# Uncomment to inspect the first loaded document:
# data[:1]

In [6]:
# split documents
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 500 characters (length is
# measured with len) with a 50-character overlap between neighbouring chunks.
# Separators are literal strings, not regexes (is_separator_regex=False).
# NOTE(review): " " is listed before "." and ",", so the punctuation separators
# are presumably only reached for text without spaces — confirm this order is intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
    ],
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

# Chunked documents that are indexed into the vector store further down.
docs_splits = text_splitter.split_documents(data)
# Uncomment to inspect the first two chunks:
# docs_splits[:2]

## RAG search, indexing pipeline

In [7]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [8]:
from langchain_community.utilities.sql_database import SQLDatabase
from rag.vectorstore.crate import CrateVectorStore

# Alternative kept for reference: construct the store directly on a SQLDatabase
# handle (drop_if_exists presumably recreates the storage — confirm in CrateVectorStore).
# vectorstore = CrateVectorStore(
#     embeddings=embeddings,
#     db=SQLDatabase.from_uri(conn_url),
#     drop_if_exists=True,
# )
# Build the project's CrateDB-backed vector store from the chunked documents,
# using the embedding client and the connection URL defined in earlier cells.
# NOTE(review): with drop_if_exists left commented out, re-running this cell
# presumably inserts the same chunks again — confirm before re-executing.
vectorstore = CrateVectorStore.from_documents(
    documents=docs_splits,
    embedding=embeddings,
    database_kwargs={
        "database_uri": conn_url,
    },
    # Optional switch, currently disabled:
    # vectorstore_kwargs={
    #    "drop_if_exists" : True,
    # },
)
vectorstore

<rag.vectorstore.crate.CrateVectorStore at 0x16dfff8d0>

In [9]:
# Wrap the vector store as a retriever; search_kwargs carries the search options
# (result count k, candidate pool fetch_k, and the search algorithm selector).
# NOTE(review): the key "algorith" looks like a typo for "algorithm" — confirm which
# name CrateVectorStore reads; if the key is silently ignored, full-text search is
# not actually being used by this notebook.
retriever = vectorstore.as_retriever(
    search_kwargs={'k': 10, 'fetch_k': 100, "algorith": "fulltext"}
)

In [10]:
import json

In [11]:
# Prompt text: tells the model to answer only from the supplied context, to link
# sources where possible, and to reply in markdown. The {context} and {question}
# placeholders are the template's two input variables.
# NOTE(review): the date inside the prompt is hard-coded and will go stale.
template = """Answer the question based only on the following context, if possible use links inside answer to reference the source, use markdown:

today date is 2024 April 3rd

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# Chat model created with library defaults; no model name is passed here.
model = ChatOpenAI()


def format_docs(docs):
    """Serialize retrieved documents into a JSON string for the prompt context.

    Args:
        docs: Iterable of document objects exposing ``page_content`` and a
            ``metadata`` mapping.

    Returns:
        A JSON array string. Each element holds the chunk ``text`` and its
        ``source`` (``null`` when the metadata has no ``source`` key).
    """
    # The leftover breakpoint() call was removed: it dropped every chain
    # invocation into the debugger and stalled non-interactive runs.
    return json.dumps(
        [{"text": doc.page_content, "source": doc.metadata.get("source")} for doc in docs]
    )


# Pipeline: the input question goes to the retriever, whose documents are serialized
# by format_docs into "context", and is also passed through unchanged as "question".
# Both fill the prompt, the prompt goes to the model, and the reply is parsed to a string.
chain = (
        {"context": retriever | format_docs,
         "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
)

# Sample questions; only one is active, the others are kept as alternatives.
# result = chain.invoke("How to limit permissions?")
# result = chain.invoke(" How AWS marketplace works, and why I cannot see deployment in my account?")
# result = chain.invoke("What are edge regions and how to use them?")
result = chain.invoke("Write me example of using blobs?")
# result = chain.invoke("How to use BLOB store in CrateDB? and what are the benefits?")
# Show the raw answer string (escaped newlines and markdown markup included).
result


'To use blobs in CrateDB, you first need to create a blob table. You can do this using the Crate Shell (CraSh) and issuing a SQL statement like this:\n\n```sh\ncrash -c "create blob table myblobs clustered into 3 shards with (number_of_replicas=1)"\n```\n\nOnce the blob table is created, you can start working with your blob container by uploading blobs. The blob container can work with files or file-like objects that produce bytes when read. To upload a blob, you can use a PUT request with the SHA1 hash of the blob as its ID.\n\nFor example, you can upload a blob by issuing a PUT request like this:\n\n```sh\ncurl -X PUT \'http://127.0.0.1:4200/_blobs/myblobs/4a756ca07e9487f482465a99e8286abc86ba4dc7\' -d \'contents\'\n```\n\nTo retrieve blobs, you can use a GET request to the appropriate endpoint. And to delete a blob, you can use a DELETE request.\n\nThese are just some basic examples of using blobs in CrateDB. For more detailed information and examples, you can refer to the [official 

In [12]:
from IPython.display import display, Markdown

# Render the answer stored in `result` as formatted Markdown instead of a raw string.
display(Markdown(result))

To use blobs in CrateDB, you first need to create a blob table. You can do this using the Crate Shell (CraSh) and issuing a SQL statement like this:

```sh
crash -c "create blob table myblobs clustered into 3 shards with (number_of_replicas=1)"
```

Once the blob table is created, you can start working with your blob container by uploading blobs. The blob container can work with files or file-like objects that produce bytes when read. To upload a blob, you can use a PUT request with the SHA1 hash of the blob as its ID.

For example, you can upload a blob by issuing a PUT request like this:

```sh
curl -X PUT 'http://127.0.0.1:4200/_blobs/myblobs/4a756ca07e9487f482465a99e8286abc86ba4dc7' -d 'contents'
```

To retrieve blobs, you can use a GET request to the appropriate endpoint. And to delete a blob, you can use a DELETE request.

These are just some basic examples of using blobs in CrateDB. For more detailed information and examples, you can refer to the [official documentation](https://cratedb.com/docs/crate/reference/en/5.6/general/blobs.html).

In [13]:
# Ask the chain about edge regions and render the returned string as Markdown.
display(Markdown(chain.invoke("What are edge regions and how to use them?")))

Edge regions are custom regions where customers can deploy their CrateDB clusters on their own infrastructure, such as AWS, Azure, or physical servers at their facilities. Kubernetes serves as the interface hosting the CrateDB Edge clusters, giving customers control over their data through the CrateDB Cloud web interface. 

To use edge regions, customers can create a custom region in the CrateDB Cloud console, deploy a cluster in that region, and upgrade the edge region components regularly for new features, bug fixes, and security updates. Customers can also clone clusters between cloud providers and the Edge environment for disaster recovery or transitioning to/from on-premises hardware.

Source: [Crate.io Expands CrateDB Cloud with the Launch of CrateDB Edge](https://cratedb.com/press/crate-io-expands-cratedb-cloud-with-the-launch-of-cratedb-edge)

In [14]:
# Ask a two-part AWS Marketplace question and render the returned string as Markdown.
display(Markdown(chain.invoke("How AWS marketplace works, and why I cannot see deployment in my account?")))

AWS Marketplace allows users to subscribe to services such as CrateDB Cloud directly through their AWS account. When you subscribe to a service on AWS Marketplace, your hourly usage is billed directly by Amazon, not by Crate.io. Once you subscribe, you will be redirected to a page where you can confirm the subscription.

If you are unable to see the deployment in your account after subscribing, it may take up to 10 minutes for the subscription to be confirmed and usable in the CrateDB Cloud console. Make sure you are logged in to the AWS Marketplace with the account you used to subscribe to the offer. You can find your account name in the top right corner, and in the dropdown menu, select 'Your Marketplace Software' to manage your subscriptions.

For more information on how to subscribe to CrateDB Cloud on AWS Marketplace, you can refer to [this source](https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/marketplace/subscribe-aws.html).

In [15]:
# Ask for recent blog posts; the prompt's hard-coded date is the model's only notion of "recent".
display(Markdown(chain.invoke("What are recent blog posts about CrateDB?")))

Recent blog posts about CrateDB include:

1. [Unlocking data insights at scale for the mining industry](https://cratedb.com/blog/latest-product-news-events-and-tutorials-around-cratedb)
2. [Ingesting and handling large amounts of data with CrateDB](https://cratedb.com/blog/crate-commoncrawl)
3. [Crate 0.55, Our Biggest Release Yet?](https://cratedb.com/blog/tag/news)
4. [Crate.io at Software Engineering Daily](https://cratedb.com/blog/tag/news)
5. [Newsletter: Know all about CrateDB v.5.5 release](https://cratedb.com/blog/crate-series-a-round)
6. [Newsletter: Big news: The CrateDB Tour is coming to Berlin and a new release is out](https://cratedb.com/blog/cratedb-v4-6-is-now-stable-and-ready-to-use)
7. [Distributed query execution in CrateDB: What you need to know](https://cratedb.com/blog/correlated-sub-queries-in-cratedb)
8. [Newsletter: Latest product news, events, and tutorials around CrateDB](https://cratedb.com/blog/author/michael-kremmel)
9. [How we scaled ingestion to one million rows per second](https://cratedb.com/blog/testing-release-0-41-0)
10. [CrateDB v5.2 brings support for (backward) scrollable cursors, bitwise operators and min_by/max_by aggregations](https://cratedb.com/blog/tag/cratedb-cloud)

These are some of the recent blog posts related to CrateDB.

In [16]:
# Ask for a Python usage example and render the returned string as Markdown.
display(Markdown(chain.invoke("Write me example python code to use CrateDB?")))

To use CrateDB in Python, you can follow these steps:

1. Install the Crate Python driver by running:
```bash
pip install crate
```

2. Connect to CrateDB using the Python driver:
```python
from crate import client

# Connect to CrateDB
connection = client.connect("http://localhost:4200")

# Create a cursor
cursor = connection.cursor()

# Execute a SQL query
cursor.execute("SELECT * FROM my_table")

# Fetch and print results
for row in cursor.fetchall():
    print(row)

# Close cursor and connection
cursor.close()
connection.close()
```

3. You can find more information and examples in the [CrateDB Python driver documentation](https://cratedb.com/connect/python).

Remember to adjust the connection URL and SQL query according to your CrateDB setup and database structure.

In [17]:
# Ask for a Go usage example and render the returned string as Markdown.
display(Markdown(chain.invoke("Write me example golang code to use CrateDB?")))

To use CrateDB with Go, you can connect to CrateDB using the pgx driver. Here is an example of how you can interact with CrateDB using the pgx driver in Go:

```go
package main

import (
    "context"
    "fmt"
    "os"

    "github.com/jackc/pgx/v5"
)

func main() {
    // Establish a connection to CrateDB
    conn, err := pgx.Connect(context.Background(), "postgresql://username:password@localhost:5432/cratedb")
    if err != nil {
        fmt.Fprintf(os.Stderr, "Unable to connect to database: %v\n", err)
        os.Exit(1)
    }
    defer conn.Close(context.Background())

    // Perform a query
    rows, err := conn.Query(context.Background(), "SELECT * FROM your_table")
    if err != nil {
        fmt.Fprintf(os.Stderr, "Query failed: %v\n", err)
        os.Exit(1)
    }
    defer rows.Close()

    // Iterate over the results
    for rows.Next() {
        var id int
        var name string
        err := rows.Scan(&id, &name)
        if err != nil {
            fmt.Fprintf(os.Stderr, "Scan failed: %v\n", err)
            os.Exit(1)
        }
        fmt.Printf("ID: %d, Name: %s\n", id, name)
    }
}
```

Make sure to replace `username`, `password`, and `your_table` with your actual credentials and table name in the connection string. This code snippet demonstrates how to connect to CrateDB, execute a query, and retrieve the results using the pgx driver in Go.

For further information and detailed documentation, you can refer to the [CrateDB Go Driver documentation](https://cratedb.com/connect/go).

In [18]:
# Ask how to build RAG search with CrateDB and OpenAI; render the answer as Markdown.
display(Markdown(chain.invoke("create RAG search with CrateDB and OpenAI?")))

To create a Retrieval Augmented Generation (RAG) search with CrateDB and OpenAI, you can leverage the vector store and vector similarity search capabilities of CrateDB along with the embedding algorithms provided by OpenAI. This combination allows for efficient data retrieval, similarity analysis, and content generation based on large language models (LLMs).

Here are the key steps to create a RAG search with CrateDB and OpenAI:
1. Utilize CrateDB as a vector store to efficiently manage diverse data types and ensure real-time data accessibility for continuous model training and prediction.
2. Implement the RAG approach based on CrateDB's vector store and the OpenAI embedding model to drive the content generation process.
3. Combine vector, full-text, and keyword searches in CrateDB to enhance semantic similarity and keyword matching, improving search precision and relevance.
4. Leverage the flexibility of CrateDB to handle vector data, eliminating the need for separate vector databases and enabling smoother scaling as data grows.
5. Integrate OpenAI's embedding algorithms with CrateDB to enhance the AI model's capabilities and optimize AI projects.
6. Implement advanced search capabilities in CrateDB, such as similarity search and flexible filtering, to find similarities across data represented as vectors and improve search precision for AI/ML use cases.

By following these steps and leveraging the capabilities of CrateDB and OpenAI, you can create a powerful RAG search system for enhanced data analysis, content generation, and AI applications.

Sources:
- [CrateDB Solutions: AI/ML Database](https://cratedb.com/solutions/ai-ml-database)
- [CrateDB Blog: Leverage Vector Search to Use Embeddings and Generative AI: Retrieval Augmented Generation (RAG) with CrateDB](https://cratedb.com/blog/leverage-vector-search-to-use-embeddings-and-generative-ai-retrieval-augmented-generation-rag-with-cratedb)
- [CrateDB Solutions: Vector Database](https://cratedb.com/solutions/vector-database)

In [19]:
# Ask a DDL question (adding a fulltext index via ALTER TABLE); render the answer as Markdown.
display(Markdown(chain.invoke("how to alter table and add fulltext index?")))

To alter a table and add a fulltext index, you can use the `ALTER TABLE` statement with the `ADD COLUMN` clause. Here is an example:

```sql
ALTER TABLE my_table ADD COLUMN my_column TEXT INDEX USING FULLTEXT;
```

This will add a fulltext index to the `my_column` in the `my_table` table. Make sure to replace `my_table` and `my_column` with your actual table and column names.

For more information, you can refer to the [CrateDB documentation on adding columns](https://cratedb.com/docs/crate/reference/en/5.6/general/ddl/alter-table.html) and [fulltext indices](https://cratedb.com/docs/crate/reference/en/5.6/general/ddl/fulltext-indices.html).

In [20]:
# Ask a DDL question (adding a vector column for KNN search); render the answer as Markdown.
display(Markdown(chain.invoke("how to alter table and add vector type field that allows for KNN search?")))

To alter a table and add a vector type field that allows for KNN search in CrateDB, you can follow these steps:

1. Create a table with the desired vector type field, for example:

```sql
CREATE TABLE IF NOT EXISTS my_table (
   vector_field FLOAT_VECTOR(2)
);
```

2. Use the `ALTER TABLE` command with the `ADD COLUMN` clause to add the vector type field to an existing table:

```sql
ALTER TABLE my_table ADD COLUMN new_vector_field FLOAT_VECTOR(2);
```

By following these steps, you can successfully alter a table and add a vector type field in CrateDB that allows for KNN search. For more information, you can refer to the [CrateDB documentation](https://cratedb.com/docs/crate/reference/en/5.6/general/ddl/alter-table.html).

In [21]:
# Ask for a CREATE TABLE statement with an indexed vector column; render the answer as Markdown.
display(Markdown(chain.invoke("create table with fields ID, name, vector, and index vector field for KNN search?")))

To create a table with fields ID, name, vector, and index the vector field for KNN search in CrateDB, you can follow the example below:

```sql
CREATE TABLE my_table (
  ID INTEGER PRIMARY KEY,
  name TEXT,
  vector FLOAT_VECTOR(2) INDEX using fulltext,
);
```

In this example, `my_table` is created with fields ID (integer primary key), name (text), and vector (float vector with 2 dimensions) indexed for KNN search using fulltext method.

For further information, you can refer to the [CrateDB documentation](https://cratedb.com/docs/crate/reference/en/5.6/general/ddl/data-types.html#float_vector).

In [22]:
# Broad question about product limits; render the returned string as Markdown.
display(Markdown(chain.invoke("What are limits and limitations of CrateDB?")))

The limitations of a single-node CRFEE plan of CrateDB include the lack of high speed, scalability, and high-availability features found in a standard CrateDB distributed cluster. However, users can easily create a new cluster with another plan from within the Cloud Console to overcome these limitations. For more information, you can refer to the source [here](https://cratedb.com/lp-crfree?hsCtaTracking=43b563de-8b00-42d1-b008-73ca8a3353a1|398e0b9d-de53-4207-9b91-a092772b42e3#main-content).

In [23]:
# Broad question about product benefits; render the returned string as Markdown.
display(Markdown(chain.invoke("What are the benefits of using CrateDB?")))

The benefits of using CrateDB include:
- High availability and fault tolerance due to built-in data replication and cluster rebalancing mechanisms. Source: [CrateDB Solutions for IoT](https://cratedb.com/solutions/iot-database)
- Reduced total cost of ownership (TCO) by delivering high performance, scalability, and flexibility. Source: [Data Consolidation Overview](https://cratedb.com/blog/what-is-data-consolidation-an-overview)
- Simplified data management, reduced development time, and seamless integrations with various data types like time series, geospatial, JSON, and full-text search. Source: [Best Vector Database for Your Business](https://cratedb.com/blog/the-best-vector-database-for-your-business)
- Hyper-fast query response time in milliseconds for efficient data processing and analysis. Source: [Query Performance Features](https://cratedb.com/product/features/query-performance)
- Scalability for handling large volumes of data and real-time analytics, making it ideal for modern data-intensive applications. Source: [Time Series Visualization in CrateDB](https://cratedb.com/blog/introduction-to-time-series-visualization-in-cratedb-and-explo)

These benefits make CrateDB a powerful distributed database technology suitable for various use cases.

In [24]:
# Deliberately short question that does not name the product, so retrieval works from the wording alone.
display(Markdown(chain.invoke("What are technical limitations?")))

Technical limitations include the inability to directly nest an array within an array and the restriction of certain column definitions in CrateDB. These limitations are important to be aware of when working with the database. The details can be found [here](https://cratedb.com/docs/crate/reference/en/5.6/general/dql/selects.html#sql-dql-objects).

In [25]:
# Yes/no question about index creation and writes; render the returned string as Markdown.
display(Markdown(chain.invoke("Does index creation block write operations?")))

No, index creation does not block write operations in CrateDB. Index creation is a separate process that does not interfere with write operations. However, during maintenance operations, you might want to temporarily disable writes using the `blocks.write` setting, which can be manually reset after the maintenance operation has been completed. You can find more information about this in the [CrateDB documentation](https://cratedb.com/docs/crate/reference/en/5.6/sql/statements/create-table.html).

In [26]:
# Yes/no question about conditional indices; render the returned string as Markdown.
display(Markdown(chain.invoke("Does crate supports conditional indices")))

Yes, CrateDB automatically generates indexes for all attributes regardless of their depth, enabling rapid search capabilities for stored objects and facilitating efficient updates. This feature allows for fast query responses for any query type. More information can be found [here](https://cratedb.com/solutions/json-database).

In [27]:
# Ask how to define an auto-incremented ID column; render the returned string as Markdown.
display(Markdown(chain.invoke("How to create ID field that is autoincremented?")))

To create an autoincremented ID field in CrateDB, you can define the field as an integer primary key with the AUTO_GENERATED option. Here is an example of how to do it:

```sql
CREATE TABLE my_table (
    id INTEGER PRIMARY KEY AUTO_GENERATED,
    name TEXT,
    description TEXT
);
```

This will automatically generate unique integer values for the `id` field whenever a new record is inserted into the table. You can find more information about creating tables with autoincremented fields in CrateDB in the [official documentation](https://cratedb.com/docs/crate/reference/en/5.6/general/ddl/constraints.html#auto-generate-constraint).

In [28]:
# Ask how to create fulltext analyzers; render the returned string as Markdown.
display(Markdown(chain.invoke("how to create analysers for fulltext search?")))

To create analyzers for fulltext search, you can follow these steps:

1. Define the analyzer with the desired tokenizer, token filters, and char filters. An analyzer consists of one tokenizer, zero or more token filters, and zero or more char filters. Here is a [simple example](https://cratedb.com/docs/crate/reference/en/5.6/general/ddl/fulltext-indices.html) of creating an analyzer.

2. You can use built-in analyzers like `standard`, `whitespace`, `language`, etc., or create custom analyzers by combining different tokenizers, token filters, and char filters based on your requirements.

3. Fulltext indices are created with analyzers to analyze the data and split it into tokens for efficient fulltext search. Make sure to define a fulltext index with an analyzer for the relevant columns in your table.

By following these steps, you can create and customize analyzers for fulltext search in CrateDB.

In [29]:
# Keyword-style query ("password", "admin") rather than a full question; render the answer as Markdown.
display(Markdown(chain.invoke("give me information about password and admin")))

To find information about password authentication methods and admin-related topics in CrateDB, you can refer to the following sources:

- **Password Authentication Method**: CrateDB uses the password authentication method for controlling access via the CrateDB REST and PostgreSQL wire protocol interfaces, the CrateDB Admin UI, and command-line tools. This method requires clients to provide a password in addition to the username. You can read more about this authentication method [here](https://cratedb.com/docs/crate/reference/en/5.6/admin/auth/index.html).

- **Admin-related Information**: CrateDB allows for authenticating as a superuser when the cluster is started, with one predefined superuser called 'crate'. For more details on managing users, roles, and authentication as a superuser, you can refer to this [link](https://cratedb.com/docs/crate/reference/en/5.6/admin/auth/hba.html).

These sources provide detailed information on password authentication methods and administrative tasks in CrateDB.

In [30]:
# Phrase-style query with no question form; render the returned string as Markdown.
display(Markdown(chain.invoke("Shared file system implementation of the BlobStoreRepository")))

The shared file system implementation of the BlobStoreRepository in CrateDB is called `fs`. It is the default file system implementation that selects the best implementation based on the operating environment, currently `hybridfs` on all supported systems. Other file system implementations include `niofs` and `mmapfs`. You can find more information about these file system implementations in the CrateDB documentation [here](https://cratedb.com/docs/crate/reference/en/5.6/sql/statements/create-table.html).

In [31]:
# Yes/no question about Cloud UI licensing; render the returned string as Markdown.
display(Markdown(chain.invoke("Is Cloud UI opensource?")))

Yes, Cloud UI is open source. You can find more information about it [here](https://cratedb.com/blog/introducing-scheduled-jobs-and-cloud-query-console).