In [1]:
%pip install --upgrade --quiet pip setuptools wheel
%pip install --upgrade --quiet  langchain langchain-openai faiss-cpu tiktoken crate 'crate[sqlalchemy]' pandas jq 
%pip install --use-pep517 --quiet python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Use CrateDB as fulltext search retriever and Mistral-7B as language model

## Setup environment variables

In [2]:
import os

from dotenv import load_dotenv

# Load settings from a local `.env` file into the process environment.
# NOTE(review): presumably this is where the CRATEDB_* values read below (and
# the OpenAI credentials) come from — confirm.
load_dotenv()

True

## Setup embeddings

In [3]:
from langchain_openai import OpenAIEmbeddings

# Embedding model; it is handed to the vector store when the documents are indexed.
embeddings = OpenAIEmbeddings()
# Sanity check: embed a one-character query and show the length of the returned vector.
len(embeddings.embed_query("a"))

1536

In [4]:
from urllib.parse import quote

# Build the CrateDB connection URL from environment variables (a missing
# variable raises KeyError).
# User name and password are percent-encoded so that characters such as "@",
# ":" or "/" in the credentials cannot corrupt the URL structure. Values made
# of unreserved characters only (and the empty string) are left unchanged.
conn_url = "crate://{user}:{password}@{server}".format(
    user=quote(os.environ["CRATEDB_USER"], safe=""),
    password=quote(os.environ["CRATEDB_PASS"], safe=""),
    server=os.environ["CRATEDB_SERVER"],
)
# NOTE(review): echoing the URL writes the password into the saved cell output;
# clear the output before sharing the notebook.
conn_url

'crate://crate:@localhost:4201'

In [5]:
# open file
from langchain_community.document_loaders import JSONLoader, DirectoryLoader


def metadata_func(record: dict, metadata: dict) -> dict:
    """Enrich loader metadata with the record's URL and title.

    ``source_url`` and ``source_title`` are always written (``None`` when the
    record lacks ``url`` / ``title``). An existing ``source`` entry is replaced
    by the record URL; a missing one is not created. The ``metadata`` dict is
    updated in place and returned.
    """
    url = record.get("url")
    metadata.update(source_url=url, source_title=record.get("title"))

    if "source" not in metadata:
        return metadata

    metadata["source"] = url
    return metadata


# Load every `everything-*.json` file found in the current directory.
# Each file is read with JSONLoader: the jq expression `.[]` selects the
# records, the record's "html" field is used as the document content, and
# `metadata_func` copies the record's URL and title into the metadata.
loader = DirectoryLoader(
    './',
    glob="everything-*.json",
    loader_cls=JSONLoader,
    loader_kwargs={
        "jq_schema": ".[]",
        "text_content": False,
        "content_key": "html",
        "metadata_func": metadata_func,
    }
)

# Read all matching files into memory as a list of documents.
data = loader.load()
# data[:1]

In [6]:
# split documents
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks with a target size of 500 characters
# (measured with `len`) and a 50-character overlap between neighbouring chunks.
# NOTE(review): " " is listed before "." and ","; presumably separators are
# tried in the listed order, so whitespace wins over sentence punctuation —
# confirm this ordering is intended.
text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        "\n\n",
        "\n",
        " ",
        ".",
        ",",
    ],
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

docs_splits = text_splitter.split_documents(data)
# Show how many chunks will be embedded and indexed.
len(docs_splits)
# docs_splits[:2]

60441

## RAG search, indexing pipeline

In [7]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [8]:
import time
from rag.vectorstore.crate import CrateVectorStore

# Embed all chunks and build the CrateDB-backed vector store, timing the run.
# NOTE(review): `drop_if_exists` is presumably making CrateVectorStore drop a
# previously indexed table, so every run pays the full indexing cost — confirm.
# perf_counter() is used instead of time.time() because it is meant for
# measuring elapsed time and is not affected by system clock adjustments
# during this long-running step.
start_time = time.perf_counter()
vectorstore = CrateVectorStore.from_documents(
    documents=docs_splits,
    embedding=embeddings,
    database_kwargs={
        "database_uri": conn_url,
    },
    vectorstore_kwargs={
        "drop_if_exists": True,
    },
)

end_time = time.perf_counter()
execution_time = end_time - start_time  # seconds
print(f"Execution time: {execution_time}")

vectorstore

Execution time: 419.7913148403168


<rag.vectorstore.crate.CrateVectorStore at 0x10d42c850>

In [9]:
# Expose the vector store as a retriever; `search_kwargs` is passed as given.
# NOTE(review): "algorith" looks like a typo for "algorithm" — check which key
# CrateVectorStore actually reads before renaming it.
retriever = vectorstore.as_retriever(
    search_kwargs={'k': 10, 'fetch_k': 100, "algorith": "knn"}
)

In [10]:
import json

In [11]:
# Prompt for the answering step: the model is told to answer only from the
# supplied context and to reference sources with markdown links.
# NOTE(review): the prompt hard-codes "today" as 2024 April 3rd, so
# date-relative questions (e.g. "recent blog posts") drift as the notebook ages.
template = """Answer the question based only on the following context, if possible use links inside answer to reference the source, use markdown:

today date is 2024 April 3rd

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# Chat model created with the library defaults (no model name or temperature is given).
model = ChatOpenAI()


def format_docs(docs):
    """Serialize retrieved documents into the JSON string used as prompt context.

    Args:
        docs: Iterable of document-like objects exposing ``page_content`` and
            a ``metadata`` mapping.

    Returns:
        A JSON array (as ``str``) with one ``{"text": ..., "source": ...}``
        object per document, in input order. ``source`` is ``null`` when the
        metadata has no ``"source"`` entry.
    """
    # No debugger hook here: this function runs inside every chain invocation,
    # and a leftover breakpoint would halt (or abort) each query.
    return json.dumps(
        [
            {"text": doc.page_content, "source": doc.metadata.get("source")}
            for doc in docs
        ]
    )


# Compose the RAG pipeline: the input question goes to the retriever and the
# hits are serialized by `format_docs` into {context}; the same question is
# passed through unchanged as {question}; the filled prompt is sent to the chat
# model and the reply is reduced to a plain string.
chain = (
        {"context": retriever | format_docs,
         "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
)

# Alternative sample questions are kept commented out; only one is run here.
# result = chain.invoke("How to limit permissions?")
# result = chain.invoke(" How AWS marketplace works, and why I cannot see deployment in my account?")
# result = chain.invoke("What are edge regions and how to use them?")
result = chain.invoke("Write me example of using blobs?")
# result = chain.invoke("How to use BLOB store in CrateDB? and what are the benefits?")
result


'To use blobs in CrateDB, you first need to create a blob table using the Crate Shell. Here is an example of creating a blob table with the name "myblobs" clustered into 3 shards:\n\n```sh\ncrash -c "create blob table myblobs clustered into 3 shards with (number_of_replicas=1)"\n```\n\nAfter creating the blob table, you can upload a blob by issuing a PUT request. For example, you can use curl to upload a blob with the contents "contents" to the blob table:\n\n```sh\ncurl -isSX PUT \'127.0.0.1:4200/_blobs/myblobs/4a756ca07e9487f482465a99e8286abc86ba4dc7\' -d \'contents\' HTTP/1.1\n```\n\nTo list all blobs inside a blob table, you can use a SELECT statement. Additionally, you can delete a blob from a blob table by issuing a DELETE request:\n\n```sh\ncurl -isS -XDELETE \'127.0.0.1:4200/_blobs/myblobs/4a756ca07e9487f482465a99e8286abc86ba4dc7\' HTTP/1.1\n```\n\nYou can also delete the entire blob table by using the following command in the Crate Shell:\n\n```sh\ncrash -c "drop blob table my

In [12]:
from IPython.display import display, Markdown

# Render the answer as Markdown instead of showing the raw string repr.
display(Markdown(result))

To use blobs in CrateDB, you first need to create a blob table using the Crate Shell. Here is an example of creating a blob table with the name "myblobs" clustered into 3 shards:

```sh
crash -c "create blob table myblobs clustered into 3 shards with (number_of_replicas=1)"
```

After creating the blob table, you can upload a blob by issuing a PUT request. For example, you can use curl to upload a blob with the contents "contents" to the blob table:

```sh
curl -isSX PUT '127.0.0.1:4200/_blobs/myblobs/4a756ca07e9487f482465a99e8286abc86ba4dc7' -d 'contents' HTTP/1.1
```

To list all blobs inside a blob table, you can use a SELECT statement. Additionally, you can delete a blob from a blob table by issuing a DELETE request:

```sh
curl -isS -XDELETE '127.0.0.1:4200/_blobs/myblobs/4a756ca07e9487f482465a99e8286abc86ba4dc7' HTTP/1.1
```

You can also delete the entire blob table by using the following command in the Crate Shell:

```sh
crash -c "drop blob table myblobs"
```

These examples showcase the basic operations for using blobs in CrateDB. For more information, you can refer to the official documentation on [using Crate as a blobstore](https://cratedb.com/blog/using-crate-as-a-blobstore).

In [13]:
# Sample question: runs the whole chain (retrieval + model call) and renders
# the answer as Markdown. The cells that follow repeat this pattern with
# different questions.
display(Markdown(chain.invoke("What are edge regions and how to use them?")))

Edge regions are custom regions created in the CrateDB Cloud Console for hosting database infrastructure locally without relying on existing cloud providers. These regions are useful for deploying CrateDB Cloud on Kubernetes clusters and ensuring reliable service even in locations with bad connectivity.

To create and use edge regions, sign up or log into the CrateDB Cloud Console, go to the Regions tab in the Subscription overview, and click on "Create Edge region." Once the region appears in the list, copy the script provided into your CLI to install CrateDB Edge on the correct cluster. Follow the prompts to install prerequisite tools as needed and configure necessary storage classes.

For more information, you can refer to the sources [here](https://cratedb.com/docs/cloud/en/latest/tutorials/edge/introduction.html#edge-disclaimer) and [here](https://cratedb.com/docs/cloud/en/latest/tutorials/edge/managed-kubernetes.html#edge-providers).

In [14]:
display(Markdown(chain.invoke("How AWS marketplace works, and why I cannot see deployment in my account?")))

To deploy a cluster on CrateDB Cloud through AWS Marketplace, you need to sign up via the AWS Marketplace and have an AWS Marketplace account. The hourly usage is billed by Amazon, not by Crate.io. It can take up to 10 minutes for the subscription to be confirmed and usable in the CrateDB Cloud console after signing up on AWS Marketplace. 

If you cannot see the deployment in your account, it may be because the subscription process is still pending confirmation. You can check the status of your subscription by logging into your AWS Marketplace account and ensuring that you are using the correct account that was used to subscribe to the offer. Once the subscription is confirmed, you should be able to view and manage your deployment in the CrateDB Cloud Console.

For more information, you can refer to the documentation on [AWS Marketplace deployment with CrateDB Cloud](https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/marketplace/subscribe-aws.html).

In [15]:
display(Markdown(chain.invoke("What are recent blog posts about CrateDB?")))

Recent blog posts about CrateDB include:
- "Distributed query execution in CrateDB: What you need to know" published on July 20, 2022. [Source](https://cratedb.com/blog/distributed-query-execution-in-cratedb-what-you-need-to-know)
- "CrateDB v.5.5 release with the vector store and similarity search functionality" [Source](https://cratedb.com/blog/newsletter-lets-welcome-the-new-year-with-the-highlights-of-2023-%F0%9F%8E%89)
- "CrateDB v5.3 stable release available" [Source](https://cratedb.com/blog/newsletter-lets-welcome-may)

In [16]:
display(Markdown(chain.invoke("Write me example python code to use CrateDB?")))

To use CrateDB with Python, you can refer to the official CrateDB Python client documentation [here](https://cratedb.com/connect/python). The documentation provides detailed examples on how to interact with CrateDB using Python.

You can also explore the different kinds of examples on how to use the CrateDB Python client in the documentation [here](https://cratedb.com/docs/python/en/latest/by-example/index.html#by-example). This includes examples for DB API, HTTP, and BLOB interfaces.

Additionally, you can find executable code examples and sample applications in the cratedb-examples repository [here](https://cratedb.com/docs/python/en/latest/), which demonstrate the use of the driver for various applications.

If you are interested in using CrateDB with pandas, there are corresponding code snippets on how to connect to CrateDB using pandas and how to load and export data available in the documentation [here](https://cratedb.com/docs/python/en/latest/).

Overall, the official CrateDB documentation provides a comprehensive guide on using CrateDB with Python, including example code snippets and sample applications.

In [17]:
display(Markdown(chain.invoke("Write me example golang code to use CrateDB?")))

To connect to CrateDB using Go, you can utilize the pgx driver. Here's an example of Go code that connects to CrateDB:

```go
package main

import (
    "context"
    "fmt"
    "github.com/jackc/pgx/v5"
)

func main() {
    // Establish connection to CrateDB
    conn, err := pgx.Connect(context.Background(), "postgresql://user:password@localhost:5432/database")
    if err != nil {
        fmt.Fprintf(os.Stderr, "Unable to connect to database: %v\n", err)
        os.Exit(1)
    }
    defer conn.Close(context.Background())

    // Perform database operations (e.g., query, insert, update)
}
```

You can find more information on connecting to CrateDB with Go [here](https://cratedb.com/connect/go).

In [18]:
display(Markdown(chain.invoke("create RAG search with CrateDB and OpenAI?")))

To create a RAG search with CrateDB and OpenAI, you can leverage the Vector Search capabilities in CrateDB. The process involves using CrateDB as a vector store and integrating it with the OpenAI embedding model. This approach allows for efficient data storage of structured and unstructured data, as well as vector data generated by popular embedding algorithms.

To understand the key components and tools required for building a RAG system with CrateDB and OpenAI, you can refer to the following resources:

1. **Introduction to RAG workflow with CrateDB**: This blog post provides a high-level overview of the RAG workflow with CrateDB, including steps such as identifying key data sets, building a knowledge-based index, and fetching relevant documents from the vector store based on a search algorithm. You can access the post [here](https://cratedb.com/blog/leverage-vector-search-to-use-embeddings-and-generative-ai-retrieval-augmented-generation-rag-with-cratedb).

2. **Understanding the role of vector store and vector similarity search**: This post explains the importance of using a vector store and vector similarity search in the context of RAG search with CrateDB and OpenAI. You can find more information [here](https://cratedb.com/blog/leverage-vector-search-to-use-embeddings-and-generative-ai-retrieval-augmented-generation-rag-with-cratedb).

By following these resources, you can gain insights into how to create a RAG search system with CrateDB and OpenAI, leveraging the power of vector search and embedding algorithms.

In [19]:
display(Markdown(chain.invoke("how to alter table and add fulltext index?")))

To alter a table and add a fulltext index, you can use the `ALTER TABLE` command in CrateDB. 

First, you need to create the table with the desired columns. Then, you can alter the table and add a fulltext index by using the `ALTER TABLE` command with the `ADD INDEX` clause. Here is an example syntax:

```sql
ALTER TABLE table_name ADD INDEX index_name USING fulltext(column_name) WITH (analyzer = 'english');
```

Replace `table_name` with the name of your table, `index_name` with the name you want to give to the fulltext index, and `column_name` with the name of the column you want to create the fulltext index on.

For more information, you can refer to the official CrateDB documentation on [fulltext indices](https://cratedb.com/docs/crate/reference/en/5.6/general/ddl/fulltext-indices.html).

This command will allow you to alter the table and add a fulltext index to it in CrateDB.

In [20]:
display(Markdown(chain.invoke("how to alter table and add vector type field that allows for KNN search?")))

To alter a table and add a vector type field that allows for KNN search in CrateDB, you can follow these steps:

1. Use the `ALTER TABLE` command to modify the existing table schema.
2. Add a new column with the `FLOAT_VECTOR` data type to store dense vectors of float values of fixed length.
3. Ensure that the vector dimension does not exceed 2048, as this is the limit supported by CrateDB for vector fields.
4. After adding the vector type field, you can perform KNN search queries against the vectors stored in the table.

You can refer to the official CrateDB documentation for more details on creating tables with vector fields and performing KNN search queries:
- [CrateDB Documentation on Data Types](https://cratedb.com/docs/crate/reference/en/master/general/ddl/data-types.html)
- [CrateDB Blog on Vector Store Functionality](https://cratedb.com/blog/cratedb-v5.5-vector-store)

In [21]:
display(Markdown(chain.invoke("create table with fields ID, name, vector, and index vector field for KNN search?")))

To create a table with fields ID, name, vector, and index the vector field for KNN search in CrateDB, you can use the following SQL commands:

```sql
CREATE TABLE my_table (
  ID INTEGER PRIMARY KEY,
  name TEXT,
  vector FLOAT_VECTOR(2) INDEX using fulltext with (analyzer = 'english')
);
```

This SQL statement creates a table called `my_table` with fields `ID` as an integer primary key, `name` as text, and `vector` as a float vector with a full-text index for KNN search.

You can refer to the source [here](https://cratedb.com/solutions/multi-model-database) for more information on creating tables with different data types in CrateDB.

In [22]:
display(Markdown(chain.invoke("What are limits and limitations of CrateDB?")))

The limitations of CrateDB include the fact that the single-node CRFEE plan does not offer capabilities such as high speed, scalability, and high availability that you would get from a standard CrateDB distributed cluster. However, you can easily create a new cluster with another plan from within the Cloud Console. You can find more information about these limitations on the [CrateDB website](https://cratedb.com/lp-crfree).

In [23]:
display(Markdown(chain.invoke("What are the benefits of using CrateDB?")))

The benefits of using CrateDB include:

- High performance, scalability, and flexibility
- Horizontal scalability, allowing for the addition of nodes as needed
- Support for structured, semi-structured, and unstructured schemas
- Multi-platform support
- Simplified data infrastructure and overcoming challenges of complexity and technical debt
- Native SQL query language for data querying and manipulation
- Efficient management of extensive concurrent reads and writes
- Reduction of total cost of ownership (TCO)
- Fully distributed query engine and columnar storage for immediate data availability, ad-hoc queries, hyper-fast aggregations, and in-memory SQL query performance
- Query response time in milliseconds for processing and analyzing data efficiently
- Support for dynamic schemas, queryable objects, time-series data, and real-time full-text search
- Powerful REST API for managing and accessing CrateDB

Sources:
- [CrateDB Customers - SPGO](https://cratedb.com/customers/spgo)
- [CrateDB Customers - ABB](https://cratedb.com/customers/abb)
- [CrateDB Blog - Data Challenges ML AI](https://cratedb.com/blog/data-challenges-ml-ai)
- [CrateDB Solutions - Database Consolidation](https://cratedb.com/solutions/database-consolidation)
- [CrateDB Product Features - Distributed Database](https://cratedb.com/product/features/distributed-database)
- [CrateDB Blog - What is Data Consolidation](https://cratedb.com/blog/what-is-data-consolidation-an-overview)
- [CrateDB Product Features - Query Performance](https://cratedb.com/product/features/query-performance)
- [CrateDB Blog - Automating Export of CrateDB Data to S3](https://cratedb.com/blog/automating-export-of-cratedb-data-to-s3-using-apache-airflow)
- [CrateDB Product Features - REST API](https://cratedb.com/product/features/rest-api)

In [24]:
display(Markdown(chain.invoke("What are technical limitations?")))

Based on the provided context, technical limitations in the context of Industrial IoT (IIoT) projects include the challenges related to data requirements such as high volumes, real-time responses, and industry-specific tools and characteristics. Additionally, traditional data management solutions may not be able to handle the massive scale of sensor data produced in an industrial environment, leading to scalability issues and rising costs. Furthermore, adopting new technologies like Industry 4.0 with existing data technologies may result in poor results and unrealistic expectations. These technical limitations can hinder the successful implementation of digital transformation initiatives in industrial settings.

Sources:
- [Why IIoT projects are failing](https://cratedb.com/blog/why-iiot-projects-are-failing)
- [Digital transformation on the factory floor](https://cratedb.com/blog/digital-transformation-factory-floor)
- [Data historians vs. time series databases](https://cratedb.com/blog/data-historians-vs-time-series-databases)

In [25]:
display(Markdown(chain.invoke("Does index creation block write operations?")))

Based on the provided context, setting `write.wait_for_active_shards` to 2 and stopping a node would indeed block write operations until the replica is fully replicated again or until a timeout occurs if the replication is not fast enough. This can be referenced in the CrateDB documentation on [index creation](https://cratedb.com/docs/crate/reference/en/5.6/sql/statements/create-table.html#translog-durability).

In [26]:
display(Markdown(chain.invoke("Does crate supports conditional indices")))

Based on the provided context, CrateDB does not explicitly mention support for conditional indices. The focus seems to be on automatic indexing of all attributes by default using strategies like Inverted Index for text values, Block k-d trees for numeric, date, and geospatial values, and Hierarchical Navigable Small World (HNSW) graphs for high dimensional vectors. For more information on CrateDB's indexing capabilities, you can refer to their official website: [CrateDB Features](https://cratedb.com/product/features/data-storage).

In [27]:
display(Markdown(chain.invoke("How to create ID field that is autoincremented?")))

To create an autoincremented ID field in CrateDB, you cannot use an auto-generated primary key. Instead, you need to specify the primary key when inserting data, otherwise an error will be returned. 

Defining multiple columns with a primary key constraint is supported, for example:

```sql
CREATE TABLE my_table1pk (
   first_column integer primary key,
   second_column text primary key,
   third_column text
);
```

For more information, you can refer to the official CrateDB documentation on [constraints](https://cratedb.com/docs/crate/reference/en/5.6/general/ddl/constraints.html).

In [28]:
display(Markdown(chain.invoke("how to create analysers for fulltext search?")))

To create analyzers for fulltext search in CrateDB, you can use language-specific analyzers, tokenizers, and token-filters to get accurate search results for data provided in a certain language. Analyzers provide fine-grained control over building a token stream for fulltext search. You can refer to the official CrateDB documentation for more information and examples on creating custom analyzers or extending built-in analyzers. Additionally, you can check out the following resources:

1. [Fulltext Indices](https://cratedb.com/docs/crate/reference/en/master/general/ddl/fulltext-indices.html) for examples on creating tables utilizing analyzers.
2. [Create a Custom Analyzer](https://cratedb.com/docs/crate/reference/en/3.3/general/ddl/analyzers.html) for a guide on creating a custom analyzer.
3. [CREATE ANALYZER](https://cratedb.com/docs/crate/reference/en/3.3/general/ddl/analyzers.html) for syntax reference.
4. Built-in Analyzers like 'standard' which utilizes the Standard Tokenizer Tokenizer with standard Token Filter, lowercase Token Filter, and stop Token Filter.

By following these guidelines and references, you can effectively create analyzers for fulltext search in CrateDB.

In [29]:
display(Markdown(chain.invoke("give me information about password and admin")))

Based on the provided context, the password authentication method is used in CrateDB for accessing the Admin UI. When using this method, the client must provide a password in addition to the username. This password is associated with a 24-character password defined or auto-generated earlier on. If you forget the credentials, you can retrieve them by accessing the "Settings" section under your cluster name in the CrateDB Admin UI. 

For more information about password authentication and admin privileges in CrateDB, you can refer to the following sources:
- [CrateDB Documentation - Password Authentication Method](https://cratedb.com/docs/crate/reference/en/5.6/admin/auth/methods.html)
- [CrateDB Documentation - Admin UI](https://cratedb.com/docs/guide/admin/create-user.html)

In [30]:
display(Markdown(chain.invoke("Shared file system implementation of the BlobStoreRepository")))

The shared file system implementation of the BlobStoreRepository allows for defining a custom directory path for storing blob data, which can be different from the normal data path. This is useful for scenarios where normal data is stored on a fast SSD, while blob data is stored on a large, cheap spinning disk. This implementation simplifies administration work and ensures consistency in replication and sharding rules for both blob data and other types of data in the datastore. Crate provides BLOB storage for persistently storing and retrieving BLOBs, such as pictures, videos, or large unstructured files, in a fully distributed cluster solution. For more information, you can refer to the official Crate documentation on blob storage [here](https://cratedb.com/docs/crate/reference/en/5.6/general/blobs.html).

In [31]:
display(Markdown(chain.invoke("Is Cloud UI opensource?")))

Based on the provided context, the Cloud UI mentioned in the information is not open-source. The blog post mentions that the company has transitioned towards a cloud-based business model without abandoning open-source values, but it does not specify that the Cloud UI itself is open-source. You can find more information about the Cloud Query Console and other features on the CrateDB website [here](https://cratedb.com/blog/introducing-scheduled-jobs-and-cloud-query-console).

In [32]:
display(Markdown(chain.invoke("How to do fusion search and connect vector search with fulltext search")))

To perform fusion search and connect vector search with full-text search, you can leverage the advanced search capabilities offered by CrateDB. CrateDB allows users to combine vector, full-text, and keyword searches for improved semantic similarity and keyword matching, enhancing search precision and relevance. This fusion search approach enhances the search precision by combining the strengths of both vector and full-text search techniques, enabling efficient exploration and analysis of data.

You can learn more about vector similarity search and its benefits in improving accuracy and efficiency in various applications by reading this article: [Vector Similarity Search](https://cratedb.com/blog/vector-similarity-search).

Additionally, CrateDB's support for vector data types provides context aligned with your data and enhances explainability. By combining vector, full-text, and keyword searches, CrateDB enables users to perform fusion search effectively, improving the overall search experience.

For further information on open-source vector databases and their role in AI-powered applications, you can refer to this article: [Open Source Vector Database](https://cratedb.com/blog/open-source-vector-database).

In [33]:
# NOTE(review): "MATH" is presumably a typo for the MATCH predicate (the stored
# answer talks about MATCH); if this is a deliberate typo-robustness probe,
# say so in a markdown cell.
display(Markdown(chain.invoke("How to MATH fulltext ")))

To perform a full-text search in a database, you need to create a full-text index with an analyzer for the specific column you want to search. Different types of full-text indices with various goals exist, but it's not possible to query multiple index columns with different index types within the same MATCH predicate. 

For detailed information on setting up your database for full-text search, creating the relevant indices, and efficiently querying your text data, you can refer to the documentation provided by CrateDB:
- [Full-Text Search Guide](https://cratedb.com/docs/guide/domain/search/index.html#fts)
- [Using Full-text Search Tutorial](https://cratedb.com/docs/cloud/en/latest/tutorials/full-text.html)

These resources will guide you through the process of setting up and querying full-text data effectively.

In [34]:
# NOTE(review): "caucus" is presumably meant to be "cautious" (the stored
# answer reads it that way); if the misspelling is a deliberate robustness
# probe, say so in a markdown cell.
display(Markdown(chain.invoke("things to be caucus, results in failures, errors working with CrateDB ")))

Working with CrateDB, there can be various issues that may result in failures or errors due to the distributed nature of the system. These issues can include network failures, disk failures, unexpected termination of hosts, and other unpredictable events. CrateDB has mechanisms in place to cope with these issues and maintain availability, consistency, and durability, but occasionally, things can still go wrong. It is important to be cautious of potential non-deterministic test failures, especially in a distributed environment like CrateDB. For more information on working with CrateDB and handling potential failures, you can refer to the [official documentation on resiliency](https://cratedb.com/docs/crate/reference/en/master/concepts/resiliency.html).