In [1]:
# Install the notebook's dependencies into the kernel's own environment.
# langchain-community and langchain-text-splitters are imported directly by
# later cells, so they are installed explicitly rather than relying on
# `langchain` to pull them in (langchain >= 0.2 no longer depends on
# langchain-community).
%pip install --upgrade --quiet pip setuptools wheel
%pip install --upgrade --quiet langchain langchain-community langchain-text-splitters langchain-openai faiss-cpu tiktoken crate 'crate[sqlalchemy]' pandas jq
%pip install --use-pep517 --quiet python-dotenv

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


# Use CrateDB as fulltext search retriever and Mistral-7B as language model

## Set up environment variables

In [2]:
import os

from dotenv import load_dotenv

# Load variables from a local .env file into os.environ; the CRATEDB_* settings
# read further down are expected to come from there or from the shell environment.
load_dotenv()

True

## Set up embeddings

In [3]:
from langchain_openai import OpenAIEmbeddings

embeddings = OpenAIEmbeddings()
# Embed a one-character probe string and show the length of the returned
# vector, i.e. the embedding dimensionality.
len(embeddings.embed_query("a"))

1536

In [4]:
# Build the SQLAlchemy connection URL for CrateDB from environment variables.
# User name and password are percent-encoded so that characters such as "@",
# ":" or "/" inside the credentials cannot corrupt the URL structure.
# NOTE: the cell output shows the full URL including the password; clear the
# output before sharing the notebook.
from urllib.parse import quote

conn_url = "crate://{user}:{password}@{server}".format(
    user=quote(os.environ["CRATEDB_USER"], safe=""),
    password=quote(os.environ["CRATEDB_PASS"], safe=""),
    server=os.environ["CRATEDB_SERVER"],
)
conn_url

'crate://crate:@localhost:4201'

In [5]:
# open file
from langchain_community.document_loaders import JSONLoader, DirectoryLoader


def metadata_func(record: dict, metadata: dict) -> dict:
    """Enrich a document's metadata with the URL and title of its JSON record.

    Args:
        record: One JSON record; its ``url`` and ``title`` keys are read
            (missing keys yield ``None``).
        metadata: Metadata dict prepared by the loader; updated in place.

    Returns:
        The same ``metadata`` dict with ``source_url`` and ``source_title``
        set. An existing ``source`` entry is replaced by the record URL, but
        only when the record actually provides one, so the original value is
        never overwritten with ``None``.
    """
    source_url = record.get("url")
    metadata["source_url"] = source_url
    metadata["source_title"] = record.get("title")

    # Point "source" at the record URL so answers can link to it; keep the
    # existing value as a fallback when the record has no URL.
    if "source" in metadata and source_url:
        metadata["source"] = source_url

    return metadata


# Load every "everything-*.json" file from the current directory. Each file is
# split into records with the jq expression ".[]"; a record's "html" field
# becomes the document content and metadata_func fills in its metadata.
loader = DirectoryLoader(
    "./",
    glob="everything-*.json",
    loader_cls=JSONLoader,
    loader_kwargs=dict(
        jq_schema=".[]",
        text_content=False,
        content_key="html",
        metadata_func=metadata_func,
    ),
)

data = loader.load()
# data[:1]

In [6]:
# split documents
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Cut the loaded documents into chunks of at most 500 characters (measured
# with len) that overlap by 50 characters. Separators are plain strings, not
# regular expressions.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", " ", ".", ","],
    chunk_size=500,
    chunk_overlap=50,
    length_function=len,
    is_separator_regex=False,
)

docs_splits = text_splitter.split_documents(data)
# docs_splits[:2]

## RAG search, indexing pipeline

In [7]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_openai import ChatOpenAI

In [8]:
from langchain_community.utilities.sql_database import SQLDatabase
from rag.vectorstore.crate import CrateVectorStore

# Embed all chunks and store them in CrateDB through the project's
# CrateVectorStore.
# NOTE(review): "drop_if_exists" presumably removes an existing table before
# indexing, so every run would rebuild the index from scratch — confirm
# against rag.vectorstore.crate.
vectorstore = CrateVectorStore.from_documents(
    documents=docs_splits,
    embedding=embeddings,
    database_kwargs=dict(database_uri=conn_url),
    vectorstore_kwargs=dict(drop_if_exists=True),
)
vectorstore

<rag.vectorstore.crate.CrateVectorStore at 0x10b831550>

In [9]:
# Wrap the vector store as a retriever; search_kwargs are passed along with
# each search.
# NOTE(review): the key "algorith" looks like a typo for "algorithm" — check
# which name CrateVectorStore actually reads before changing it.
retriever = vectorstore.as_retriever(
    search_kwargs={'k': 10, 'fetch_k': 100, "algorith": "knn"}
)

In [10]:
import json

In [11]:
# Prompt for the RAG chain: {context} receives the retrieved passages and
# {question} the user's question.
# NOTE: the date inside the prompt is hard-coded and will go stale.
template = """Answer the question based only on the following context, if possible use links inside answer to reference the source, use markdown:

today date is 2024 April 3rd

{context}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)
# ChatOpenAI() is created without arguments, i.e. with the library defaults.
model = ChatOpenAI()


def format_docs(docs):
    """Serialize retrieved documents into a JSON string for the prompt context.

    Args:
        docs: Iterable of documents exposing ``page_content`` and a
            ``metadata`` mapping.

    Returns:
        A JSON array string with one ``{"text", "source"}`` object per
        document; ``source`` is ``null`` when the metadata has no such key.
    """
    return json.dumps(
        [{"text": doc.page_content, "source": doc.metadata.get("source")} for doc in docs]
    )


# Assemble the RAG pipeline: the incoming question goes to the retriever,
# whose documents are serialized by format_docs into {context}, while
# RunnablePassthrough forwards the question unchanged into {question}. The
# filled prompt is sent to the model and the reply is reduced to a plain string.
chain = (
        {"context": retriever | format_docs,
         "question": RunnablePassthrough()}
        | prompt
        | model
        | StrOutputParser()
)

# Alternative questions kept for manual experimentation; only one is active.
# result = chain.invoke("How to limit permissions?")
# result = chain.invoke(" How AWS marketplace works, and why I cannot see deployment in my account?")
# result = chain.invoke("What are edge regions and how to use them?")
result = chain.invoke("Write me example of using blobs?")
# result = chain.invoke("How to use BLOB store in CrateDB? and what are the benefits?")
result


'To use blobs in Crate, you first need to create a blob table. This can be done by issuing a SQL statement using the Crate Shell, CraSh. Here is an example command to create a blob table:\n\n```sh\ncrash -c "create blob table myblobs clustered into 3 shards with (number_of_replicas=1)"\n```\n\nAfter creating the blob table, you can upload a blob by issuing a PUT request. Here is an example command to upload a blob:\n\n```sh\ncurl -isSX PUT \'127.0.0.1:4200/_blobs/myblobs/4a756ca07e9487f482465a99e8286abc86ba4dc7\' -d \'contents\'\n```\n\nAdditionally, you can retrieve, delete, and manage blobs using different commands like SELECT for querying, DELETE for deleting, and ALTER for altering the blob table. You can find more information about using blobs in CrateDB from the [official documentation](https://cratedb.com/docs/crate/reference/en/3.3/general/blobs.html).'

In [12]:
from IPython.display import display, Markdown

# Render the answer as formatted Markdown instead of the raw string repr.
display(Markdown(result))

To use blobs in Crate, you first need to create a blob table. This can be done by issuing a SQL statement using the Crate Shell, CraSh. Here is an example command to create a blob table:

```sh
crash -c "create blob table myblobs clustered into 3 shards with (number_of_replicas=1)"
```

After creating the blob table, you can upload a blob by issuing a PUT request. Here is an example command to upload a blob:

```sh
curl -isSX PUT '127.0.0.1:4200/_blobs/myblobs/4a756ca07e9487f482465a99e8286abc86ba4dc7' -d 'contents'
```

Additionally, you can retrieve, delete, and manage blobs using different commands like SELECT for querying, DELETE for deleting, and ALTER for altering the blob table. You can find more information about using blobs in CrateDB from the [official documentation](https://cratedb.com/docs/crate/reference/en/3.3/general/blobs.html).

In [13]:
# Query: edge regions; the answer is rendered as Markdown.
display(Markdown(chain.invoke("What are edge regions and how to use them?")))

Edge regions are components of a deployed Edge Region that are not updated automatically. To continue getting new features, bugfixes, and security updates, users should update their Edge Regions regularly. If a region is outdated, users will see an "Upgrade this Edge region" button next to the region. Clicking on this button will show a command that updates the Edge Region, which can be pasted into the environment where the Edge cluster is deployed to upgrade it. 

Additionally, users can create custom regions in the CrateDB Cloud Console by going to the Regions tab and clicking on "Create Edge region." This is useful for hosting a cluster locally without relying on existing cloud providers.

Sources:
- [CrateDB Documentation - Upgrade the Edge Region](https://cratedb.com/docs/cloud/en/latest/tutorials/edge/introduction.html#edge-disclaimer)
- [CrateDB Documentation - Create a custom region](https://cratedb.com/docs/cloud/en/latest/tutorials/edge/introduction.html#edge-disclaimer)

In [14]:
# Query: AWS Marketplace subscription and a deployment that is not visible.
display(Markdown(chain.invoke("How AWS marketplace works, and why I cannot see deployment in my account?")))

To deploy a cluster on CrateDB Cloud via AWS Marketplace, you first need to sign up via AWS Marketplace and have an AWS Marketplace account. Your hourly usage will be billed directly by Amazon, not by Crate.io. After subscribing to the CrateDB Cloud offer on AWS Marketplace, it can take up to 10 minutes for the subscription to be confirmed and usable in the CrateDB Cloud console. If you cannot see the deployment in your account, you may need to unsubscribe from the AWS Marketplace offer by logging in to the AWS Marketplace website with the account used for the subscription and following the steps to unsubscribe.

For more information on how AWS Marketplace works and how to subscribe to CrateDB Cloud's offering on AWS Marketplace, you can refer to the official documentation [here](https://cratedb.com/docs/cloud/en/latest/tutorials/deploy/marketplace/subscribe-aws.html).

In [15]:
# Query: recency-dependent question; the prompt supplies a fixed "today" date.
display(Markdown(chain.invoke("What are recent blog posts about CrateDB?")))

Recent blog posts about CrateDB include:

1. [Distributed query execution in CrateDB: What you need to know](https://cratedb.com/blog/distributed-query-execution-in-cratedb-what-you-need-to-know)
2. [Ingesting with CrateDB](https://cratedb.com/blog/cratedb-v4-6-is-now-stable-and-ready-to-use)
3. [Exciting news and tutorials for data lovers](https://cratedb.com/blog/exciting-news-and-tutorials-for-datalovers)
4. [Indexing and Storage in CrateDB](https://cratedb.com/product/features/indexing-columnar-storage-aggregations)

In [16]:
# Query: request for a Python code example.
display(Markdown(chain.invoke("Write me example python code to use CrateDB?")))

To use CrateDB in Python, you can refer to the official CrateDB Python driver documentation [here](https://cratedb.com/connect/python). Additionally, you can explore different examples on how to use the CrateDB Python client [here](https://cratedb.com/docs/python/en/latest/by-example/index.html#by-example).

Here is an example of Python code using CrateDB:

```python
import crate
from crate import client

# Establish a connection to CrateDB
connection = client.connect('http://localhost:4200', username='your_username', password='your_password')

# Create a cursor object
cursor = connection.cursor()

# Execute a SQL query
cursor.execute("SELECT * FROM your_table")

# Fetch the results
results = cursor.fetchall()

# Print the results
for result in results:
    print(result)

# Close the cursor and connection
cursor.close()
connection.close()
```

This is a basic example of how to connect to CrateDB, execute a query, fetch results, and print them using the CrateDB Python driver. You can find more examples and detailed documentation on how to interact with CrateDB using Python on the official website.

In [17]:
# Query: request for a Go code example.
display(Markdown(chain.invoke("Write me example golang code to use CrateDB?")))

To connect to CrateDB using Go, you can utilize the pgx driver. Below is an example of how to use pgx to connect to CrateDB:

```go
package main

import (
    "context"
    "fmt"
    "os"

    "github.com/jackc/pgx/v5"
)

func main() {
    // Define the connection string
    connString := "postgresql://localhost:5432/cratedb"

    // Establish a connection to CrateDB
    conn, err := pgx.Connect(context.Background(), connString)
    if err != nil {
        fmt.Fprintf(os.Stderr, "Unable to connect to database: %v\n", err)
        os.Exit(1)
    }
    defer conn.Close(context.Background())

    // Query the database
    rows, err := conn.Query(context.Background(), "SELECT * FROM your_table")
    if err != nil {
        fmt.Fprintf(os.Stderr, "Query failed: %v\n", err)
        os.Exit(1)
    }
    defer rows.Close()

    // Process the results
    for rows.Next() {
        var id int
        var name string
        err := rows.Scan(&id, &name)
        if err != nil {
            fmt.Fprintf(os.Stderr, "Scan failed: %v\n", err)
            os.Exit(1)
        }
        fmt.Printf("ID: %d, Name: %s\n", id, name)
    }
}
```

This code snippet demonstrates how to establish a connection to CrateDB using the pgx driver and execute a simple query. For more information on using CrateDB with Go, you can refer to the [official documentation](https://cratedb.com/connect/go).

In [18]:
# Query: building RAG search with CrateDB and OpenAI.
display(Markdown(chain.invoke("create RAG search with CrateDB and OpenAI?")))

To create a Retrieval Augmented Generation (RAG) search with CrateDB and OpenAI, you can follow the steps outlined in the blog post titled "Leverage Vector Search to Use Embeddings and Generative AI: Retrieval Augmented Generation (RAG) with CrateDB." This post introduces the RAG approach based on CrateDB as a vector store and the OpenAI embedding model. It explains the role of the vector store and vector similarity search in the RAG workflow with CrateDB.

You can find more details and the complete guide on how to leverage CrateDB and OpenAI for RAG search by visiting the following source link: [Leverage Vector Search to Use Embeddings and Generative AI: Retrieval Augmented Generation (RAG) with CrateDB](https://cratedb.com/blog/leverage-vector-search-to-use-embeddings-and-generative-ai-retrieval-augmented-generation-rag-with-cratedb).

Additionally, the blog post provides a high-level overview of the RAG workflow with CrateDB, detailing the key steps involved in building a knowledge-based index, optimizing information retrieval, and fetching relevant documents from the vector store using a search algorithm. This information can be crucial in setting up a successful RAG search system with CrateDB and OpenAI.

For further insights and examples on using CrateDB for efficient data storage and implementing RAG systems, you can refer to the related blog posts and resources available on the CrateDB website.

In [19]:
# Query: adding a fulltext index to an existing table.
display(Markdown(chain.invoke("how to alter table and add fulltext index?")))

To alter a table and add a fulltext index, you can use the following syntax in CrateDB:

```sql
ALTER TABLE table_name ADD INDEX index_name USING fulltext(column_name) WITH (analyzer = 'english');
```

Replace `table_name` with the name of your table, `index_name` with the desired name for your index, and `column_name` with the column you want to create the fulltext index on. Make sure to specify the analyzer you want to use, in this case, 'english'.

You can refer to the [CrateDB documentation on fulltext indices](https://cratedb.com/docs/crate/reference/en/master/general/ddl/fulltext-indices.html) for more information on this topic.

In [20]:
# Query: adding a vector column usable for KNN search to an existing table.
display(Markdown(chain.invoke("how to alter table and add vector type field that allows for KNN search?")))

To alter a table and add a vector type field that allows for KNN search in CrateDB, you can follow the steps below:

1. Create a table with the desired fields, including the vector type field:
   ```sql
   CREATE TABLE my_data (
     xs FLOAT_VECTOR(2)
   );
   ```

2. Insert data into the table:
   ```sql
   INSERT INTO my_data VALUES ([1.6,2.7]), ([4.6, 7.8]);
   ```

3. With CrateDB version 5.5, you can now leverage vector support and KNN search. This allows you to store dense vectors of float values and perform approximate nearest neighbor search queries against them. The current dimension limit for a float_vector data type can be configured for is 2048. 

You can find more information about vector data type and vector store functionality in CrateDB in this [source](https://cratedb.com/blog/cratedb-v5.5-vector-store).

By following these steps, you can alter a table and add a vector type field that enables KNN search functionality in CrateDB.

In [21]:
# Query: CREATE TABLE with an indexed vector column for KNN search.
display(Markdown(chain.invoke("create table with fields ID, name, vector, and index vector field for KNN search?")))

To create a table with fields ID, name, vector, and index the vector field for KNN search in CrateDB, you can use the following SQL query:

```sql
CREATE TABLE my_table (
  ID INTEGER PRIMARY KEY,
  name TEXT,
  vector FLOAT_VECTOR(2),
  INDEX vector_index USING KNN WITH (dimension = 2)
);
```

This query creates a table named `my_table` with fields ID (as the primary key), name, vector (containing the float vector data), and an index on the vector field using KNN for efficient nearest neighbor search.

You can learn more about KNN search and vector support in CrateDB from [CrateDB Documentation](https://cratedb.com/docs/crate/reference/en/master/general/ddl/data-types.html) and [CrateDB Blog](https://cratedb.com/blog/unlocking-the-power-of-vector-support-and-knn-search-in-cratedb).

In [22]:
# Query: limits and limitations of CrateDB.
display(Markdown(chain.invoke("What are limits and limitations of CrateDB?")))

The limitations of CrateDB include the fact that the single-node CRFEE plan does not offer capabilities such as high speed, scalability, and high availability that you would get from a standard CrateDB distributed cluster. However, you can easily create a new cluster with another plan from within the Cloud Console. 

Source: [CrateDB Limitations](https://cratedb.com/lp-crfree)

In [23]:
# Query: benefits of using CrateDB.
display(Markdown(chain.invoke("What are the benefits of using CrateDB?")))

The benefits of using CrateDB include:
- Horizontal scalability, allowing users to add nodes as needed
- Support for structured, semi-structured, and unstructured schemas
- Multi-platform support, enabling CrateDB to run anywhere
- Simplified data infrastructure and overcoming challenges of complexity and technical debt
- Native SQL query language for data querying and manipulation, reducing the learning curve
- Efficient management of extensive concurrent reads and writes in a distributed system
- High performance, scalability, and flexibility, reducing total cost of ownership (TCO)
- Fully distributed query engine and columnar storage for immediate data availability, ad-hoc queries, hyper-fast aggregations, and in-memory SQL query performance
- Query response time in milliseconds for efficient data analysis
- Support for dynamic schemas, queryable objects, time-series data, and real-time full-text search
- Powerful REST API for managing and accessing CrateDB programmatically

Sources:
- [CrateDB Customers](https://cratedb.com/customers/)
- [CrateDB Blog](https://cratedb.com/blog/)
- [CrateDB Solutions](https://cratedb.com/solutions/database-consolidation)
- [CrateDB Features](https://cratedb.com/product/features/)

In [24]:
# Query: deliberately short question that does not name CrateDB.
display(Markdown(chain.invoke("What are technical limitations?")))

Technical limitations include the size and complexity of machine-generated data, diverse architecture of machine data pipelines, dated or proprietary communication protocols used by historians, the constraints of strong consistency in relational databases, and the scalability limits and high costs associated with traditional data solutions in industrial environments. These limitations can hinder real-time analytics, integration with other systems and tools, and efficient database performance. 

Sources:
- [Restrictions](https://cratedb.com/subscription-agreement)
- [IIoT data challenges](https://cratedb.com/blog/new-partner-on-board-welcome-roosi)
- [Historians limitations](https://cratedb.com/blog/time-series-databases-operational-historians)
- [Strong consistency constraints](https://cratedb.com/blog/myths-relational-databases-operational-historians)
- [Industrial data challenges](https://cratedb.com/blog/digital-transformation-factory-floor)

In [25]:
# Query: whether index creation blocks writes.
display(Markdown(chain.invoke("Does index creation block write operations?")))

Yes, index creation can block write operations if the `write.wait_for_active_shards` setting is configured to wait for a certain number of replica shards to be fully replicated. This can cause write operations to block until the replica shards are fully replicated again or until a timeout occurs if replication is not fast enough. You can find more information about this in the CrateDB documentation on [index creation](https://cratedb.com/docs/crate/reference/en/5.6/sql/statements/create-table.html).

In [26]:
# Query: support for conditional indices.
display(Markdown(chain.invoke("Does crate supports conditional indices")))

Yes, CrateDB supports automatic indexing for all attributes regardless of their depth, enabling rapid search capabilities for stored objects and facilitating efficient updates. This indexing strategy is based on a Lucene index and automatically generates indexes, including strategies like Inverted Index for text values, Block k-d trees for numeric, date, and geospatial values, and Hierarchical Navigable Small World (HNSW) graphs for high dimensional vectors. You can find more information about CrateDB's indexing capabilities [here](https://cratedb.com/product/features/data-storage).

In [27]:
# Query: auto-incremented ID columns.
display(Markdown(chain.invoke("How to create ID field that is autoincremented?")))

Based on the provided context, it seems that in CrateDB, primary keys cannot be auto-generated. Instead, you would need to specify the primary key when inserting data. If you want a field that is autoincremented, you would need to handle the autoincrement logic in the application code that interacts with the database.

You can refer to the official CrateDB documentation on primary keys for more information:
- [CrateDB Primary Keys Documentation](https://cratedb.com/docs/guide/performance/inserts/tuning.html)

In [28]:
# Query: creating analyzers for fulltext search.
display(Markdown(chain.invoke("how to create analysers for fulltext search?")))

To create analyzers for fulltext search, you can use language-specific analyzers, tokenizers, and token-filters to achieve proper search results for data provided in a certain language. For example, you can refer to the documentation on CrateDB which provides information on how to define a fulltext index with an analyzer. 

You can also create custom analyzers or extend built-in analyzers according to your requirements. For detailed steps and examples, you can check out the Fulltext Indices section in the CrateDB documentation [here](https://cratedb.com/docs/crate/reference/en/4.8/general/ddl/fulltext-indices.html). Additionally, you can refer to the Create a Custom Analyzer section for guidance on creating a custom analyzer and the CREATE ANALYZER section for syntax reference [here](https://cratedb.com/docs/crate/reference/en/3.3/general/ddl/analyzers.html).

In [29]:
# Query: keyword-style request about passwords and admin access.
display(Markdown(chain.invoke("give me information about password and admin")))

Based on the provided context, information about passwords and admin access in CrateDB can be found in the following sources:

1. Password authentication is used in CrateDB to control access via the CrateDB Admin UI and other interfaces. When the password authentication method is used, the client has to provide a password in addition to the username. This is important for securing access to the cluster and its data. Source: [CrateDB Documentation](https://cratedb.com/docs/crate/reference/en/5.6/admin/auth/methods.html)

2. Credentials for accessing the Admin UI of the cluster consist of a username (which is always 'admin') and a password. The password can be changed for security reasons. This information is crucial for managing access to the administrative functions of the cluster. Source: [CrateDB Cloud Documentation](https://cratedb.com/docs/cloud/en/latest/reference/overview.html#import)

These sources provide insights into how passwords are used for authentication purposes and how they are related to accessing the Admin UI in CrateDB.

In [30]:
# Query: a bare phrase rather than a question.
display(Markdown(chain.invoke("Shared file system implementation of the BlobStoreRepository")))

The shared file system implementation of the BlobStoreRepository in CrateDB allows for defining a custom directory path for storing blob data, which can be different from the normal data path. This can be useful for scenarios where normal data is stored on a fast SSD and blob data is stored on a large, cheap spinning disk. This approach simplifies administration work by applying the same replication and sharding rules to both blob data and other types of data in the datastore. 

For more information, you can refer to the official documentation on Blob Storage in CrateDB: [CrateDB Blob Storage Documentation](https://cratedb.com/docs/crate/reference/en/master/general/blobs.html)

In [31]:
# Query: yes/no question about the Cloud UI being open source.
display(Markdown(chain.invoke("Is Cloud UI opensource?")))

Based on the provided context, the Cloud UI is not open source. The blog post mentions that the company has shifted towards fully managed SaaS solutions and acknowledges that they have not been optimally serving open-source users. Additionally, the company made a decision to split organization management and cluster management into two distinct contexts as part of a foundational improvement to how users interact with the platform, indicating a focus on proprietary features rather than open-source components. 

Source: [CrateDB Blog](https://cratedb.com/blog/farewell-to-the-cratedb-enterprise-license-faq)

In [32]:
# Query: combining vector search with fulltext search.
display(Markdown(chain.invoke("How to do fusion search and connect vector search with fulltext search")))

To perform fusion search and connect vector search with full-text search, one can leverage the advanced search capabilities offered by CrateDB. CrateDB allows users to combine vector, full-text, and keyword searches for improved semantic similarity and keyword matching, enhancing search precision and relevance. By integrating vector search with full-text search, users can achieve a more comprehensive search experience that enhances semantic similarity and keyword matching, thus improving search precision and relevance.

Source: [CrateDB - Solutions: Vector Database](https://cratedb.com/solutions/vector-database)

In [33]:
# Query: fulltext matching.
# NOTE(review): "MATH" is probably a typo for the MATCH predicate; the string
# is left as-is because it is the query the recorded output was produced with.
display(Markdown(chain.invoke("How to MATH fulltext ")))

To perform a full-text search in a database, you need to create a full-text index with an analyzer for the specific column you want to search. You can learn how to set up your database for full-text search, create the relevant indices, and query your text data efficiently by following the guidelines provided in the documentation on the CrateDB website. 

For example, you can refer to the documentation on full-text search in the Netflix Catalog [here](https://cratedb.com/docs/guide/domain/search/index.html#fts) to understand the process better. Additionally, you can start with a basic SELECT statement on all columns and limit the output to display only a few records to explore the data quickly, as shown in this [tutorial](https://cratedb.com/docs/cloud/en/latest/tutorials/full-text.html).

Remember that it's essential to create a full-text index with an analyzer for the column you want to search, as explained in the CrateDB documentation on fulltext indices [here](https://cratedb.com/docs/crate/reference/en/5.6/general/dql/fulltext.html). Keep in mind that querying multiple index columns with different index types within the same MATCH predicate is not possible, as mentioned in the documentation.