[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/weaviate/recipes/blob/main/integrations/Weaviate-Import-Example.ipynb)

# Weaviate Import

This notebook is used to populate the `WeaviateBlogChunk` collection.

You can connect to a Weaviate instance running on localhost, or create a free 14-day sandbox on [Weaviate Cloud (WCD)](https://console.weaviate.cloud/). The code in this notebook connects to a WCD cluster.

1. Create a cluster on WCD and grab your cluster URL and auth key (if enabled). The notebook reads them from the `WCD_CLUSTER_URL` and `WCD_CLUSTER_KEY` environment variables.

2. We're using Weaviate Embeddings to vectorize our data. Please note that it is only available through WCD at the moment. For other model options, see the [model providers documentation](https://weaviate.io/developers/weaviate/model-providers).

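3. Make sure the `llm-agent-frameworks/data` folder is accessible from the notebook's working directory (the path is relative). Each blog post is read from its own subfolder as `index.mdx`.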

4. Run this notebook and the 1102 blog chunks will be loaded into your Weaviate instance.

## Connect to Client

In [1]:
import weaviate
import os
from weaviate.classes.init import Auth
import weaviate.classes.config as wvcc
import re
from weaviate.util import get_valid_uuid
from uuid import uuid4

In [None]:
# Connect to the client

# Credentials come from the environment so they are never stored in the notebook.
WCD_CLUSTER_URL = os.getenv("WCD_CLUSTER_URL")
WCD_CLUSTER_KEY = os.getenv("WCD_CLUSTER_KEY")

# os.getenv returns None for an unset variable; fail here with an actionable
# message instead of passing None on to the client.
if not WCD_CLUSTER_URL:
    raise RuntimeError(
        "WCD_CLUSTER_URL is not set. Set it to your Weaviate Cloud cluster URL "
        "before running this cell."
    )

client = weaviate.connect_to_weaviate_cloud(
    cluster_url=WCD_CLUSTER_URL,
    auth_credentials=Auth.api_key(WCD_CLUSTER_KEY),
)

# Prints the readiness flag reported by the client.
print(client.is_ready())

## Create Schema

In [13]:
# CAUTION: Uncommenting and running this deletes ALL collections in the connected Weaviate instance, along with their objects

# client.collections.delete_all()

In [3]:
# Vectorizer settings: Weaviate Embeddings with the model named below.
blog_chunk_vectorizer = wvcc.Configure.Vectorizer.text2vec_weaviate(
    model="Snowflake/snowflake-arctic-embed-l-v2.0",  # default model
)

# Two text properties are declared for every object in the collection.
blog_chunk_properties = [
    wvcc.Property(name="content", data_type=wvcc.DataType.TEXT),
    wvcc.Property(name="author", data_type=wvcc.DataType.TEXT),
]

# Create the collection that the import section below writes into.
collection = client.collections.create(
    name="WeaviateBlogChunk",
    vectorizer_config=blog_chunk_vectorizer,
    properties=blog_chunk_properties,
)

## Chunk Blogs

In [None]:
def chunk_list(lst, chunk_size):
    """Partition ``lst`` into consecutive slices of at most ``chunk_size`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``. An empty ``lst`` produces an empty list.
    """
    pieces = []
    for start in range(0, len(lst), chunk_size):
        pieces.append(lst[start:start + chunk_size])
    return pieces

def split_into_sentences(text):
    """Break ``text`` into sentences with a regular expression.

    A split happens at a whitespace character that directly follows ``.`` or
    ``?``, unless the preceding characters look like a dotted abbreviation
    (such as ``e.g.``) or a capital letter, a lowercase letter and a period
    (such as ``Mr.``). Each piece is stripped and empty pieces are dropped.
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
    sentences = []
    for piece in re.split(boundary, text):
        trimmed = piece.strip()
        if trimmed:
            sentences.append(trimmed)
    return sentences

def read_and_chunk_index_files(main_folder_path, chunk_size=5):
    """Read each subfolder's ``index.mdx`` and return sentence chunks.

    Every immediate subfolder of ``main_folder_path`` that contains an
    ``index.mdx`` file is read as UTF-8, split with ``split_into_sentences``,
    grouped with ``chunk_list`` and each group is joined with single spaces.

    Args:
        main_folder_path: Directory whose immediate subfolders are scanned.
        chunk_size: Number of sentences per chunk (default 5).

    Returns:
        A flat list of chunk strings. Subfolders are visited in sorted name
        order so the result is the same on every run and machine.

    Raises:
        OSError: If ``main_folder_path`` cannot be listed or a file cannot be
            read.
    """
    blog_chunks = []
    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order (and therefore blog_chunks[0]) reproducible.
    for folder_name in sorted(os.listdir(main_folder_path)):
        subfolder_path = os.path.join(main_folder_path, folder_name)
        index_file_path = os.path.join(subfolder_path, 'index.mdx')
        # Skip plain files and subfolders that have no index.mdx.
        if not (os.path.isdir(subfolder_path) and os.path.isfile(index_file_path)):
            continue
        with open(index_file_path, 'r', encoding='utf-8') as file:
            content = file.read()
        # The file is closed at this point; the remaining work is in memory.
        sentences = split_into_sentences(content)
        blog_chunks.extend(
            ' '.join(chunk) for chunk in chunk_list(sentences, chunk_size)
        )
    return blog_chunks

# Example usage: build the chunk list from the blog posts on disk.
# The path is relative, so it is resolved against the notebook's working directory.
main_folder_path = "llm-agent-frameworks/data"
# Chunk strings consumed by the import loop in the "Import Objects" section.
blog_chunks = read_and_chunk_index_files(main_folder_path)


In [None]:
# Number of chunks produced; the introduction of this notebook mentions 1102.
len(blog_chunks)

In [None]:
# Show the first chunk as a quick sanity check of the sentence splitting.
blog_chunks[0]

## Import Objects

In [8]:
blogs = client.collections.get("WeaviateBlogChunk")

# Send the chunks in batches instead of one request per object.
# Only "content" is populated; the "author" property is left unset.
with blogs.batch.fixed_size(batch_size=100) as batch:
    for blog_chunk in blog_chunks:
        batch.add_object(properties={"content": blog_chunk})

# Batch imports collect per-object errors rather than raising, so check them
# explicitly and fail loudly if anything was rejected.
failed_objects = blogs.batch.failed_objects
if failed_objects:
    raise RuntimeError(
        f"{len(failed_objects)} of {len(blog_chunks)} chunks failed to import; "
        f"first failure: {failed_objects[0]}"
    )