In [2]:
# Connect to Ollama running Llama3
import dspy
# Client for the "llama3:instruct" model served by a local Ollama instance,
# with a 4000-token generation cap and a 480 second request timeout.
llama3_ollama = dspy.OllamaLocal(
    model="llama3:instruct",
    max_tokens=4000,
    timeout_s=480,
)

# Configure DSPy to use this client as its language model.
dspy.settings.configure(lm=llama3_ollama)

# Smoke test: the bare last expression displays the model's reply as the cell output.
llama3_ollama("say hello")

["Hello! It's nice to meet you. Is there something I can help you with, or would you like to chat?"]

In [3]:
# Load blogs into Weaviate
import weaviate

# Open the client used by every later cell via weaviate.connect_to_local().
weaviate_client = weaviate.connect_to_local()

# Delete the "WeaviateBlogChunk" collection so that the create call further
# down the notebook (same collection name) starts from scratch.
# NOTE(review): this discards any previously uploaded chunks and generated
# properties in that collection — confirm that is intended before re-running.
weaviate_client.collections.delete("WeaviateBlogChunk")

In [4]:
import os
import re


def chunk_list(lst, chunk_size):
    """Partition ``lst`` into consecutive slices of at most ``chunk_size`` items.

    The last slice is shorter when ``len(lst)`` is not a multiple of
    ``chunk_size``; an empty ``lst`` produces an empty list.
    """
    chunks = []
    for start in range(0, len(lst), chunk_size):
        stop = start + chunk_size
        chunks.append(lst[start:stop])
    return chunks


def split_into_sentences(text):
    """Break ``text`` into trimmed, non-empty sentences with a regex heuristic.

    A boundary is a single whitespace character that directly follows ``.`` or
    ``?``, unless the text just before it looks like a dotted abbreviation
    (word char, dot, word char, any char — e.g. ``e.g.``) or a capitalised
    two-letter abbreviation (e.g. ``Mr.``). ``!`` is not treated as a boundary.
    """
    boundary = r'(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<=\.|\?)\s'
    sentences = []
    for fragment in re.split(boundary, text):
        trimmed = fragment.strip()
        # Drop fragments that contain only whitespace.
        if trimmed:
            sentences.append(trimmed)
    return sentences


def read_and_chunk_index_files(main_folder_path, sentences_per_chunk=5, index_filename='index.mdx'):
    """Read each subfolder's index file, split it into sentences, and group them into chunks.

    Args:
        main_folder_path: Directory whose immediate subfolders are scanned.
        sentences_per_chunk: How many sentences are joined (with a single
            space) into one chunk. Defaults to 5.
        index_filename: Name of the file read inside each subfolder.
            Defaults to 'index.mdx'.

    Returns:
        A flat list of chunk strings covering every index file found, ordered
        by subfolder name and then by position within the file.

    Raises:
        OSError: If ``main_folder_path`` cannot be listed (e.g. it does not exist).
    """
    blog_chunks = []
    # os.listdir returns entries in arbitrary order; sorting keeps the chunk
    # order (and therefore anything indexed by position) stable across runs.
    for folder_name in sorted(os.listdir(main_folder_path)):
        subfolder_path = os.path.join(main_folder_path, folder_name)
        if not os.path.isdir(subfolder_path):
            continue

        index_file_path = os.path.join(subfolder_path, index_filename)
        if not os.path.isfile(index_file_path):
            continue

        # Only the read needs the file handle; release it before processing.
        with open(index_file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        sentences = split_into_sentences(content)
        blog_chunks.extend(
            ' '.join(chunk) for chunk in chunk_list(sentences, sentences_per_chunk)
        )
    return blog_chunks


# Chunk every blog post found under the dataset folder.
main_folder_path = '../datasets/weaviate-blogs'
blog_chunks = read_and_chunk_index_files(main_folder_path)

# Show the chunk count followed by a blank line, then the first chunk as a sample.
print(len(blog_chunks), end="\n\n")
print(blog_chunks[0])

1182

---
title: Combining LangChain and Weaviate
slug: combining-langchain-and-weaviate
authors: [erika]
date: 2023-02-21
tags: ['integrations']
image: ./img/hero.png
description: "LangChain is one of the most exciting new tools in AI. It helps overcome many limitations of LLMs, such as hallucination and limited input lengths."
---
![Combining LangChain and Weaviate](./img/hero.png)

Large Language Models (LLMs) have revolutionized the way we interact and communicate with computers. These machines can understand and generate human-like language on a massive scale. LLMs are a versatile tool that is seen in many applications like chatbots, content creation, and much more. Despite being a powerful tool, LLMs have the drawback of being too general.


In [5]:
import weaviate.classes.config as wvcc

# Vectorizer settings: embeddings come from the "snowflake-arctic-embed:335m"
# model behind the given Ollama endpoint.
# NOTE(review): the host.docker.internal address presumably lets a
# containerised Weaviate reach Ollama on the host machine — confirm.
ollama_vectorizer = wvcc.Configure.Vectorizer.text2vec_ollama(
    api_endpoint="http://host.docker.internal:11434",
    model="snowflake-arctic-embed:335m",
)

# Schema: the chunk text plus two properties that later cells fill in.
# "query" is explicitly excluded from vectorization.
blog_chunk_properties = [
    wvcc.Property(name="content", data_type=wvcc.DataType.TEXT),
    wvcc.Property(name="query", data_type=wvcc.DataType.TEXT, skip_vectorization=True),
    wvcc.Property(name="is_high_quality_query", data_type=wvcc.DataType.BOOL),
]

collection = weaviate_client.collections.create(
    name="WeaviateBlogChunk",
    vectorizer_config=ollama_vectorizer,
    properties=blog_chunk_properties,
)

In [6]:
from weaviate.util import get_valid_uuid
from uuid import uuid4
import time

# Handle to the collection that receives the chunks.
blogs = weaviate_client.collections.get("WeaviateBlogChunk")

# UUIDs of the uploaded objects, in upload order; later cells iterate over
# this list to fetch and update each object.
blog_chunk_uuids = []

start = time.time()
for blog_chunk in blog_chunks:
    # Generate the UUID client-side so it can be recorded for later lookups.
    # (Named `chunk_uuid` rather than `id` to avoid shadowing the builtin
    # for the rest of the notebook session.)
    chunk_uuid = get_valid_uuid(uuid4())
    blogs.data.insert(
        properties={"content": blog_chunk},
        uuid=chunk_uuid,
    )
    # Record the UUID only once the insert has succeeded, so the list never
    # references an object that was not stored.
    blog_chunk_uuids.append(chunk_uuid)

print(f"Uploaded and vectorized {len(blog_chunks)} blog chunks in {time.time() - start} seconds.")

Uploaded and vectorized 1182 blog chunks in 110.77285599708557 seconds.


In [18]:
from pydantic import BaseModel
from typing import Any
import functools

# Signature describing one "generate a property value" task: three text inputs
# (property name, reference properties, instruction) and one text output.
# NOTE(review): DSPy uses a Signature's docstring as the task instructions sent
# to the LM, so the docstring and the `desc` strings below are prompt text —
# rewording them changes model behaviour, not just documentation.
class UpdateProperty(dspy.Signature):
    """I need your help to generate the value of a property by following the instruction using the provided name-value property references. VERY IMPORTANT!! Please follow this next instruction carefully. It is EXTREMELY IMPORTANT that you only output the property value and nothing else. Do not start your response with something like `Sure, I can help with that!` or anything of the sort. JUST OUTPUT THE PROPERTY VALUE!!
    """

    # Input: name of the property being generated (e.g. "query").
    property_name = dspy.InputField(
        desc="The name of the property that you should update."
    )
    # Input: existing properties of the object, passed by callers in this
    # notebook as a single "name: value ..." string.
    references = dspy.InputField(
        desc="The name-value property pairs that you should refer to while updating the property."
    )
    # Input: task-specific prompt telling the model how to derive the value.
    instruction = dspy.InputField(
        desc="The prompt to use when generating the updated property value."
    )
    # Output: the generated value as raw text; Program.forward converts it to
    # the requested Python type.
    property_value = dspy.OutputField(
        desc="The value of the updated property. VERY IMPORTANT!! ONLY OUTPUT THIS VALUE!! Do not output anything other than this value."
    )


class Program(dspy.Module):
    """Generate a property value with the LM and convert it to a target Python type.

    The module runs the ``UpdateProperty`` signature through ``dspy.Predict``,
    registers a ``dspy.Suggest`` that the raw text is convertible to
    ``property_value_type``, and returns the converted value.
    """

    def __init__(self, property_value_type: Any) -> None:
        """Store the target type and build the predictor.

        Args:
            property_value_type: Callable/type used to convert the generated
                text (the notebook uses ``int``, ``str`` and ``bool``).
        """
        # Run the base-class initializer so dspy.Module's own state is set up
        # before attributes are attached.
        super().__init__()
        self.property_value_type = property_value_type
        self.update_property = dspy.Predict(UpdateProperty)

    def assert_property_value_type(self, property_value: str) -> bool:
        """Return True when ``property_value`` can be converted to the target type.

        NOTE(review): ``str(...)`` and ``bool(...)`` never raise on a string,
        so for those target types this check always passes; it is only
        discriminating for types such as ``int`` or ``float``.
        """
        try:
            self.property_value_type(property_value)
            return True
        except (ValueError, TypeError):
            return False

    def failed_assertion_message(self, property_name: str) -> str:
        """Build the feedback text attached to the Suggest when the type check fails."""
        return f"""
        The value of the '{property_name}' property does not match the expected type: {self.property_value_type}.
        Please ensure that the generated value adheres to the specified type.
        """

    def forward(self, property_name: str, references: str, instruction: str) -> Any:
        """Generate the property value and return it converted to the target type.

        Args:
            property_name: Name of the property to generate.
            references: Existing property values, formatted as one string.
            instruction: Prompt describing how to derive the value.

        Returns:
            The generated value converted with ``property_value_type``. For
            ``bool`` the result is True only when the text is "true"
            (case-insensitive, surrounding whitespace ignored).

        Raises:
            ValueError, TypeError: If the final conversion fails.
        """
        prediction: dspy.Prediction = self.update_property(
            property_name=property_name, references=references, instruction=instruction
        )
        raw_value = prediction.property_value

        dspy.Suggest(
            self.assert_property_value_type(raw_value),
            self.failed_assertion_message(property_name),
        )

        if self.property_value_type is bool:
            # bool("False") is True, so the text is parsed instead of converted.
            # Strip first: a reply such as "True\n" or " true" must still count
            # as True rather than silently becoming False.
            return raw_value.strip().lower() == "true"

        return self.property_value_type(raw_value)

In [19]:
from dspy.primitives.assertions import assert_transform_module, backtrack_handler


# Int-typed program, wrapped so DSPy's backtrack handler processes its
# dspy.Suggest with max_backtracks=1.
program = Program(property_value_type=int)
single_backtrack_handler = functools.partial(backtrack_handler, max_backtracks=1)
program_with_assertions = assert_transform_module(program, single_backtrack_handler)

# Toy inputs for a quick end-to-end check of the typed output.
property_name = "age"
references = "name: John, occupation: Engineer"
instruction = "Update the 'age' property to a random integer between 25 and 35."

result = program_with_assertions(
    property_name=property_name,
    references=references,
    instruction=instruction,
)

# Show both the value and its Python type to confirm the conversion happened.
print(result)
print(type(result))

29
<class 'int'>


In [20]:
# Rebind the notebook-level `program` to a str-typed instance; the
# query-generation loop below calls it through this name.
program = Program(property_value_type=str)

# Prompt passed as the `instruction` input for every chunk when the "query"
# property is generated. The literal is runtime text: keep it byte-identical
# (including whitespace) unless a prompt change is intended.
instruction = """
Given a snippet from a blog post published by Weaviate, a Vector Database company, construct a question that delves deeply into the underlying concepts and explores new dimensions beyond the provided information. 
VERY IMPORTANT!! These queries should emphasize the advantages of semantic search with vector embeddings over traditional keyword-based methods like BM25. Ensure that the query explores related concepts, implications, or applications without directly repeating any keywords from the source document.
"""

In [10]:
import time
import json

# Generate a "query" property for every uploaded chunk and write it back to
# Weaviate. Relies on `blog_chunk_uuids`, `blogs`, `program` (str-typed) and
# `instruction` from earlier cells.
start_gfl = time.time()

for idx, chunk_uuid in enumerate(blog_chunk_uuids):
    # Get the object
    obj = blogs.query.fetch_object_by_id(chunk_uuid, return_properties="content")

    # Format the references as "name: value" pairs in one string
    references = " ".join(f"{k}: {v}" for k, v in obj.properties.items())

    # Run GFL
    query = program(
        property_name="query",
        references=references,
        instruction=instruction,
    )

    # Print the first ten generated queries as a sample
    if idx < 10:
        if idx == 0:
            print("Printing some queries to illustrate what this is doing... \n")
        print(f"{query}\n")

    # Update property in Weaviate
    blogs.data.update(
        properties={
            "query": query
        },
        uuid=chunk_uuid
    )

    # Progress log every 100 objects. Emitted after the update so the reported
    # count and elapsed time cover work that has actually completed.
    if idx % 100 == 99:
        print(f"\nLOG: {idx+1} queries generated in {time.time() - start_gfl} seconds.\n")

print(f"{len(blog_chunk_uuids)} objects have been updated in {time.time() - start_gfl} seconds.")

Printing some queries to illustrate what this is doing... 

What are the limitations of relying solely on LLMs for information retrieval and how can combining LangChain with Weaviate's vector embeddings overcome these limitations to provide more accurate and relevant search results?

What are the limitations of LLMs and how do emerging technologies like LangChain help overcome them?

What are the implications of combining sequential chains with Weaviate's vector embeddings for building more accurate and efficient LLM chatbots?

What type of mammal lays the biggest eggs?

What are the limitations of traditional keyword-based methods in processing and storing large sequences of tokens, and how do vector embeddings address these challenges?

What are the underlying benefits of using Weaviate's vector database for semantic search, and how does it enable more accurate and relevant results compared to traditional keyword-based methods like BM25?

What is the relationship between local memory

In [42]:
import time
import json

# Label every generated query as high quality or not and write the boolean
# back to Weaviate. Rebinds the notebook-level `program` and `instruction`.
program = Program(property_value_type=bool)

# Prompt passed as the `instruction` input; runtime text, keep byte-identical
# unless a prompt change is intended.
instruction = """
Critically examine the query. Output 'True' ONLY if ALL of the following conditions are met:

The query directly asks about a technological concept (e.g., LLMs, AI, databases, programming).
The query is not merely mentioned as an example in the content, but genuinely seeks information about technology.
The query demonstrates an understanding of and engagement with the technological themes in the content.
If ANY of these conditions are not met, output 'False'.
"""
start_gfl = time.time()

for idx, chunk_uuid in enumerate(blog_chunk_uuids):
    # Get the object, including the query generated in the previous step
    obj = blogs.query.fetch_object_by_id(chunk_uuid, return_properties=["content", "query"])

    # Format the references as "name: value" pairs in one string
    references = " ".join(f"{k}: {v}" for k, v in obj.properties.items())

    # Run GFL
    is_high_quality_query = program(
        property_name="is_high_quality_query",
        references=references,
        instruction=instruction,
    )

    # Print the first ten labels next to their queries as a sample
    if idx < 10:
        if idx == 0:
            print("Printing some queries to illustrate what this is doing... \n")
        query = obj.properties["query"]
        print(f"Is this a high quality query?\n{query}")
        print(f"{is_high_quality_query}\n")

    # Update property in Weaviate
    blogs.data.update(
        properties={
            "is_high_quality_query": is_high_quality_query
        },
        uuid=chunk_uuid
    )

    # Progress log every 100 objects. Emitted after the update so the reported
    # count and elapsed time cover work that has actually completed.
    if idx % 100 == 99:
        print(f"\nLOG: {idx+1} queries labeled in {time.time() - start_gfl} seconds.\n")

print(f"{len(blog_chunk_uuids)} objects have been updated in {time.time() - start_gfl} seconds.")

Printing some queries to illustrate what this is doing... 

Is this a high quality query?
What are the limitations of relying solely on LLMs for information retrieval and how can combining LangChain with Weaviate's vector embeddings overcome these limitations to provide more accurate and relevant search results?
True

Is this a high quality query?
What are the limitations of LLMs and how do emerging technologies like LangChain help overcome them?
True

Is this a high quality query?
What are the implications of combining sequential chains with Weaviate's vector embeddings for building more accurate and efficient LLM chatbots?
True

Is this a high quality query?
What type of mammal lays the biggest eggs?
False

Is this a high quality query?
What are the limitations of traditional keyword-based methods in processing and storing large sequences of tokens, and how do vector embeddings address these challenges?
True

Is this a high quality query?
What are the underlying benefits of using Wea