In [None]:
# Install python client resources

!pip3 install --pre -I "weaviate-client==4.*"


In [1]:
# GET SAMPLE DATA

import requests
import json

# Download the data
resp = requests.get(
    "https://raw.githubusercontent.com/weaviate-tutorials/intro-workshop/main/data/jeopardy_1k.json"
)

# Load the data so you can see what it is
data = json.loads(resp.text)

# Parse the JSON and preview it
print(type(data), len(data))
print(json.dumps(data[1], indent=2))

<class 'list'> 1000
{
  "Air Date": "2005-11-18",
  "Round": "Jeopardy!",
  "Value": 200,
  "Category": "RHYME TIME",
  "Question": "Any pigment on the wall so faded you can barely see it",
  "Answer": "faint paint"
}


In [51]:
# OLD CLIENT
# Connect

import weaviate
import os

client = weaviate.Client(
    url="http://localhost:8080/",  # Replace with your endpoint
    additional_headers={
        "X-OpenAI-Api-Key": os.getenv(
            "OPENAI_API_KEY"
        )  # Replace with your OpenAI API key
    },
)

print(client.is_ready())

True


In [3]:
# CLEAR PRIOR

if client.schema.exists("Question"):
    client.schema.delete_class("Question")


In [4]:
# CONFIGURE INITIAL

class_definition = {
    
    "class": "Question",
    "vectorizer":"text2vec-openai",
    "vectorIndexConfig": {
        "distance" : "cosine"
    },
    
    'properties' : [
        {
            'name' : "question",
            "dataType" : [ "text" ]
        },
        {
            'name' : "answer",
            "dataType" : [ "text" ]
        },
        {
            'name' : 'round',
            'dataType': [ "text" ]
        }
    ]
}

client.schema.create_class(class_definition)

In [5]:
# LOAD TRAINING DATA


with client.batch() as batch:
    for o in data:
        obj_body = {
            'question':o[ "Question" ],
            'answer':o[ "Answer" ],
            'round':o[ "Round" ]
        }
        
        batch.add_data_object(
           data_object=obj_body,
           class_name="Question"
        )

            Please instead use the `client.batch.configure()` method to configure your batch and `client.batch` to enter the context manager.
            See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [65]:
# CHECK CONFIG

# print(json.dumps(client.schema.get("Question"), indent=2))
# print(client.schema.get())
# print(json.dumps(client.schema.get(), indent=2))
# print(json.dumps(client.schema.get("JeopardyCategory"), indent=2))
#type( client.schema.get())

response = client.schema.get()
# print(json.dumps(response, indent=2))

class_names = [c["class"] for c in response["classes"]]

print(json.dumps(client.schema.get("Question")["vectorIndexConfig"]["pq"], indent=2))



{
  "enabled": true,
  "bitCompression": false,
  "segments": 96,
  "centroids": 256,
  "trainingLimit": 100000,
  "encoder": {
    "type": "kmeans",
    "distribution": "log-normal"
  }
}


In [37]:

client.schema.update_config(
    "Question",
    {
        "vectorIndexConfig": {
            "pq": {
                "enabled": True,  # Enable PQ
                "trainingLimit": 100000,
                "segments": 96
            }
        }
    },
)

In [None]:
print(json.dumps(client.schema.get(), indent=2))

In [None]:
### NEW CLIENT
### NEW CLIENT
### NEW CLIENT
### NEW CLIENT
### NEW CLIENT
### NEW CLIENT
### NEW CLIENT
### NEW CLIENT
### NEW CLIENT
### NEW CLIENT


In [42]:
# GET SAMPLE DATA

import requests
import json

# Download the data
resp = requests.get(
    "https://raw.githubusercontent.com/weaviate-tutorials/intro-workshop/main/data/jeopardy_1k.json"
)

# Load the data so you can see what it is
data = json.loads(resp.text)

# Parse the JSON and preview it
print(type(data), len(data))
print(json.dumps(data[1], indent=2))

<class 'list'> 1000
{
  "Air Date": "2005-11-18",
  "Round": "Jeopardy!",
  "Value": 200,
  "Category": "RHYME TIME",
  "Question": "Any pigment on the wall so faded you can barely see it",
  "Answer": "faint paint"
}


In [43]:
# CONNECT

import weaviate, os, json
import weaviate.classes as wvc

client = weaviate.connect_to_local(
    headers={
        "X-OpenAI-Api-Key": os.environ[
            "OPENAI_API_KEY"
        ]  # Replace with your OpenAI API key
    }
)

client.is_ready()

True

In [45]:
# CLEAR PRIOR

if (client.collections.exists("JeopardyCategory")):
    client.collections.delete("JeopardyCategory")


In [46]:
# CONFIGURE SCHEMA

client.collections.create(
    name="JeopardyCategory",
    description="A Jeopardy! category",
    vectorizer_config=wvc.Configure.Vectorizer.text2vec_openai(),
    generative_config=wvc.Configure.Generative.openai(),
    properties=[
        wvc.Property(name="title", data_type=wvc.DataType.TEXT),
    ],
)

<weaviate.collections.collection.Collection at 0x127f7ce80>

In [47]:
# LOAD DATA

def parse_data():
    object_list = []
    for obj in data:
        object_list.append(
            {
                "question": obj["Question"],
                "answer": obj["Answer"],
                "round": obj["Round"],
            }
        )

    return object_list


jeopardy = client.collections.get("JeopardyCategory")
jeopardy.data.insert_many(parse_data())

# Check upload
response = jeopardy.aggregate.over_all(total_count=True)

# Should equal the number of objects uploaded
print(response.total_count)


1000


In [48]:
# ENABLE AND TRAIN

import weaviate.classes as wvc

jeopardy = client.collections.get("JeopardyCategory")
jeopardy.config.update(
    vector_index_config=wvc.Reconfigure.vector_index(
        pq_enabled=True, pq_segments=96, pq_training_limit=100000
    )
)


In [49]:
# CHECK RESULTS

jeopardy = client.collections.get("JeopardyCategory")
config = jeopardy.config.get()
pq_config = config.vector_index_config.pq

# print some of the config properties
print(f"Enabled: { pq_config.enabled }")
print(f"Training: { pq_config.training_limit }")
print(f"Segments: { pq_config.segments }")
print(f"Centroids: { pq_config.centroids }")

Enabled: True
Training: 100000
Segments: 96
Centroids: 256
