## Load the data

In [100]:
import requests
import json
import weaviate
import os
from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv

In [101]:
# download the data

url = "https://raw.githubusercontent.com/weaviate-tutorials/quickstart/main/data/jeopardy_tiny.json"
response = requests.get(url)
data = json.loads(response.text)

# Preview the data type and length
print(type(data))
print(len(data))

# Preview the first entry
print(data[0])


<class 'list'>
10
{'Category': 'SCIENCE', 'Question': 'This organ removes excess glucose from the blood & stores it as glycogen', 'Answer': 'Liver'}


In [102]:
# print the rest of the data using function json_data

import json
def json_data(data, indent=2):
    print(json.dumps(data, indent=indent))


json_data(data)

[
  {
    "Category": "SCIENCE",
    "Question": "This organ removes excess glucose from the blood & stores it as glycogen",
    "Answer": "Liver"
  },
  {
    "Category": "ANIMALS",
    "Question": "It's the only living mammal in the order Proboseidea",
    "Answer": "Elephant"
  },
  {
    "Category": "ANIMALS",
    "Question": "The gavial looks very much like a crocodile except for this bodily feature",
    "Answer": "the nose or snout"
  },
  {
    "Category": "ANIMALS",
    "Question": "Weighing around a ton, the eland is the largest species of this animal in Africa",
    "Answer": "Antelope"
  },
  {
    "Category": "ANIMALS",
    "Question": "Heaviest of all poisonous snakes is this North American rattlesnake",
    "Answer": "the diamondback rattler"
  },
  {
    "Category": "SCIENCE",
    "Question": "2000 news: the Gunnison sage grouse isn't just another northern sage grouse, but a new one of this classification",
    "Answer": "species"
  },
  {
    "Category": "SCIENCE",
   

## Initialize Weaviate

In [138]:
load_dotenv()

openai_api_key = os.getenv("OPENAI_API_KEY")

ai_studio_api_key = os.getenv("AI-STUDIO-API-KEY")

waeivate_api_key = os.getenv("WEAVIATE_API_KEY")

huggingfacehub_api_token = os.getenv("HUGGINGFACE-API-KEY")

cohere_api_key = os.getenv("COHERE_APIKEY")

auth_config = weaviate.AuthApiKey(api_key=waeivate_api_key)

client = weaviate.Client(
    url="https://e2pxfwhqioinxijlmnqxw.c0.europe-west3.gcp.weaviate.cloud",
    auth_client_secret=auth_config,
    additional_headers={
        "X-Cohere-Api-Key": cohere_api_key
        #"X-Google-Studio-Api-Key": ai_studio_api_key
    }
)

In [139]:
# Check weaviate is running
json_data(client.get_meta())

{
  "hostname": "http://[::]:8080",
  "modules": {
    "backup-gcs": {
      "bucketName": "weaviate-wcs-prod-cust-europe-west3-workloads-backups",
      "rootName": "7b6a577d-61ff-4083-a29d-78a394c9d05f"
    },
    "generative-anyscale": {
      "documentationHref": "https://docs.anyscale.com/endpoints/overview",
      "name": "Generative Search - Anyscale"
    },
    "generative-aws": {
      "documentationHref": "https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html",
      "name": "Generative Search - AWS"
    },
    "generative-cohere": {
      "documentationHref": "https://docs.cohere.com/reference/chat",
      "name": "Generative Search - Cohere"
    },
    "generative-mistral": {
      "documentationHref": "https://docs.mistral.ai/api/",
      "name": "Generative Search - Mistral"
    },
    "generative-octoai": {
      "documentationHref": "https://octo.ai/docs/text-gen-solution/getting-started",
      "name": "Generative Search - OctoAI"
    },
    "generative-

In [140]:
# Delete the schema if it already exists
if client.schema.exists("Question"):
    client.schema.delete_class("Question")

In [141]:
# Create the schema
class_obj = {
    "class": "Question",
    "vectorizer": "text2vec-cohere",
}

In [142]:
# Use the object above to create the schema
client.schema.create_class(class_obj)

In [143]:
with client.batch.configure() as batch:
    for i, question in enumerate(data):
        print(f"Adding question {i} to the batch")
        # Add the data to the batch
        batch.add_data_object(
            data_object={
                "answer": question["Answer"],
                "question": question["Question"],
                "category": question["Category"]
            },
            class_name="Question",
        )

Adding question 0 to the batch
Adding question 1 to the batch
Adding question 2 to the batch
Adding question 3 to the batch
Adding question 4 to the batch
Adding question 5 to the batch
Adding question 6 to the batch
Adding question 7 to the batch
Adding question 8 to the batch
Adding question 9 to the batch


In [144]:
# Check how many object we have loaded
json_data(client.query.aggregate("Question").with_meta_count().do())

{
  "data": {
    "Aggregate": {
      "Question": [
        {
          "meta": {
            "count": 10
          }
        }
      ]
    }
  }
}


In [145]:
# Extract and show any 3 questions and answers
json_data(client.query.get("Question", ['question', 'answer']).with_limit(3).do())

{
  "data": {
    "Get": {
      "Question": [
        {
          "answer": "wire",
          "question": "A metal that is ductile can be pulled into this while cold & under pressure"
        },
        {
          "answer": "DNA",
          "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
        },
        {
          "answer": "Antelope",
          "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
        }
      ]
    }
  }
}


### Extract the Vector that represents each question

In [146]:
# Query to extract the vector for a question
result = client.query.get("Question", ['category', 'question', 'answer']).with_additional('vector').with_limit(1).do()

In [147]:
# Extract the category
result['data']['Get']['Question'][0]['category']

'SCIENCE'

In [148]:
# Extract the question
result['data']['Get']['Question'][0]['question']

'A metal that is ductile can be pulled into this while cold & under pressure'

In [149]:
# Extract the answer
result['data']['Get']['Question'][0]['answer']

'wire'

In [150]:
# Display vector representation
result['data']['Get']['Question'][0]['_additional']['vector']

[0.0035762787,
 0.020950317,
 0.012229919,
 0.029129028,
 -0.0501709,
 -0.04473877,
 -0.021438599,
 -0.013381958,
 -0.0107040405,
 0.03656006,
 -0.02116394,
 0.019134521,
 0.023361206,
 -0.0063591003,
 0.032165527,
 0.018722534,
 0.029891968,
 -7.9512596e-05,
 0.027359009,
 -0.0079193115,
 -0.010231018,
 0.032226562,
 0.041107178,
 -0.03161621,
 0.004135132,
 0.024124146,
 0.013069153,
 -0.01259613,
 0.06213379,
 -0.004711151,
 0.034606934,
 -0.030181885,
 0.004623413,
 0.026672363,
 -0.039886475,
 -0.05331421,
 0.015335083,
 -0.02557373,
 0.0028820038,
 -0.004058838,
 -0.0047035217,
 0.011306763,
 -0.016906738,
 0.015144348,
 -0.041137695,
 -0.019424438,
 0.045440674,
 0.03326416,
 0.04537964,
 0.03756714,
 0.006046295,
 0.00066804886,
 0.01348114,
 -0.003692627,
 0.010093689,
 0.0022354126,
 0.0061416626,
 0.028930664,
 0.03414917,
 0.018295288,
 0.026321411,
 0.012580872,
 -0.011985779,
 0.016448975,
 -0.03479004,
 -0.02104187,
 -0.026260376,
 0.093566895,
 0.042144775,
 0.06378174,

In [151]:
# How many numbers are in the Vector
def count_numbers(vector):
    """Count the number of elements in a vector"""
    count = 0
    for element in vector:
        if isinstance(element, (int, float)):
            count += 1
    return count

count_numbers(result['data']['Get']['Question'][0]['_additional']['vector'])

1024

In [152]:
len(vector:=result['data']['Get']['Question'][0]['_additional']['vector'])

1024

### Search for a relevant answer using vector search

In [164]:
# Vector search query to extract questions, answers and categories related to "Science" category
result = (
    client.query.get("Question", ['question', 'answer', 'category'])
    .with_near_text({"concepts": ["science"]})
    .with_limit(2)
    .do())

# Display the results
json_data(result, indent=4)

{
    "data": {
        "Get": {
            "Question": [
                {
                    "answer": "the atmosphere",
                    "category": "SCIENCE",
                    "question": "Changes in the tropospheric layer of this are what gives us weather"
                },
                {
                    "answer": "DNA",
                    "category": "SCIENCE",
                    "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
                }
            ]
        }
    }
}


In [165]:
# Write code to extract the distance between the query and returned vectors objects
result = (
    client.query.get("Question", ['question', 'answer', 'category'])
    .with_near_text({"concepts": ["science"]})
    .with_additional('distance')
    .with_limit(2)
    .do())

# Display the results
json_data(result, indent=4)

{
    "data": {
        "Get": {
            "Question": [
                {
                    "_additional": {
                        "distance": 0.47632474
                    },
                    "answer": "the atmosphere",
                    "category": "SCIENCE",
                    "question": "Changes in the tropospheric layer of this are what gives us weather"
                },
                {
                    "_additional": {
                        "distance": 0.49815953
                    },
                    "answer": "DNA",
                    "category": "SCIENCE",
                    "question": "In 1953 Watson & Crick built a model of the molecular structure of this, the gene-carrying substance"
                }
            ]
        }
    }
}


In [166]:
# Extract all questions and analyze them based on distance
result = (
    client.query.get("Question", ['question', 'answer', 'category'])
    .with_near_text({"concepts": ["animals"]})
    .with_additional('distance')
    .with_limit(10)
    .do()
    )

# Display the results
json_data(result, indent=4)

{
    "data": {
        "Get": {
            "Question": [
                {
                    "_additional": {
                        "distance": 0.46997482
                    },
                    "answer": "Elephant",
                    "category": "ANIMALS",
                    "question": "It's the only living mammal in the order Proboseidea"
                },
                {
                    "_additional": {
                        "distance": 0.47995377
                    },
                    "answer": "Antelope",
                    "category": "ANIMALS",
                    "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
                },
                {
                    "_additional": {
                        "distance": 0.5098027
                    },
                    "answer": "the nose or snout",
                    "category": "ANIMALS",
                    "question": "The gavial looks very much lik

In [167]:
# set a max distance threshold
max_distance = 0.60

# Extract all questions and analyze them based on distance
result = (
    client.query.get("Question", ['question', 'answer', 'category'])
    .with_near_text({"concepts": ["animals"],
                     "distance": max_distance})
    .with_additional('distance').with_limit(10)
    .do()
    )

json_data(result, indent=4)

{
    "data": {
        "Get": {
            "Question": [
                {
                    "_additional": {
                        "distance": 0.46997482
                    },
                    "answer": "Elephant",
                    "category": "ANIMALS",
                    "question": "It's the only living mammal in the order Proboseidea"
                },
                {
                    "_additional": {
                        "distance": 0.47995377
                    },
                    "answer": "Antelope",
                    "category": "ANIMALS",
                    "question": "Weighing around a ton, the eland is the largest species of this animal in Africa"
                },
                {
                    "_additional": {
                        "distance": 0.5098027
                    },
                    "answer": "the nose or snout",
                    "category": "ANIMALS",
                    "question": "The gavial looks very much lik