# Load Data with Vectors

## Get keys and urls

In [11]:
import os
from dotenv import load_dotenv

load_dotenv()

WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_URL = os.getenv("OPENAI_URL")

print(f"Weaviate Key:{WEAVIATE_KEY}")
print(f"OpenAI API Key: {OPENAI_API_KEY}")
print(f"OpenAI URL: {OPENAI_URL}")



Weaviate Key:root-user-key
OpenAI API Key: sk-dummy-key-for-local-testing
OpenAI URL: http://host.docker.internal:11434


## Connect to Weaviate

In [12]:
import weaviate
from weaviate.classes.init import Auth

# Connect to the local instance
client = weaviate.connect_to_local(
  host="127.0.0.1", # the address to the learner's instance
  port=8080,
  grpc_port=50051,
  auth_credentials=Auth.api_key(WEAVIATE_KEY),
  headers={
    "X-OpenAI-Api-Key": OPENAI_API_KEY
  }
)

print(client.is_ready())

True


In [13]:
from weaviate.classes.config import Configure

def create_wiki_collection():
    if client.collections.exists("Wiki"):
        client.collections.delete("Wiki")

    # Create a collection here - with OpenAI vectorizer and define source properties
    client.collections.create(
        name="Wiki",

        vector_config=[
            Configure.Vectors.text2vec_ollama(
                name="main_vector",
                api_endpoint=OPENAI_URL,
                model="nomic-embed-text",
                source_properties=['title', 'text'] # which properties should be used to generate a vector
            )
        ],
    )

create_wiki_collection()

## Load the data from parquet files

In [14]:
from datasets import load_dataset

def prepare_dataset():
    return load_dataset('parquet', data_files={'train': ['../wiki-data/weaviate/nomic-embed-text/*.parquet']}, split="train", streaming=True)
    # return load_dataset("weaviate/wiki-sample", "weaviate-snowflake-arctic-v2", split="train", streaming=True)

### Dataset Test
<!-- The parquet files should be located in "datasets/openai". -->

In [15]:
dataset = prepare_dataset()

counter = 10
for i in dataset:
    print(i)

    counter -= 1
    if(counter == 0): break

### The import function

`TODO:`
* add a function to add objects to batch

In [16]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

def import_wiki_data(max_rows=10_000):
    print(f"Importing {max_rows} data items")

    dataset = prepare_dataset()
    wiki = client.collections.get("Wiki")

    counter = 0

    with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:
        for item in tqdm(dataset, total=max_rows):

            data_to_insert = {   
                "wiki_id": item["wiki_id"],
                "text": item["text"],
                "title": item["title"],
                "url": item["url"],
            }

            item_id = generate_uuid5(item["wiki_id"])

            # vector = item["vector"]
            item_vector = {
                "main_vector": item["vector"]
            }

            batch.add_object(
                properties=data_to_insert,
                
                uuid=item_id,
                vector=item_vector
            )

            # Check number of errors while running
            if(batch.number_errors > 10):
                print(f"Reached {batch.number_errors} Errors during batch import")
                break
            
            # stop after the request number reaches = max_rows
            counter += 1
            if(counter >= max_rows):
                break
    
    # check for errors at the end
    if (len(wiki.batch.failed_objects)>0):
        print("Final error check")
        print(f"Some errors {len(wiki.batch.failed_objects)}")
        print(wiki.batch.failed_objects[-1])
    
    print(f"Imported {counter} items")
    print("-----------------------------------")

In [17]:
import_wiki_data(10_000)

Importing 10000 data items


  0%|          | 0/10000 [00:00<?, ?it/s]

Imported 0 items
-----------------------------------





## Check if data loaded correctly

In [8]:
wiki = client.collections.get("Wiki")
len(wiki)

10000

In [9]:
res = wiki.query.fetch_objects(limit=1, include_vector=True)
print(res.objects[0].properties)
print(res.objects[0].vector)

{'text': "The Unicode Standard includes more than just the base code. Alongside the character encodings, the Consortium's official publication includes a wide variety of details about the scripts and how to display them: normalization rules, decomposition, collation, rendering, and bidirectional text display order for multilingual texts, and so on.", 'title': 'Unicode', 'wiki_id': '20231101.simple_64846_4', 'url': 'https://simple.wikipedia.org/wiki/Unicode'}
{'main_vector': [-0.0174560546875, 0.041229248046875, -0.050750732421875, 0.03729248046875, 0.03704833984375, -0.0025463104248046875, 0.037109375, 0.059783935546875, 0.0577392578125, -0.0246734619140625, 0.004825592041015625, -0.0219879150390625, 0.0221405029296875, -0.055633544921875, -0.04803466796875, -0.0160369873046875, 0.028045654296875, -0.07794189453125, -0.050048828125, -0.027923583984375, -0.06829833984375, -0.031524658203125, -0.0011138916015625, -0.006793975830078125, 0.048736572265625, -0.001667022705078125, 0.01010131

## Close the client

In [10]:
client.close()