# Load Data with Vectors

## Get keys and urls

In [None]:
import os
from dotenv import load_dotenv

# Load settings via python-dotenv before reading them from the environment.
load_dotenv()

# Connection settings for this notebook.
WEAVIATE_HTTP_URL = os.environ.get("WEAVIATE_URL")
# OPENAI_API_KEY is deliberately not read: the import uses precomputed vectors,
# so no OpenAI key is needed here.
OPENAI_URL = os.environ.get("OPENAI_URL")

# The collection name is derived from the user name.
USERNAME = os.environ.get("USERNAME")
WIKI_NAME = "{}_wiki".format(USERNAME)

print(WIKI_NAME)

## Connect to Weaviate

In [None]:
import weaviate

# No OpenAI API key header is sent: the vectors are already provided with the data.
# (If a key were ever needed, it would be passed as
#  headers={"X-OpenAI-Api-Key": OPENAI_API_KEY}.)
client = weaviate.connect_to_local(host=WEAVIATE_HTTP_URL)

# Last expression of the cell, so the notebook displays the readiness result.
client.is_ready()

In [None]:
from weaviate.classes.config import Configure

def create_wiki_collection():
    """Recreate the WIKI_NAME collection with a single named vector, "main_vector".

    If a collection called WIKI_NAME already exists it is deleted first, so
    each call starts from a freshly created collection.
    """
    collections = client.collections

    # Drop any previous collection of the same name.
    if collections.exists(WIKI_NAME):
        collections.delete(WIKI_NAME)

    # OpenAI vectorizer settings for the named vector.
    # source_properties (e.g. ['title', 'text']) is left out on purpose: it is
    # only needed if data is ever added without providing vectors.
    main_vector = Configure.NamedVectors.text2vec_openai(
        name="main_vector",
        model="text-embedding-3-small",
        base_url=OPENAI_URL,
    )

    collections.create(name=WIKI_NAME, vectorizer_config=[main_vector])


create_wiki_collection()

## Load the data from parquet files

In [None]:
from datasets import load_dataset

def prepare_parquet_dataset():
    """Load the "train" split from the local text-embedding-3-small parquet files.

    Returns:
        Whatever `load_dataset` returns for the "train" split of the files
        matched by the glob below.
    """
    # Alternative source (streamed from the hub instead of local files):
    # load_dataset("weaviate/wiki-sample", "openai-text-embedding-3-small", split="train", streaming=True)
    parquet_files = {'train': ['../wiki-data/openai/text-embedding-3-small/*.parquet']}
    return load_dataset('parquet', data_files=parquet_files, split="train")

### Dataset Test
The parquet files should be located in `../wiki-data/openai/text-embedding-3-small/` (relative to this notebook).

In [None]:
dataset = prepare_parquet_dataset()
print(f"The dataset contains {len(dataset)} objects")

# Preview only the first 10 rows.
for shown, row in enumerate(dataset, start=1):
    print(row)
    if shown == 10:
        break

### The import function

`TODO:`
* add a function to add objects to batch

In [None]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

def import_wiki_data(max_rows=10_000):
    """Import up to `max_rows` dataset rows, with their own vectors, into WIKI_NAME.

    Each row is queued with its properties (wiki_id, text, title, url), a
    deterministic UUID generated from its wiki_id, and its vector stored under
    the "main_vector" named vector. The import stops early once the batch
    reports more than 10 errors.

    Args:
        max_rows: Maximum number of rows to queue for import.
    """
    print(f"Importing {max_rows} data items")

    dataset = prepare_parquet_dataset()
    wiki = client.collections.get(WIKI_NAME)

    # Number of rows queued so far (not necessarily successfully stored).
    counter = 0

    with wiki.batch.fixed_size(batch_size=2000, concurrent_requests=2) as batch:
        for item in tqdm(dataset, total=max_rows):

            data_to_insert = {
                "wiki_id": item["wiki_id"],
                "text": item["text"],
                "title": item["title"],
                "url": item["url"],
            }

            # The same wiki_id always produces the same UUID.
            item_id = generate_uuid5(item["wiki_id"])

            # Keyed by the named vector defined on the collection.
            item_vector = {
                "main_vector": item["vector"]
            }

            # Queue the object together with its id and vector; calling
            # add_object() without arguments would drop all of the above.
            batch.add_object(
                properties=data_to_insert,
                uuid=item_id,
                vector=item_vector,
            )

            # Check number of errors while running
            if batch.number_errors > 10:
                print(f"Reached {batch.number_errors} Errors during batch import")
                break

            # stop after the request number reaches = max_rows
            counter += 1
            if counter >= max_rows:
                break

    # check for errors at the end
    failed_objects = wiki.batch.failed_objects
    if len(failed_objects) > 0:
        print("Final error check")
        print(f"Some errors {len(failed_objects)}")
        print(failed_objects[-1])

    print(f"Imported {counter} items")
    print("-----------------------------------")

In [None]:
# Run the import, stopping after at most 10,000 rows.
import_wiki_data(10_000)

## Check if data loaded correctly

In [None]:
# Get a handle to the collection and check its length after the import.
wiki = client.collections.get(WIKI_NAME)
# Last expression of the cell, so the notebook displays the value.
len(wiki)

In [None]:
# Fetch a single object together with its vector and print both parts.
res = wiki.query.fetch_objects(include_vector=True, limit=1)
first_object = res.objects[0]
print(first_object.properties)
print(first_object.vector)

## Close the client

In [None]:
# Close the Weaviate client now that the notebook is finished with it.
client.close()