# Load Data with Vectors

## Get keys and URLs

In [None]:
import os
from dotenv import load_dotenv

# Load settings (typically from a local .env file) into the process environment.
load_dotenv()

# Only the endpoints are needed: the data ships with precomputed vectors,
# so no OpenAI API key (OPENAI_API_KEY) is read here.
weaviate_url = os.environ.get("WEAVIATE_URL")
openai_url = os.environ.get("OPENAI_URL")

# Echo both values so a missing setting (None) is visible straight away.
print(weaviate_url, openai_url)

## Connect to Weaviate

In [None]:
import weaviate

# Connect to the Weaviate instance named by WEAVIATE_URL.
# NOTE(review): `host` presumably expects a bare hostname (e.g. "localhost"),
# not a full URL with scheme/port - confirm against the .env value.
client = weaviate.connect_to_local(host=weaviate_url)

# Last expression of the cell, so the notebook displays the readiness flag.
client.is_ready()

In [None]:
from weaviate.classes.config import Configure

def create_wiki_collection():
    """Recreate the "Wiki" collection with a single OpenAI named vector.

    Any existing "Wiki" collection is deleted first, so calling this again
    starts from an empty collection. The collection gets one named vector,
    "main_vector", configured for the "text-embedding-3-small" model and
    pointed at ``openai_url``.
    """
    collections = client.collections

    # Drop a previous copy so the notebook can be re-run from a clean state.
    if collections.exists("Wiki"):
        collections.delete("Wiki")

    main_vector = Configure.NamedVectors.text2vec_openai(
        name="main_vector",
        model="text-embedding-3-small",
        base_url=openai_url,
        # source_properties is deliberately left unset: it is only needed if
        # data is added without providing vectors, e.g.
        # source_properties=['title', 'text']
    )

    collections.create(
        name="Wiki",
        vectorizer_config=[main_vector],
    )


create_wiki_collection()

## Load the data from parquet files

In [17]:
from datasets import load_dataset

def prepare_dataset():
    """Return the "train" split of the wiki sample dataset as a stream.

    The "openai-text-embedding-3-small" configuration of
    "weaviate/wiki-sample" is requested with ``streaming=True``, so rows are
    iterated lazily rather than loaded up front.
    """
    # Local alternative, reading parquet files from disk instead:
    # load_dataset('parquet', data_files={'train': ['../dataset/openai/*.parquet']}, split="train")
    wiki_stream = load_dataset(
        "weaviate/wiki-sample",
        "openai-text-embedding-3-small",
        split="train",
        streaming=True,
    )
    return wiki_stream

### Dataset Test
<!-- When loading from local files instead, the parquet files should be located in "../dataset/openai". -->

In [18]:
from tqdm import tqdm

# Smoke test: stream the dataset and print its first ten rows.
dt = prepare_dataset()

# Countdown of rows still to print; the loop stops when it reaches zero.
counter = 10
for item in tqdm(dt):
    print(item)
    counter -= 1
    if counter == 0:
        break

9it [00:01,  6.08it/s]

{'title': 'Unicode', 'text': "The Unicode Standard includes more than just the base code. Alongside the character encodings, the Consortium's official publication includes a wide variety of details about the scripts and how to display them: normalization rules, decomposition, collation, rendering, and bidirectional text display order for multilingual texts, and so on.", 'wiki_id': '20231101.simple_64846_4', 'url': 'https://simple.wikipedia.org/wiki/Unicode'}
{'title': 'Book of Genesis', 'text': 'The people of the world attempted to build a high tower (Tower of Babel) to show the power of mankind and to reach God. God felt insulted and gave people different languages to prevent the tower from ever being finished.', 'wiki_id': '20231101.simple_11278_4', 'url': 'https://simple.wikipedia.org/wiki/Book%20of%20Genesis'}
{'title': 'Rock Demers', 'text': 'Rock Demers,  (December 11, 1933 – August 17, 2021) was a Canadian movie producer.  He was the founder of the movie company Les Productions 




### The import function

`TODO:`
* add a function to add objects to batch

In [None]:
from tqdm import tqdm
from weaviate.util import generate_uuid5

def import_wiki_data(max_rows: int = 100_000) -> None:
    """Batch-import up to ``max_rows`` pre-vectorized rows into the "Wiki" collection.

    Rows are streamed from ``prepare_dataset()`` and queued on a fixed-size
    batch (2500 objects per request, 4 concurrent requests). Each object gets
    a deterministic UUID derived from its ``wiki_id`` and is stored with its
    vector under the named vector "main_vector".

    The import stops early once more than 10 batch errors have been reported.
    After the batch context closes, any failed objects are summarized.

    Args:
        max_rows: Maximum number of rows to read from the dataset. Values of
            zero or less import nothing.

    Note:
        The final "Imported N items" line reports how many objects were
        queued on the batch, not how many were stored successfully; check the
        failed-object summary for errors.
    """
    from itertools import islice

    print(f"Importing {max_rows} data items")

    dataset = prepare_dataset()
    wiki = client.collections.get("Wiki")

    # Bound the stream up front so that max_rows <= 0 imports nothing and the
    # progress bar ends exactly at its total.
    row_limit = max(max_rows, 0)

    counter = 0

    with wiki.batch.fixed_size(batch_size=2500, concurrent_requests=4) as batch:
        for item in tqdm(islice(dataset, row_limit), total=row_limit):

            data_to_insert = {
                "wiki_id": item["wiki_id"],
                "text": item["text"],
                "title": item["title"],
                "url": item["url"],
            }

            # Deterministic id: the same wiki_id always yields the same UUID.
            item_id = generate_uuid5(item["wiki_id"])

            # The key must match the named vector configured on the collection.
            # Assumes every row carries a precomputed "vector" field - TODO
            # confirm for the chosen dataset configuration.
            item_vector = {
                "main_vector": item["vector"]
            }

            batch.add_object(
                properties=data_to_insert,
                uuid=item_id,
                vector=item_vector,
            )

            # Count the object as soon as it is queued so the total stays
            # accurate even when the error check below aborts the loop.
            counter += 1

            # Check number of errors while running
            if batch.number_errors > 10:
                print(f"Reached {batch.number_errors} Errors during batch import")
                break

    # check for errors at the end
    failed_objects = wiki.batch.failed_objects
    if failed_objects:
        print("Final error check")
        print(f"Some errors {len(failed_objects)}")
        print(failed_objects[-1])

    print(f"Imported {counter} items")
    print("-----------------------------------")

In [None]:
# Run the import, reading at most 100,000 rows from the dataset.
import_wiki_data(max_rows=100_000)

## Check if data loaded correctly

In [None]:
# Get a handle to the "Wiki" collection; the next cell reuses `wiki`.
wiki = client.collections.get("Wiki")
# Last expression of the cell, so the notebook displays it - presumably the
# number of objects stored in the collection.
len(wiki)

In [None]:
# Fetch a single object, vector included, to spot-check the import.
res = wiki.query.fetch_objects(limit=1, include_vector=True)

first_object = res.objects[0]
print(first_object.properties)
print(first_object.vector)

## Close the client

In [None]:
# Close the Weaviate client now that the notebook has finished with it.
client.close()