## Connect to a WCS instance of Weaviate
> Connecting to a WCS instance lets us see the number of vector dimensions in use.

* Make sure the `WORKSHOP_DEMO_URL` and `WORKSHOP_DEMO_KEY_ADMIN` environment variables are set to your cluster URL and your WCS API key.

In [1]:
import weaviate, os

# Connect to a cloud instance of Weaviate (with WCS).
# The credentials are read with os.environ[...] rather than os.getenv(...):
# a missing variable then raises a KeyError that names it right here, instead
# of handing None to the client and failing later with a less obvious error.
client = weaviate.connect_to_wcs(
    cluster_url=os.environ["WORKSHOP_DEMO_URL"],
    auth_credentials=weaviate.auth.AuthApiKey(os.environ["WORKSHOP_DEMO_KEY_ADMIN"]),
)

# Last expression of the cell, so the notebook displays the readiness flag.
client.is_ready()

True

## Adding PQ configuration to the Wikipedia example
In this step, we will use:
* named vectors ("text_vector")
* PQ configuration on `vector_index_config`

```
vector_index_config=Configure.VectorIndex.hnsw(
    quantizer=Configure.VectorIndex.Quantizer.pq(
        segments=128, # 768/6
    )
),
```

In [2]:
from weaviate.classes.config import Configure, Property, DataType

# Uncomment to drop the collection and start again from an empty one.
# client.collections.delete("WikipediaPQ")

# Creating a collection that already exists raises an error, which would break
# a "Restart Kernel -> Run All". Reuse the existing collection in that case.
if client.collections.exists("WikipediaPQ"):
    print("Collection already exists")
    wikipedia_pq = client.collections.get("WikipediaPQ")
else:
    # Create a collection here - with Cohere as a vectorizer
    wikipedia_pq = client.collections.create(
        name="WikipediaPQ",

        # Named Vector example – will be covered in another lesson
        vectorizer_config=[
            Configure.NamedVectors.text2vec_cohere(
                name="text_vector",
                model="embed-multilingual-v2.0",
                source_properties=["text"],

                # AutoPQ - this only works with Async Indexing
                vector_index_config=Configure.VectorIndex.hnsw(
                    quantizer=Configure.VectorIndex.Quantizer.pq(
                        segments=128, # 768/6
                    )
                ),
        )],

        generative_config=Configure.Generative.openai("gpt-4"),
    )

# Last expression of the cell, so the notebook displays the collection object.
wikipedia_pq

Collection already exists


<weaviate.collections.collection.Collection at 0x10570a540>

## The rest is the same

In [9]:
from datasets import load_dataset
from tqdm import tqdm

def import_wiki_data(lang, lang_id, max_rows, skip_rows=0):
    """Stream one language of the Cohere Wikipedia embeddings into "WikipediaPQ".

    Rows are read from the streaming dataset
    ``Cohere/wikipedia-22-12-{lang}-embeddings`` and inserted in batches. Each
    row's ``emb`` value is stored as the named vector ``"text_vector"``.

    Args:
        lang: Language code used to build the dataset name (e.g. "en"); also
            stored on every object as the ``lang`` property.
        lang_id: Value stored on every object as the ``lang_id`` property.
        max_rows: Absolute row position (int) at which the import stops. Rows
            ``skip_rows`` .. ``max_rows - 1`` of the stream are imported.
        skip_rows: Number of leading rows to skip, e.g. to resume an import.

    Returns:
        None. Progress, failures and the final row position are printed.
    """
    # Function-scope import keeps this cell self-contained.
    from itertools import islice

    print(f"Importing {max_rows} data items for {lang}")

    dataset = load_dataset(f"Cohere/wikipedia-22-12-{lang}-embeddings", split="train", streaming=True)
    dataset = dataset.skip(skip_rows)

    # Rows still to import. Bounding the stream up front (rather than breaking
    # after the insert) means nothing is inserted when max_rows is 0 or
    # skip_rows has already reached max_rows.
    remaining = max(max_rows - skip_rows, 0)

    # Absolute position in the stream, so it starts at the skipped offset.
    counter = skip_rows

    wikipedia = client.collections.get("WikipediaPQ")

    with wikipedia.batch.fixed_size(batch_size=5000, concurrent_requests=2) as batch:
        for item in tqdm(islice(dataset, remaining), initial=skip_rows, total=max_rows):
            batch.add_object(
                properties={
                    "text": item["text"],
                    "wiki_id": item["wiki_id"],
                    "title": item["title"],
                    "url": item["url"],
                    "views": item["views"],
                    "lang": lang,
                    "lang_id": lang_id,
                },
                # Named Vector example – will be covered in another lesson
                vector={"text_vector": item["emb"]},
            )
            counter += 1

    # check for errors at the end
    failed = wikipedia.batch.failed_objects
    if failed:
        print("Final error check")
        print(f"Some errors {len(failed)}")
        print(failed[-1])

    print(f"Imported {counter} items for {lang}")
    print("-----------------------------------")

In [None]:
# Row limit applied to every language below.
import_per_country = 10_000

# (language code, lang_id) pairs to import; uncomment an entry to include it.
languages = [
    ("en", 0),
    ("de", 1),
    ("fr", 2),
    # ("es", 3),
    # ("it", 4),
]

for lang_code, lang_id in languages:
    import_wiki_data(lang_code, lang_id, import_per_country, 0)

In [None]:
# Fetch a handle to the "WikipediaPQ" collection.
wikipedia = client.collections.get("WikipediaPQ")
# Aggregate over the whole collection. As the last expression of the cell, the
# result is displayed by the notebook.
# NOTE(review): presumably this reports the total object count — confirm.
wikipedia.aggregate.over_all()