# Compression – Load Data and compress vectors

## Get keys and urls

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one is present) into the process
# environment so that the os.getenv() lookups below can find them.
load_dotenv()

# Credentials and endpoint are read from the environment; each is None when
# the corresponding variable is not set.
WEAVIATE_KEY = os.getenv("WEAVIATE_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
OPENAI_URL = os.getenv("OPENAI_URL")


def mask_secret(value, visible=4):
    """Return a redacted preview of a secret that is safe to keep in cell output.

    Args:
        value: The secret string, or None/empty when it is not configured.
        visible: How many leading characters to reveal.

    Returns:
        "<not set>" when the value is missing, "***" when the value is too
        short to reveal any prefix without exposing all of it, otherwise the
        first `visible` characters followed by "***".
    """
    if not value:
        return "<not set>"
    if len(value) <= visible:
        return "***"
    return f"{value[:visible]}***"


# Notebook outputs are saved and shared with the file, so never print a full
# key. A masked prefix is enough to confirm that the right value was loaded,
# and mask_secret() also avoids a TypeError when a variable is missing.
print(f"Weaviate Key: {mask_secret(WEAVIATE_KEY)}")
print(f"OpenAI API Key: {mask_secret(OPENAI_API_KEY)}")
print(f"OpenAI URL: {OPENAI_URL}")

## Connect to Weaviate

In [None]:
import weaviate
from weaviate.classes.init import Auth

# Build the credentials and extra request headers first, then open the
# connection to the locally running Weaviate instance.
weaviate_auth = Auth.api_key(WEAVIATE_KEY)
request_headers = {"X-OpenAI-Api-Key": OPENAI_API_KEY}

client = weaviate.connect_to_local(
    host="127.0.0.1",  # address of the learner's instance
    port=8080,
    grpc_port=50051,
    auth_credentials=weaviate_auth,
    headers=request_headers,
)

# Report whether the instance answers as ready.
print(client.is_ready())

## Create Collection with PQ configuration

[Docs: Product Quantization (PQ)](https://weaviate.io/developers/weaviate/configuration/compression/pq-compression)

> Note: Product Quantization includes a training phase, which is required to create codebooks (a codebook holds the centroids that vector segments are mapped to when they are compressed).<br/>
> In other words, based on your data, it figures out how to best compress your vectors.
>
> The compression training starts when the collection reaches `training_limit` number of objects.<br/>
> Before that, the vectors remain uncompressed, and search happens on uncompressed vectors.

In [None]:
from weaviate.classes.config import Configure

# Remove any previous "WikiQ" collection before it is defined again below.
client.collections.delete("WikiQ")

# Product Quantization settings.
pq_quantizer = Configure.VectorIndex.Quantizer.pq(
    segments=256,  # 1536/6 - new number of dimension segments
    training_limit=10_000,  # (default 100k) number of objects needed to train the codebook
)

# HNSW vector index that compresses its vectors with the PQ settings above.
hnsw_with_pq = Configure.VectorIndex.hnsw(quantizer=pq_quantizer)

# Named vector produced by the OpenAI text2vec vectorizer from the
# 'title' and 'text' properties.
main_vector = Configure.Vectors.text2vec_openai(
    name="main_vector",
    model="text-embedding-3-small",
    base_url=OPENAI_URL,
    source_properties=['title', 'text'],
    vector_index_config=hnsw_with_pq,
)

# Create the collection with that single named vector.
client.collections.create(
    name="WikiQ",
    vector_config=[main_vector],
)

## The rest is the same

In [None]:
# Populate the "WikiQ" collection using the project's data_loader helper.
# NOTE(review): the third argument (25000) is presumably the number of objects
# to import - confirm against data_loader. If so, it exceeds the PQ
# training_limit of 10_000 configured for this collection.
from data_loader import import_wiki_data
import_wiki_data(client, "WikiQ", 25000)

In [None]:
# Get a handle to the "WikiQ" collection and run an aggregation over all of
# its objects; as the last expression in the cell, the result is displayed.
# NOTE(review): with no arguments this is expected to report the total object
# count - confirm against the Weaviate client docs.
WikiQ = client.collections.get("WikiQ")
WikiQ.aggregate.over_all()

## Clean up

In [None]:
# client.collections.delete("WikiQ")

## Close the client

In [None]:
# Close the Weaviate client connection now that the notebook is finished.
client.close()