# Compression – Load Data and compress vectors

## Get keys and urls

In [None]:
import os
from dotenv import load_dotenv

# Load settings via python-dotenv (typically from a local `.env` file) so
# the lookups below can find them in the process environment.
load_dotenv()

# Weaviate endpoints: one host for HTTP, one for gRPC.
WEAVIATE_HTTP_URL = os.environ.get("WEAVIATE_HTTP_URL")
WEAVIATE_GRPC_URL = os.environ.get("WEAVIATE_GRPC_URL")

# Azure OpenAI settings for the vectorizer.
AZURE_BASE_URL = os.environ.get("AZURE_BASE_URL")
AZURE_RESOURCE_NAME = os.environ.get("AZURE_RESOURCE_NAME")

# Prefix that keeps each participant's collection name unique.
USERNAME = os.environ.get("USERNAME")

# Echo the endpoint settings, one per line; a missing variable shows up as `None`.
print(
    WEAVIATE_HTTP_URL,
    WEAVIATE_GRPC_URL,
    AZURE_BASE_URL,
    AZURE_RESOURCE_NAME,
    sep="\n",
)

## Connect to Weaviate

In [None]:
import weaviate
# from weaviate.classes.init import AdditionalConfig, Timeout

# Connection settings: HTTP on port 80 and gRPC on port 50051,
# both with the `*_secure` flags turned off.
connection_settings = dict(
    http_host=WEAVIATE_HTTP_URL,
    http_port=80,
    http_secure=False,
    grpc_host=WEAVIATE_GRPC_URL,
    grpc_port=50051,
    grpc_secure=False,
)

client = weaviate.connect_to_custom(**connection_settings)

# Last expression of the cell, so the readiness check result is displayed.
client.is_ready()

## Create Collection with SQ configuration

[Docs: Scalar Quantization (SQ)](https://weaviate.io/developers/weaviate/configuration/compression/sq-compression)

> Note: Scalar Quantization includes a training phase, which is required to determine scalar bucket boundaries.<br/>
> In other words, based on your data, it figures out how to best compress your vectors.
>
> The compression training starts when the collection reaches `training_limit` number of objects.<br/>
> Before that, the vectors remain uncompressed, and search happens on uncompressed vectors.

In [None]:
from weaviate.classes.config import Configure

# Build the per-user collection name once and reuse it for delete + create.
collection_name = USERNAME + "_WikiQ"

# Remove the collection first so re-running this cell starts from scratch.
client.collections.delete(collection_name)

# Create the collection with Azure OpenAI as the vectorizer and SQ compression enabled
client.collections.create(
    name=collection_name,

    vectorizer_config=[
        # `deployment_id` and `resource_name` are Azure OpenAI settings, so the
        # Azure-specific helper is used here (the plain `text2vec_openai` helper
        # has no `deployment_id` parameter).
        Configure.NamedVectors.text2vec_azure_openai(
            name="main_vector",

            resource_name=AZURE_RESOURCE_NAME,
            deployment_id="text-embedding-3-small",
            base_url=AZURE_BASE_URL,

            # Only these properties are used to build the vector
            source_properties=['title', 'text'],

            # Configure SQ
            vector_index_config=Configure.VectorIndex.hnsw(
                quantizer=Configure.VectorIndex.Quantizer.sq(
                    rescore_limit=200,     # the number of overfetched candidates used for rescoring
                    training_limit=10_000  # (default 100k) number of objects needed to train the quantizer (bucket boundaries)
                )
            ),
        )
    ],
)

## Import data – the rest is the same as without compression

In [None]:
from data_loader import import_wiki_data

# Target collection and the value passed as the helper's third argument
# (presumably the number of objects to import – confirm in data_loader).
target_collection = USERNAME + "_WikiQ"
import_size = 25000

import_wiki_data(client, target_collection, import_size)

In [None]:
# Get a handle to the per-user collection, then display its overall aggregate.
wikiq_name = USERNAME + "_WikiQ"
WikiQ = client.collections.get(wikiq_name)

WikiQ.aggregate.over_all()

## Clean up

In [None]:
# Optional clean-up, intentionally left disabled:
# uncomment the line below to delete this user's collection.
# client.collections.delete(USERNAME+"_WikiQ")

## Close the client

In [None]:
# Close the connection opened by `weaviate.connect_to_custom(...)` once the notebook is done.
client.close()