[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/weaviate/recipes/blob/main/weaviate-features/reranking/cohere-ranking/simple-reranking/cohere-ranking.ipynb)

## Dependencies

In [None]:
# Install the Weaviate Python client, which the cells below import as `weaviate`.
!pip install weaviate-client

## Configuration

In [1]:
import os

import weaviate

# The Cohere API key is read from the COHERE_API_KEY environment variable;
# set it before running this cell (a missing variable raises KeyError).
cohere_key = os.environ["COHERE_API_KEY"]

# Extra request header that carries the Cohere key; shared by both options below
cohere_headers = {"X-COHERE-Api-Key": cohere_key}

# Option 1
# Connect to your local Weaviate instance deployed with Docker
client = weaviate.connect_to_local(headers=cohere_headers)

# Option 2
# Connect to your Weaviate Cloud Services (WCS) cluster
# client = weaviate.connect_to_wcs(
#     cluster_id="WCS-CLUSTER-ID",  # Replace with your WCS cluster ID
#     auth_credentials=weaviate.AuthApiKey(
#         api_key="WCS-API-KEY"  # Replace with your WCS API key
#     ),
#     headers=cohere_headers,
# )

# Readiness check; as the last expression of the cell its result is displayed
client.is_ready()

embedded weaviate is already listening on port 8079


## Create a collection
> A collection stores your data and vector embeddings.

In [None]:
# Note: in practice, you shouldn't rerun this cell, as it deletes your data
# in "BlogPost", and then you need to re-import it again.
import weaviate.classes.config as wc

# Start from a clean slate: drop the collection if it already exists
if client.collections.exists("BlogPost"):
    client.collections.delete("BlogPost")

client.collections.create(
    name="BlogPost",

    # Vectorizer and model used to embed the objects
    vectorizer_config=wc.Configure.Vectorizer.text2vec_cohere(
        model="embed-multilingual-v2.0",  # defaults to embed-multilingual-v2.0 if not set
        truncate="RIGHT",  # defaults to RIGHT if not set
    ),

    vector_index_config=wc.Configure.VectorIndex.hnsw(
        distance_metric=wc.VectorDistances.COSINE
    ),

    # Configure the reranker here
    reranker_config=wc.Configure.Reranker.cohere(
        model="rerank-multilingual-v2.0"
    ),

    # Defining properties (data schema) is optional, but when they are given
    # the names must match the keys of the objects that get uploaded
    # ("content" and "source") and the property the rerank query points at.
    properties=[
        wc.Property(name="content", data_type=wc.DataType.TEXT),
        wc.Property(name="source", data_type=wc.DataType.TEXT),
    ],
)

print("Successfully created collection: BlogPost.")

## Upload Data

In [6]:
# Paths of the blog posts to import, relative to the current working directory.
# Named `blog_paths` (not `blogs`) because `blogs` is later bound to the
# Weaviate collection object.
blog_paths = ['./ranking-models.mdx', './ref2vec-centroid.mdx']

# Maps each file path to the full text of that file
data = {}

# Loop through each file path and read the file
for path in blog_paths:
    # Explicit encoding so the result does not depend on the platform's
    # default locale (the .mdx sources are expected to be UTF-8).
    with open(path, 'r', encoding='utf-8') as file:
        data[path] = file.read()

Manually split each document into fixed-size chunks of 500 characters. Because the split ignores word and sentence boundaries, the resulting chunks are a bit messy; this can be improved by using an external tool such as LlamaIndex, Haystack, or LangChain.

In [7]:
# Get a collection object for "BlogPost"
blogs = client.collections.get("BlogPost")

CHUNK_SIZE = 500  # characters per chunk
BATCH_SIZE = 100  # objects sent to Weaviate per insert_many call

# Cut every document into consecutive CHUNK_SIZE-character slices and
# remember which file each slice came from.
chunks = [
    {
        "source": source,
        "content": text[start:start + CHUNK_SIZE],
    }
    for source, text in data.items()
    for start in range(0, len(text), CHUNK_SIZE)
]

# Insert the chunks into Weaviate in batches of BATCH_SIZE.
# Nothing is sent when there are no chunks at all.
for start in range(0, len(chunks), BATCH_SIZE):
    result = blogs.data.insert_many(chunks[start:start + BATCH_SIZE])

    # insert_many reports per-object failures in its return value instead of
    # raising, so surface them rather than dropping them silently.
    if result.errors:
        print(f"{len(result.errors)} object(s) failed to import:", result.errors)

## Query

##### Query without reranking

In [None]:
import json

# The collection object from the previous cell could be reused; it is
# fetched again here so that this cell can be run on its own.
blogs = client.collections.get("BlogPost")

# near_text search for the query text, limited to 5 results
response = blogs.query.near_text("Low hanging fruit to improve relevance", limit=5)

# Print each hit's UUID followed by its properties as indented JSON
for obj in response.objects:
    print("ID:", obj.uuid)
    pretty_properties = json.dumps(obj.properties, indent=2)
    print("Data:", pretty_properties, "\n")

##### The first few results from the above query aren't exactly what we're looking for. Let's run the query again, but this time rerank the top 5 results using the text in the `content` property.

##### Query with reranking

In [None]:
import json
import weaviate.classes.query as wq

# note, you can reuse the collection object from the previous cell.
# Get a collection object for "BlogPost"
blogs = client.collections.get("BlogPost")

# Same near_text search as before (limited to 5 results), but the results are
# additionally reranked on their "content" property against the rerank query.
response = blogs.query.near_text(
    "Low hanging fruit to improve relevance",
    limit=5,
    rerank=wq.Rerank(
        prop="content",
        query="Low hanging fruit"
    ),
    return_metadata=wq.MetadataQuery(score=True)
)

for item in response.objects:
    print("ID:", item.uuid)
    # Show the reranker's score so the effect of reranking on the ordering
    # is visible in the output.
    print("Rerank score:", item.metadata.rerank_score)
    print("Data:", json.dumps(item.properties, indent=2), "\n")