Refresh credentials and load the Weaviate IP. 

If the credentials expire, re-run the code cell below to refresh them.

In [1]:
from helpers import update_creds

# Unpack the three values returned by update_creds(); the connect cell sends
# them to Weaviate as the X-AWS-* request headers.
AWS_ACCESS_KEY, AWS_SECRET_KEY, AWS_SESSION_TOKEN = update_creds()

# IPython magic: restore WEAVIATE_IP from the %store database.
# NOTE(review): assumes an earlier notebook/session ran `%store WEAVIATE_IP` — confirm.
%store -r WEAVIATE_IP

# Working with Weaviate

![Weaviate architecture](images/weaviate-explainer-architecture.png)

## Connect to a Weaviate instance

In [None]:
import weaviate
import os
from weaviate.classes.init import Auth

# Open the client connection used by every later cell in this notebook.
# The AWS credentials travel as request headers.
# NOTE(review): presumably these let Weaviate's text2vec_aws integration call
# Bedrock on our behalf — confirm against the Weaviate AWS integration docs.
client = weaviate.connect_to_local(
    # STUDENT TODO:
    # Add WEAVIATE_IP as the first positional argument
    # Add the AWS credentials as the headers argument like below:
    # headers ={
    #     "X-AWS-Access-Key": AWS_ACCESS_KEY,
    #     "X-AWS-Secret-Key": AWS_SECRET_KEY,
    #     "X-AWS-Session-Token": AWS_SESSION_TOKEN,
    # }
    # BEGIN_SOLUTION
    WEAVIATE_IP,
    headers = {
        "X-AWS-Access-Key": AWS_ACCESS_KEY,
        "X-AWS-Secret-Key": AWS_SECRET_KEY,
        "X-AWS-Session-Token": AWS_SESSION_TOKEN,
    }
    # END_SOLUTION
)

# Last expression in the cell, so the notebook displays its result.
client.is_ready()

## Create collection with vectorizer

Create a collection for financial articles with separate embeddings for title and content.

[Docs - Collection configuration](https://weaviate.io/developers/weaviate/manage-data/collections)

![Weaviate collections](images/weaviate-explainer-collections.png)

In [None]:
from weaviate.classes.config import Configure, Property, DataType

# Delete collection if it exists
# (keeps the cell re-runnable: each run recreates "FinancialArticles" from scratch,
# so the import cell has to be run again afterwards)
if client.collections.exists("FinancialArticles"):
    client.collections.delete("FinancialArticles")

# Create collection with named vectors for title and content
client.collections.create(
    name="FinancialArticles",
    # Define property schema
    properties=[
        Property(name="article_title", data_type=DataType.TEXT),
        # STUDENT TODO:
        # Add two additional properties - "article" and "url"
        # Both should be of data_type DataType.TEXT
        # BEGIN_SOLUTION
        Property(name="article", data_type=DataType.TEXT),
        Property(name="url", data_type=DataType.TEXT),
        # END_SOLUTION
    ],
    # Configure separate embeddings for title and article content
    # Later cells select these vectors by name ("title" / "content") through
    # include_vector and target_vector.
    # NOTE(review): region and model are hard-coded; assumes this Bedrock model is
    # enabled for the supplied credentials in us-west-2 — confirm.
    vector_config=[
        # Named vector "title": built from the article_title property only
        Configure.Vectors.text2vec_aws(
            name="title",
            source_properties=["article_title"],
            region="us-west-2",
            service="bedrock",
            model="amazon.titan-embed-text-v2:0"
        ),
        # STUDENT TODO:
        # Add a second vector configuration
        # Call it "content"
        # Use the source property "article"
        # using the same region, service, and model as above
        # BEGIN_SOLUTION
        # Named vector "content": built from the article property only
        Configure.Vectors.text2vec_aws(
            name="content",
            source_properties=["article"],
            region="us-west-2",
            service="bedrock",
            model="amazon.titan-embed-text-v2:0"
        )
        # END_SOLUTION
    ],
)

## Load financial articles data

Import the financial news dataset using batch processing for efficient loading.

In [None]:
import pandas as pd
from tqdm import tqdm
from weaviate.util import generate_uuid5

# Read the financial news articles from their parquet file into a DataFrame
dataset_path = "data/fin_news_articles_5000.parquet"
df = pd.read_parquet(dataset_path)

# Report the row count and announce the preview in a single call
print(f"Loaded {len(df)} articles", "Sample data:", sep="\n")

# Last expression in the cell: the notebook renders the first rows as a table
df.head()

In [None]:
# Get collection reference
# (`articles` is the handle reused by the import, error-check, verification and
# search cells that follow)
# STUDENT TODO:
# Use client.collections.use to get a reference to the "FinancialArticles" collection
# Assign it to the variable 'articles'
# BEGIN_SOLUTION
articles = client.collections.use("FinancialArticles")
# END_SOLUTION

In [None]:
# Stream the dataframe into the collection in fixed-size batches of 100 objects
with articles.batch.fixed_size(batch_size=100) as batch:
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # Look the URL up once; falls back to "" when the row has no "url" entry
        url = row.get("url", "")

        # Properties to store, keyed by the collection's property names
        obj = {
            "article_title": row["article_title"],
            "article": row["article"],
            "url": url,
        }

        # Derive the UUID from title + URL so the same article always maps to
        # the same id (prevents duplicates)
        uuid = generate_uuid5(row["article_title"] + str(url))

        # STUDENT TODO:
        # Add the object to the batch using batch.add_object
        # Use the obj dictionary and the generated uuid
        # BEGIN_SOLUTION
        batch.add_object(properties=obj, uuid=uuid)
        # END_SOLUTION

        # Check for errors during import
        # STUDENT TODO:
        # If batch.number_errors exceeds 10
        # print "Too many errors during import" and `break` the loop
        # BEGIN_SOLUTION
        if batch.number_errors > 10:
            print("Too many errors during import")
            break
        # END_SOLUTION

# Printed after the batch context has exited, whether or not the loop stopped early
print(f"\nImported {len(articles)} articles")

## Check for import errors

In [None]:
# Read the failed-object list once and reuse it for the count and the preview
failed = articles.batch.failed_objects

if len(failed) == 0:
    print("Import completed successfully with no errors")
else:
    print(f"Import completed with {len(failed)} errors")
    # Show at most the first 5 failures
    for err in failed[:5]:
        print(err)

## Verify the data

Check that articles were imported correctly with embeddings.

In [None]:
# Check collection size
# (the same len(articles) call the import cell uses for its summary line)
# STUDENT TODO:
# Use the built-in len() function on the 'articles' collection to get the total number of articles
# BEGIN_SOLUTION
print(f"Total articles in collection: {len(articles)}")
# END_SOLUTION

In [None]:
# View sample data
# STUDENT TODO:
# Fetch 3 sample objects from the 'articles' collection and assign to 'response'
# using articles.query.fetch_objects, with a limit of 3
# START_SOLUTION
response = articles.query.fetch_objects(limit=3)
# END_SOLUTION

# STUDENT TODO:
# For each item in response.objects,
# print the article title and a preview of the article content (first 100 characters)
# Hint: `.properties` attribute contains the object properties
# START_SOLUTION
# `.properties` is indexed with the property names defined when the collection
# was created ("article_title", "article").
for item in response.objects:
    print(f"Title: {item.properties['article_title']}")
    # "..." is appended unconditionally, even when the article is shorter than 100 characters
    print(f"Content preview: {item.properties['article'][:100]}...\n")
# END_SOLUTION

In [None]:
# Verify embeddings were created
# Fetches two objects together with both named vectors and prints, per article,
# each vector's length and its first five components.
response = articles.query.fetch_objects(
    limit=2,
    # STUDENT TODO:
    # Add the include_vector argument to fetch the vectors for both "title" and "content"
    # START_SOLUTION
    include_vector=["title", "content"]
    # END_SOLUTION
)

# item.vector is indexed by the named-vector names requested above
for item in response.objects:
    print(f"Article: {item.properties['article_title']}")
    # STUDENT TODO:
    # Print the dimensions of the title and content vectors
    # And the first 5 dimensions of each vector
    # START_SOLUTION
    print(f"Title vector dimension: {len(item.vector['title'])}")
    print(f"Title vector: {item.vector['title'][:5]}")
    print(f"Content vector dimension: {len(item.vector['content'])}")
    # The trailing newline belongs on the last line printed for an article, so
    # the blank line separates articles instead of splitting the content output.
    print(f"Content vector: {item.vector['content'][:5]}\n")
    # END_SOLUTION

## Test basic search

Quick test to verify the collection works with semantic search.

In [None]:
# Try semantic search on titles
# STUDENT TODO:
# Use `.query.near_text` to search for articles related to "technology earnings"
# Supply the target_vector argument as "title"
# Limit results to 3
# START_SOLUTION
# target_vector="title" selects the named vector that the collection builds
# from the article_title property.
response = articles.query.near_text(
    query="technology earnings",
    target_vector="title",
    limit=3
)
# END_SOLUTION

# List only the titles of the returned objects
print("Search results for 'technology earnings':")
for item in response.objects:
    print(f"- {item.properties['article_title']}")

## Close the client

Always close your connection when finished.

In [None]:
# STUDENT TODO:
# Close the Weaviate client connection using client.close()
# START_SOLUTION
# Closes the connection opened by weaviate.connect_to_local in the connect cell
client.close()
# END_SOLUTION
# END_SOLUTION