# Introduction to Weaviate - Demo


## Setup

<a target="_blank" href="https://colab.research.google.com/github/weaviate-tutorials/intro-workshop/blob/main/1_weaviate_examples.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

⬇️ This is just a bunch of helper/predefined functions :)

In [None]:
# ============================================================
# ===== HELPER FUNCTIONS FOR THE SEARCH DEMO =====
# ============================================================
def truncate_item(item_in, trunc_len=100):
    """Return the string form of ``item_in``, cut to ``trunc_len`` characters.

    An ellipsis ("...") is appended only when characters were actually
    removed. The length check is made on ``str(item_in)`` rather than on
    ``item_in`` itself, so values without ``len()`` (ints, floats, ``None``)
    and containers whose element count differs from their printed length
    (lists, dicts) are handled correctly.

    Args:
        item_in: Any object; it is converted with ``str()``.
        trunc_len: Maximum number of characters to keep before the ellipsis.

    Returns:
        The (possibly truncated) string.
    """
    text = str(item_in)
    if len(text) <= trunc_len:
        return text
    return text[:trunc_len] + "..."

def getprint(weaviate_result, truncate=True):
    """Print every object found under ``weaviate_result["data"]["Get"]``.

    For each key in the "Get" mapping a banner line is printed, followed by
    one ``property: value`` line per property of each object and a blank
    separator after each object.

    Args:
        weaviate_result: Response dict with a ``["data"]["Get"]`` mapping
            from names to lists of property dicts.
        truncate: When true, each value is passed through ``truncate_item``
            before printing.
    """
    get_section = weaviate_result["data"]["Get"]
    for class_name in get_section:
        print(f"========== {class_name} Results: ==========")
        for obj in get_section[class_name]:
            for prop_name, prop_value in obj.items():
                shown = truncate_item(prop_value) if truncate else prop_value
                print(f"{prop_name}: {shown}")
            print("\n")

# ============================================================
# ===== HELPER FUNCTIONS FOR THE 2D EMBEDDINGS DEMO =====
# ============================================================
import openai, os
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px
from sklearn.decomposition import PCA
import pandas as pd

# Read the OpenAI API key from the environment and hand it to the openai module.
# os.environ[...] raises KeyError if OPENAI_APIKEY is not set.
openai_key = os.environ["OPENAI_APIKEY"]
openai.api_key = openai_key


def get_emb(sent_inputs):
    """Request embeddings for ``sent_inputs`` from the OpenAI embeddings API.

    Uses the "text-embedding-ada-002" model and returns the response object
    exactly as produced by ``openai.Embedding.create``.
    """
    return openai.Embedding.create(
        model="text-embedding-ada-002",
        input=sent_inputs,
    )


def plot_embs(df_in):
    """Build a 2D scatter plot of embeddings from ``df_in``.

    Args:
        df_in: DataFrame with "PC1" and "PC2" columns (plot coordinates),
            a "category" column (point color) and a "sentence" column
            (shown on hover).

    Returns:
        The Plotly figure, with tight margins and enlarged markers.
    """
    fig = px.scatter(
        df_in,
        template="ggplot2",
        x="PC1",
        y="PC2",
        color="category",
        # Pass a list (as build_chart does) instead of a bare string, so the
        # call does not depend on plotly accepting a single column name here.
        hover_data=["sentence"],
    )
    fig.update_layout(margin=dict(l=20, r=20, b=20, t=20, pad=4))
    fig.update_traces(marker_size=20)
    return fig


def plot_vectors(arr_in, sent_inputs_in=None):
    """Reduce embeddings to two PCA components and plot them.

    Args:
        arr_in: Array of embeddings, one row per sentence.
        sent_inputs_in: Sentences matching the rows of ``arr_in``. When
            omitted, the notebook-level ``sent_inputs`` list is used, which
            is what the function always did before this parameter existed.
            Pass it explicitly when plotting the extended array/sentence
            pair returned by ``add_new_emb``.

    Returns:
        The figure produced by ``plot_embs``.
    """
    if sent_inputs_in is None:
        # Backward-compatible fallback to the global defined in the notebook.
        sent_inputs_in = sent_inputs

    coords = PCA(n_components=2).fit_transform(arr_in)

    df = pd.DataFrame(coords, columns=["PC1", "PC2"])
    df["sentence"] = sent_inputs_in
    # Label-based slices are inclusive: rows 0-4 are tagged "cats" and rows
    # 5-9 "dogs"; any further rows keep the "other" category.
    df["category"] = "other"
    df.loc[:4, "category"] = "cats"
    df.loc[5:9, "category"] = "dogs"
    return plot_embs(df)


def add_new_emb(sents_in, arr_in, sent_inputs_in):
    """Embed new sentences and append them to an existing embedding set.

    Args:
        sents_in: List of new sentences to embed via ``get_emb``.
        arr_in: Existing embeddings array; new embeddings are stacked below it.
        sent_inputs_in: List of sentences corresponding to ``arr_in``.

    Returns:
        A ``(array, sentences)`` tuple: the array with one extra row per
        returned embedding, and ``sent_inputs_in + sents_in``. The inputs
        are not modified in place.
    """
    resp = get_emb(sents_in)
    new_rows = [d["embedding"] for d in resp["data"]]
    if new_rows:
        # Stack once instead of once per embedding, so the existing array
        # is copied a single time rather than on every iteration.
        arr_in = np.vstack([arr_in, *new_rows])
    sent_inputs_in = sent_inputs_in + sents_in
    return arr_in, sent_inputs_in


# ============================================================
# ===== HELPER FUNCTIONS FOR THE 3D PLOTLY DEMO =====
# ============================================================
def preproc_data(csv_path='data/colors.csv', default_size=1):
    """Load the colors CSV and add the columns used by the 3D chart.

    Args:
        csv_path: Location of the CSV file. The file is read with the column
            names 'simple_name', 'name', 'hex', 'r', 'g', 'b' supplied here,
            so every line in it is treated as data.
        default_size: Value written to the 'size' column of every row.

    Returns:
        DataFrame with the six CSV columns plus:
        'rgb' - an "rgb(r, g, b)" string built from the r/g/b columns;
        'category' - the part of 'simple_name' after its last underscore;
        'size' - ``default_size``.
    """
    df = pd.read_csv(csv_path, names=['simple_name', 'name', 'hex', 'r', 'g', 'b'])

    # Build the color string column-wise instead of with a row-wise apply.
    df['rgb'] = (
        'rgb(' + df['r'].astype(str)
        + ', ' + df['g'].astype(str)
        + ', ' + df['b'].astype(str) + ')'
    )

    # Get top 'basic' color names: text after the last underscore.
    df = df.assign(category=df['simple_name'].str.rsplit('_', n=1).str[-1])

    # Set default size attribute
    df['size'] = default_size
    return df


def build_chart(df_in):
    """Build a 3D scatter plot from ``df_in``.

    Points are positioned by the 'r', 'g' and 'b' columns, sized by the
    'size' column and colored per 'simple_name' value, with the 'rgb'
    column supplied as the discrete color sequence. The 'name' column is
    added to the hover data. The legend is hidden and margins are tightened.

    Returns:
        The Plotly figure.
    """
    chart = px.scatter_3d(
        df_in,
        x='r',
        y='g',
        z='b',
        color=df_in['simple_name'],
        color_discrete_sequence=df_in['rgb'],
        size='size',
        hover_data=['name'],
        template='plotly_white',
    )
    chart.update_layout(
        margin={'l': 5, 'r': 5, 't': 20, 'b': 5},
        showlegend=False,
    )
    return chart

In [None]:
import weaviate
import os
import json

# Connect to the Weaviate instance at the URL below using an API key.
client = weaviate.Client(
    "https://edu-demo.weaviate.network",
    auth_client_secret=weaviate.AuthApiKey(api_key="learn-weaviate"),  # Note: Read-only key
    additional_headers={  # Keys are read from environment variables; an unset variable raises KeyError
        "X-OpenAI-Api-Key": os.environ["OPENAI_APIKEY"],
        "X-Cohere-Api-Key": os.environ["COHERE_APIKEY"]
    }
)

### Semantic search around the world

In [None]:
# WikiCity dataset: aggregate query requesting the meta count for the class
client.query.aggregate("WikiCity").with_meta_count().do()

Search can be tricky. 

In [None]:
# Fetch only the "city_name" property, capped at 50 objects, and show the raw list.
res = (
    client.query
    .get("WikiCity", ["city_name"])
    .with_limit(50)
    .do()
)
display(res["data"]["Get"]["WikiCity"])

How would you search through data like this by common themes? 🤔

With vector search, you can search by "concepts" (i.e. by meaning) rather than by exact keywords.

In [None]:
# Near-text search over WikiCity for the given concept, limited to 5 results.
res = (
    client.query
    .get("WikiCity", ["city_name", "wiki_summary"])
    .with_near_text({"concepts": ["Major European city"]})
    .with_limit(5)
    .do()
)

getprint(res)

In [None]:
# Near-text search over WikiArticle for the given concept; only the top result is requested.
res = (
    client.query
    .get("WikiArticle", ["title", "wiki_summary"])
    .with_near_text({"concepts": ["Formula 1 driver"]})
    .with_limit(1)
    .do()
)

getprint(res, truncate=True)

## How does this work?

## Visual demo - vector embeddings

In [None]:
# Ten example sentences. Order matters: plot_vectors tags rows 0-4 as "cats"
# and rows 5-9 as "dogs" purely by position.
sent_inputs = [
    # Cat-related sentences
    "The Bengal showed off its striking coat pattern.",
    "A lion's powerful roar echoed through the plains.",
    "A leopard's spots provided perfect camouflage in the dappled light.",
    "A cheetah's unmatched speed allowed it to outrun its prey.",
    "The Sphynx basked in the warmth of its owner's lap.",
    # Dog-related sentences    
    "The golden retriever chased after the frisbee.",
    "The playful puppy rolled in the grass.",
    "A loyal companion is always by your side.",
    "The Labrador retriever enjoyed playing in the water.",
    "The family adopted a furry friend from the shelter."    
]

resp = get_emb(sent_inputs)  # Helper function to get "embeddings"
arr = np.array([i["embedding"] for i in resp["data"]])  # Collect each returned embedding into one array

In [None]:
# Reduce the embeddings to two PCA components and display the scatter plot.
fig = plot_vectors(arr)
fig.show()

Similar sentences were *magically* grouped together - this is the power of "vector embeddings", capturing meaning in a bunch of numbers.

### Conceptually - it's similar to this:

You can embed colors into numbers (e.g. RGB) like: ⬇️

In [None]:
# Load the color table and display it as a 3D scatter plot over the r/g/b axes.
df = preproc_data()
colors_fig = build_chart(df)
colors_fig.show()

Modern deep learning models can do the same with text, images, audio, and more!

In [None]:
# Show the sentence-embedding figure again (`fig` was created by plot_vectors above).
fig.show()

## Weaviate helps you leverage these

## At scale!

### Semantic search

In [None]:
# Near-text search over JeopardyQuestion for the given concept, limited to 5 results.
res = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_near_text({"concepts": ["around the world"]})
    .with_limit(5)
    .do()
)

getprint(res, truncate=False)

### Keyword search

In [None]:
# BM25 keyword search for "peninsula", restricted to the "answer" property.
res = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_bm25(query="peninsula", properties=["answer"])
    .with_limit(3)
    .do()
)

getprint(res, truncate=False)

(You can also combine these into "hybrid" searches!)

### Filtering

In [None]:
# Near-text search combined with a "Like" filter on the "question" property.
city_filter = {
    "path": ["question"],
    "operator": "Like",
    "valueText": "*city*"
}
res = (
    client.query
    .get("JeopardyQuestion", ["question", "answer"])
    .with_near_text({"concepts": ["around the world"]})
    .with_where(city_filter)
    .with_limit(3)
    .do()
)

getprint(res, truncate=False)

### Beyond simple retrieval

With Weaviate, you can do more than just **retrieve** data. 

Weaviate + modern AI tools → **dynamic** data.

### Search + Generative model

Search + `generative-openai` module → **magic**

Transform information like:

In [None]:
# Near-text search plus a per-object generative prompt; {wiki_summary} is a
# placeholder inside the prompt string passed to with_generate.
res = (
    client.query
    .get("WikiCity", ["city_name", "wiki_summary"])
    .with_near_text({"concepts": ["Popular European tourist destination"]})
    .with_limit(5)
    .with_generate(
        single_prompt="Write a tweet with a potentially surprising fact from {wiki_summary}"
    )
    .do()
)

In [None]:
# Print the generated text stored under _additional.generate.singleResult for each returned object.
for wa in res["data"]["Get"]["WikiCity"]:
    print(wa["_additional"]["generate"]["singleResult"], "\n")

In [None]:
# Near-text search plus a grouped generative task over the 3 returned objects.
res = (
    client.query
    .get("WikiCity", ["city_name", "wiki_summary"])
    .with_near_text({"concepts": ["Popular European tourist destination"]})
    .with_limit(3)
    .with_generate(
        grouped_task="Write a short 2-day travel plan to visit these destinations, to see a few of these landmarks shown in these passages:"
    )
    .do()
)

# The grouped result is read from the first returned object.
print(res["data"]["Get"]["WikiCity"][0]["_additional"]["generate"]["groupedResult"])

## Weaviate empowers you...

To do all of these things and more, *at scale*.

Easily handle tens of millions, or even hundreds of millions, of objects.