In [1]:
import sys
!{sys.executable} -m pip install --upgrade pip
!{sys.executable} -m pip install mistralai pinecone-client

Collecting pip
  Using cached pip-24.1.2-py3-none-any.whl.metadata (3.6 kB)
Using cached pip-24.1.2-py3-none-any.whl (1.8 MB)
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.0
    Uninstalling pip-24.0:
      Successfully uninstalled pip-24.0
Successfully installed pip-24.1.2


In [4]:
from NDLAIUtils import Utils
import NDLAIUtils

# Fetch both API keys through the NDLAIUtils helper so no secret is written
# into the notebook itself.
utils = Utils()
PINECONE_API_KEY = utils.get_pinecone_api_key()  # passed to Pinecone(...) when the client is built
MISTRAL_API_KEY = utils.get_mistral_api_key()  # passed to MistralClient(...) when the client is built

# Create Index

In [11]:
from pinecone import Pinecone, ServerlessSpec

# Pinecone client authenticated with the key loaded earlier in the notebook.
pc = Pinecone(api_key=PINECONE_API_KEY)

# Name of the index this notebook creates (if needed) and then uses.
index_name = "mistral-embed"

# Create the index only when it does not exist yet, so re-running this cell
# reuses the existing index instead of attempting to create it again.
existing_index_names = pc.list_indexes().names()
index_is_missing = index_name not in existing_index_names
if index_is_missing:
    serverless_spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        name=index_name,
        dimension=1024,  # vector length for the index; should match the embedding model's output size
        metric="cosine",
        spec=serverless_spec,
    )

# Handle on the index, used by the upsert and query cells.
index = pc.Index(index_name)

# Embed & Upsert

In [12]:
# Sample corpus: short sentences that mention "apple" in different senses.
sample_texts = [
    "Apple is a popular fruit known for its sweetness and crisp texture.",
    "The tech company Apple is known for its innovative products like the iPhone.",
    "Many people enjoy eating apples as a healthy snack.",
    "Apple Inc. has revolutionized the tech industry with its sleek designs and user-friendly interfaces.",
    "An apple a day keeps the doctor away, as the saying goes.",
]

# Records to embed: each sentence gets a sequential id ("vec1", "vec2", ...)
# following its position in the list above.
data = [
    {"id": f"vec{position}", "text": sentence}
    for position, sentence in enumerate(sample_texts, start=1)
]

In [13]:
from mistralai.client import MistralClient

# Mistral API client authenticated with the key loaded earlier; embed() below
# sends its embedding requests through this object.
client = MistralClient(api_key=MISTRAL_API_KEY)


def embed(docs: list[str]) -> list:
    """Embed a batch of texts with the ``mistral-embed`` model.

    Args:
        docs: Texts to embed; they are sent to the API together in one request.

    Returns:
        The ``data`` list from the embeddings response. The entries are
        response objects, not bare float lists: callers read each vector
        through the entry's ``.embedding`` attribute. Presumably there is one
        entry per input text, in input order. An empty ``docs`` returns ``[]``
        without contacting the API.
    """
    # Nothing to embed: skip the network round trip entirely.
    if not docs:
        return []

    response = client.embeddings(
        model="mistral-embed",
        input=docs,
    )
    return response.data

# Gather the text of every record, then embed them all in one batched call.
doc_texts = [record["text"] for record in data]
embeddings = embed(doc_texts)

In [9]:
# Pull the raw vectors out of the embedding objects under a NEW name.
# Rebinding `embeddings` to its own projection made this cell fail when run a
# second time (the resulting float lists have no `.embedding` attribute).
embedding_values = [e.embedding for e in embeddings]

# zip() silently truncates to the shorter input, so make a count mismatch loud
# instead of upserting only part of the data.
assert len(embedding_values) == len(data), "expected one embedding per data record"

# One Pinecone record per document: id, vector, and the source text as metadata
# so query results can show the matched sentence.
vectors = [
    {
        "id": d["id"],
        "values": values,
        "metadata": {"text": d["text"]},
    }
    for d, values in zip(data, embedding_values)
]

# Left as the last expression so the cell displays the upsert response.
index.upsert(
    vectors=vectors,
    namespace="ns1"
)

{'upserted_count': 5}

# Query

In [10]:
# Natural-language question used to search the index.
query = "Tell me about the tech company known as Apple"

# embed() is given a single text, so take the first returned entry and read
# its vector.
query_embedding = embed([query])[0]
x = query_embedding.embedding

# Ask for the three closest matches in namespace "ns1", returning each match's
# metadata but not its stored vector values.
results = index.query(
    vector=x,
    top_k=3,
    namespace="ns1",
    include_metadata=True,
    include_values=False,
)

print(results)

{'matches': [{'id': 'vec2',
              'metadata': {'text': 'The tech company Apple is known for its '
                                   'innovative products like the iPhone.'},
              'score': 0.84495759,
              'values': []},
             {'id': 'vec4',
              'metadata': {'text': 'Apple Inc. has revolutionized the tech '
                                   'industry with its sleek designs and '
                                   'user-friendly interfaces.'},
              'score': 0.803735614,
              'values': []},
             {'id': 'vec1',
              'metadata': {'text': 'Apple is a popular fruit known for its '
                                   'sweetness and crisp texture.'},
              'score': 0.743400156,
              'values': []}],
 'namespace': 'ns1',
 'usage': {'read_units': 6}}
