In [None]:
%pip install llama-index-embeddings-openai
%pip install llama-index-vector-stores-pinecone
%pip install llama-index-llms-openai

In [None]:
# pip install llama-index
# pip -q install python-dotenv pinecone-client llama-index pymupdf

# Setup

In [None]:
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv
import fitz
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext

In [None]:
# Load environment variables from the local `secrets.env` file; the next cell
# reads PINECONE_API_KEY and PINECONE_ENVIRONMENT from os.environ.
load_dotenv(dotenv_path='secrets.env')

In [None]:
# Read Pinecone credentials from the environment; a missing variable raises
# KeyError right here rather than failing later inside the client.
api_key = os.environ['PINECONE_API_KEY']
environment = os.environ['PINECONE_ENVIRONMENT']
# Do not print the API key: notebook outputs are saved alongside the code.
# NOTE(review): this rebinds the name `pinecone`, shadowing the `pinecone`
# module imported in the imports cell -- from here on it is the client object.
# NOTE(review): the index is created below with a ServerlessSpec (cloud/region);
# confirm whether `environment` is still used by this client version.
pinecone = Pinecone(api_key=api_key, environment=environment)
index_name = "llamaindex-rag-fs"

In [None]:
# Create the serverless index only if it does not exist yet, then open a
# handle to it. dimension=1536 is sized for text-embedding-ada-002 vectors.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name,
        dimension=1536,
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
pinecone_index = pinecone.Index(index_name)

## Create PineconeVectorStore

In [None]:
from llama_index.vector_stores.pinecone import PineconeVectorStore
# Wrap the raw Pinecone index handle in LlamaIndex's vector-store adapter;
# later cells add nodes to it and build a VectorStoreIndex on top of it.
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

## Load Data

In [None]:
# Open the source PDF with fitz; the document is iterated page by page in the
# chunking cell further down.
# NOTE(review): `doc` is never closed in the visible cells.
file_path = "./data/llama2.pdf"
doc = fitz.open(file_path)
# Uncomment to preview the text of the first page:
# print(doc.get_page_text(0))

## Text splitter

In [None]:
# Splitter used below to break each page's text into chunks (chunk_size=1024).
text_parser = SentenceSplitter(chunk_size=1024)

In [None]:
# Split every page into chunks and remember which page each chunk came from.
# `doc_idxs` is parallel to `text_chunks` and holds the 0-based page index
# (enumerate starts at 0), e.g.
#   text_chunks: pg0 pg0 pg0 pg1 pg1
#   doc_idxs:    0   0   0   1   1
text_chunks, doc_idxs = [], []
for doc_idx, page in enumerate(doc):
    page_chunks = text_parser.split_text(page.get_text("text"))
    text_chunks += page_chunks
    doc_idxs += [doc_idx] * len(page_chunks)

# Construct Nodes from Text chunks
Convert chunks into `TextNode` objects.

In [None]:
# Wrap each text chunk in a TextNode, preserving chunk order.
# No per-chunk page lookup is done here because nothing consumes it;
# `doc_idxs` (built alongside `text_chunks`) still holds each chunk's source
# page index should page-level metadata be wanted, but it is not attached.
nodes = [TextNode(text=text_chunk) for text_chunk in text_chunks]

In [None]:
from llama_index.core.extractors import (
    QuestionsAnsweredExtractor,
    TitleExtractor,
)
from llama_index.core.ingestion import IngestionPipeline
from llama_index.llms.openai import OpenAI

# LLM shared by both metadata extractors below.
llm = OpenAI(model="gpt-3.5-turbo")

# Metadata extractors that the ingestion pipeline in the next cell applies to
# the nodes: one for titles, one for questions each node can answer.
extractors = [
    TitleExtractor(nodes=5, llm=llm),
    QuestionsAnsweredExtractor(questions=3, llm=llm),
]

In [None]:
pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

In [None]:
# Spot-check the first node's metadata after the extraction pipeline has run.
print(nodes[0].metadata)


## Generate embeddings for each node

In [None]:
# OpenAI embedding model with library defaults (no model name is passed).
# NOTE(review): the Pinecone index is created with dimension=1536 for
# text-embedding-ada-002 -- confirm the default model here yields that size.
embed_model = OpenAIEmbedding()

In [None]:
# Embed all nodes through the batch API instead of issuing one embedding
# request per node. The embedded text includes each node's metadata
# (metadata_mode="all"), so the extracted metadata is part of the embedding input.
node_texts = [node.get_content(metadata_mode="all") for node in nodes]
node_embeddings = embed_model.get_text_embedding_batch(node_texts)

# Embeddings come back in input order; attach each one to its node.
for node, node_embedding in zip(nodes, node_embeddings):
    node.embedding = node_embedding

## Load nodes into Vector store

In [None]:
# Write the embedded nodes into the Pinecone-backed vector store.
# NOTE(review): re-running this cell presumably inserts the nodes again --
# confirm whether node ids are stable across runs before re-executing.
vector_store.add(nodes)

# Retrieve and Query from Vector Store

In [None]:
# Build an index view over the already-populated Pinecone store.
# The ingestion embed model is passed explicitly so query embeddings are
# produced by the same model as the stored vectors, instead of relying on
# whatever the global default embed model happens to be.
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)
query_engine = index.as_query_engine()
query_str = "what are the sizes of the variants of Llama-2 that they are releasing"

In [None]:
# Run the query through the engine and show the response text.
response = query_engine.query(query_str)
print(response)

## Metadata and Namespace test

In [None]:
%pip install sentence_transformers

In [None]:
import sentence_transformers
from sentence_transformers import SentenceTransformer
# Local sentence-transformers model used for the raw Pinecone
# metadata/namespace experiment in the cells below.
model = SentenceTransformer('all-MiniLM-L6-v2')

In [49]:
# First test sentence; its embedding is upserted below under id "skibidi".
embedding = model.encode("Blue cats are rare versions of the domestic house cat, often found in volcanoes near the North Pole.")

In [None]:
# Check the embedding size; it has to match the `dimension` of the index
# that these vectors are upserted into.
print(embedding.shape)

In [50]:
# Second test sentence; its embedding is upserted below under id "toilet".
embedding2 = model.encode("Town of soldiers gold medals and war cries")

In [None]:
# Separate index for the metadata/namespace experiment. dimension=384 is
# presumably the size of the all-MiniLM-L6-v2 embeddings stored here (see the
# printed embedding.shape) -- not text-embedding-ada-002, which the first
# index is sized for.
index_name_2 = "llamaindex-rag-fs-2"
if index_name_2 not in pinecone.list_indexes().names():
    pinecone.create_index(
        name=index_name_2,
        dimension=384,
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )
pinecone_index_2 = pinecone.Index(index_name_2)

In [51]:
# Build the two test records (id, embedding, metadata string) from a small
# table, then upsert them into the "metadata-test" namespace. The upsert call
# stays the last expression so the cell displays its result.
vectors = [
    {"id": vector_id, "values": values, "metadata": {"pointer": pointer}}
    for vector_id, values, pointer in (
        ("skibidi", embedding, "yippee im a metadata string!"),
        ("toilet", embedding2, "I am also a metadata string!"),
    )
]

pinecone_index_2.upsert(vectors=vectors, namespace="metadata-test")

{'upserted_count': 2}

## Get Metadata

In [57]:
# Embed two short probe queries with the same sentence-transformers model
# that produced the stored vectors.
query_embedding_1 = model.encode("blue")
query_embedding_2 = model.encode("battle")

In [58]:
def query_top_match(query_embedding):
    """Return the Pinecone response for the single nearest stored vector.

    Args:
        query_embedding: Embedding array for the query text; converted with
            ``.tolist()`` before being sent to Pinecone.

    Returns:
        The response of ``pinecone_index_2.query`` restricted to the
        "metadata-test" namespace, with metadata included and ``top_k=1``.
    """
    return pinecone_index_2.query(
        vector=query_embedding.tolist(),
        top_k=1,  # Number of top matches to return
        include_metadata=True,
        namespace="metadata-test",
    )


def print_match_metadata(query_response):
    """Print the metadata of the best match, or a notice if there is none."""
    matches = query_response['matches']
    if matches:
        print(matches[0]['metadata'])
    else:
        print("No matches found.")


# Both queries are issued before anything is printed, and the responses stay
# available under their own names.
query_response_1 = query_top_match(query_embedding_1)
query_response_2 = query_top_match(query_embedding_2)

# Extract and print the metadata from each response.
print_match_metadata(query_response_1)
print_match_metadata(query_response_2)

{'pointer': 'yippee im a metadata string!'}
{'pointer': 'I am also a metadata string!'}
