In [None]:
%pip install llama-index-embeddings-openai
%pip install llama-index-vector-stores-pinecone
%pip install llama-index-llms-openai

In [16]:
# pip install llama-index
# pip -q install python-dotenv pinecone-client llama-index pymupdf

# Setup

In [67]:
import pinecone
from pinecone import Pinecone, ServerlessSpec
import os
from dotenv import load_dotenv
import fitz
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.schema import TextNode
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core import VectorStoreIndex
from llama_index.core import StorageContext

In [None]:
# Load environment variables from the local `secrets.env` file so that the
# credentials read via os.environ in the following cells are available.
load_dotenv(dotenv_path='secrets.env')

In [54]:
# Pinecone credentials are taken from the process environment; indexing
# os.environ directly means a missing variable raises KeyError right here.
api_key = os.environ["PINECONE_API_KEY"]
environment = os.environ["PINECONE_ENVIRONMENT"]

# NOTE: this rebinds the name `pinecone` from the imported module to a client
# instance. Later cells use `pinecone` as the client, so the name is kept.
pinecone = Pinecone(
    api_key=api_key,
    environment=environment,
)

# Name of the Pinecone index used throughout the notebook.
index_name = "llamaindex-rag-fs"

In [55]:
# Create the index only when it is not listed yet, so re-running this cell
# does not attempt to create it a second time.
if index_name not in pinecone.list_indexes().names():
    pinecone.create_index(
        index_name,
        dimension=1536,  # vector size for text-embedding-ada-002
        metric="euclidean",
        spec=ServerlessSpec(cloud="aws", region="us-east-1"),
    )

# Handle to the (existing or freshly created) index.
pinecone_index = pinecone.Index(index_name)

## Create PineconeVectorStore

In [56]:
# Wrap the Pinecone index handle in a LlamaIndex vector store; the later cells
# add nodes to it and build the query index from it.
from llama_index.vector_stores.pinecone import PineconeVectorStore
vector_store = PineconeVectorStore(pinecone_index=pinecone_index)

## Load Data

In [None]:
# Open the source PDF with PyMuPDF (`fitz`). The document handle is left open
# on purpose: the chunking cell below iterates over its pages.
file_path = "./data/llama2.pdf"
doc = fitz.open(file_path)
# Debugging aid: uncomment to preview the text of the first page.
# print(doc.get_page_text(0))

## Text splitter

In [57]:
# Splitter used below to break each page's text into chunks; chunk_size=1024
# is the configured per-chunk size limit.
text_parser = SentenceSplitter(chunk_size=1024)

In [58]:
# Two parallel lists, aligned by position:
#   text_chunks[i] - the i-th chunk of text, pages visited in document order
#   doc_idxs[i]    - 0-based index of the page that text_chunks[i] came from
# e.g. three chunks from the first page and two from the second give
#   doc_idxs == [0, 0, 0, 1, 1]
text_chunks = []
doc_idxs = []
for page_idx, pdf_page in enumerate(doc):
    page_chunks = text_parser.split_text(pdf_page.get_text("text"))
    text_chunks += page_chunks
    doc_idxs += [page_idx] * len(page_chunks)

# Construct Nodes from Text Chunks
Convert each text chunk into a `TextNode` object.

In [59]:
# Wrap every text chunk in a TextNode, preserving chunk order.
#
# The previous version also looked up the source page of each chunk
# (`doc[doc_idxs[idx]]`) but never used the result, so it loaded a PDF page
# per chunk for nothing. The chunk -> page mapping is still available in
# `doc_idxs` (same positions as `text_chunks`) if page provenance is needed.
nodes = [TextNode(text=text_chunk) for text_chunk in text_chunks]

In [61]:
# Imports for the LLM-driven metadata extraction step.
from llama_index.llms.openai import OpenAI
from llama_index.core.ingestion import IngestionPipeline
from llama_index.core.extractors import QuestionsAnsweredExtractor, TitleExtractor

# A single LLM instance shared by both extractors.
llm = OpenAI(model="gpt-3.5-turbo")

# Metadata extractors handed to the ingestion pipeline as its transformations.
# NOTE(review): `nodes=5` / `questions=3` presumably control how many nodes feed
# the title and how many questions are generated per node — confirm in the
# extractor documentation.
extractors = [
    TitleExtractor(llm=llm, nodes=5),
    QuestionsAnsweredExtractor(llm=llm, questions=3),
]

In [None]:
pipeline = IngestionPipeline(
    transformations=extractors,
)
nodes = await pipeline.arun(nodes=nodes, in_place=False)

In [None]:
# Spot-check: show the metadata attached to the first node after the pipeline
# run. Raises IndexError if `nodes` is empty.
print(nodes[0].metadata)


## Generate embeddings for each node

In [64]:
# Embedding model with default settings.
# NOTE(review): the Pinecone index above was created with dimension=1536
# (commented as text-embedding-ada-002); assumes this default model produces
# vectors of that size — TODO confirm.
embed_model = OpenAIEmbedding()

In [65]:
# Embed every node's content (text plus all metadata) and store the vector on
# the node. The texts are sent through the embedding model's batch API rather
# than one request per node, which cuts the number of API round-trips; the
# returned embeddings are in the same order as the input texts.
node_texts = [node.get_content(metadata_mode="all") for node in nodes]
node_embeddings = embed_model.get_text_embedding_batch(node_texts)

for node, node_embedding in zip(nodes, node_embeddings):
    node.embedding = node_embedding

## Load nodes into the vector store

In [None]:
# Write the nodes (with the embeddings assigned above) to the Pinecone-backed
# vector store.
# NOTE(review): re-running this cell sends the same nodes again — confirm
# whether that overwrites or duplicates entries in the index.
vector_store.add(nodes)

# Retrieve and Query from Vector Store

In [73]:
# Question to ask about the indexed PDF.
query_str = "what are the sizes of the variants of Llama-2 that they are releasing"

# Build an index view over the already-populated vector store and derive a
# query engine from it with default settings.
index = VectorStoreIndex.from_vector_store(vector_store)
query_engine = index.as_query_engine()

In [None]:
# Run the query and show the answer text; print() applies str() to the
# response object itself.
response = query_engine.query(query_str)
print(response)