# Diving into Pinecone

## Vector database

Learn with https://docs.pinecone.io/docs/overview

## Install dependencies

In [None]:
pip install -r ./requirements.txt

### Verify Pinecone is installed

In [5]:
import tqdm
import os
from dotenv import load_dotenv, find_dotenv
import pinecone

# loading the API Keys (Cohere, Pinecone) from .env
load_dotenv(find_dotenv(), override=True)

# Initialize Pinecone library with API key and environment
pinecone.init(
    api_key=os.environ.get('PINECONE_API_KEY'),
    environment=os.environ.get('PINECONE_ENV')
)

# Print Pinecone version info to confirm initialization
pinecone.info.version()

VersionResponse(server='2.0.11', client='2.2.4')

## Pinecone Indexes

Learn with https://docs.pinecone.io/docs/indexes

In [7]:
# List all indexes in the Pinecone environment
pinecone.list_indexes()

['doc-index']

### Creating an index

In [12]:
# Specify name for index
index_name = 'langchain-pinecone'

#  Check if index already exists
if index_name not in pinecone.list_indexes():
    print(f'Creating index {index_name} ....')

    # Create index with parameters
    pinecone.create_index(index_name, 
                          # Vector dimension - The number of dimensions for vectors in this index
                          dimension=1536, 
                          # Similarity metric 
                          # Distance measure used to compare vectors
                          # 'cosine' measures the cosine similarity between vectors
                          metric='cosine')
    print('Done')
else:
    print(f'Index {index_name} already exists!')

Creating index langchain-pinecone ....
Done


In [13]:
# Retrieve metadata about the index 
pinecone.describe_index(index_name)

IndexDescription(name='langchain-pinecone', metric='cosine', replicas=1, dimension=1536.0, shards=1, pods=1, pod_type='starter', status={'ready': True, 'state': 'Ready'}, metadata_config=None, source_collection='')

### Deleting an index

In [11]:
# Get index name to delete from user input
index_name = input('Enter Pinecone index to delete : ')

# Check if index exists
if index_name in pinecone.list_indexes():
    print(f'Deleting index {index_name} ... ')
    pinecone.delete_index(index_name)
    print('Done')
else:
    print(f'Index {index_name} does not exist!')

Enter Pinecone index to delete :  doc-index


Deleting index doc-index ... 
Done


### Getting index statistics

In [14]:
index_name = 'langchain-pinecone'

# Create index object
index = pinecone.Index(index_name)

# Retrieve usage statistics for the index
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

### Inserting into an index

In [15]:
import random

# inserting some random vectors into a Pinecone index

# Generate 5 random 1536-dim vectors 
vectors = [[random.random() for _ in range(1536)] for v in range(5)]

# Create a list of IDs to associate with each vector
ids = list('abcde')

# Specify Pinecone index name 
index_name = 'langchain-pinecone'

# Create index object
index = pinecone.Index(index_name)

# Upsert vectors into index
index.upsert(vectors=zip(ids, vectors))

{'upserted_count': 5}

### Updating a vector

In [16]:
# Upsert a single vector to update it

# Vector ID to update
id_to_update = 'c'  

# New vector data 
new_vector = [0.3] * 1536

# Upsert the new vector data with the same ID
index.upsert(vectors=[(id_to_update, new_vector)])

{'upserted_count': 1}

### Fetching a vector

In [17]:
# Get index object
index = pinecone.Index('langchain-pinecone')  

# Specify IDs of vectors to fetch
ids_to_fetch = ['c', 'd']

# Fetch vector data for the provided IDs
index.fetch(ids=ids_to_fetch)

{'namespace': '',
 'vectors': {'c': {'id': 'c',
                   'values': [0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
                              0.3,
       

### Deleting vectors

In [18]:
# Specify IDs of vectors to delete
ids_to_delete = ['b', 'c']  

# Delete the vectors for those IDs
index.delete(ids=ids_to_delete)

{}

In [26]:
# Get index statistics
index.describe_index_stats()

{'dimension': 1536,
 'index_fullness': 0.0,
 'namespaces': {},
 'total_vector_count': 0}

In [19]:
# Try to fetch a deleted vector 
index.fetch(ids=['b'])

{'namespace': '', 'vectors': {}}

In [20]:
# Delete all remaining vectors
index.delete(delete_all=True)

{}

## Splitting and Embedding Text Using LangChain

https://python.langchain.com/docs/modules/data_connection/document_transformers/
https://python.langchain.com/docs/integrations/text_embedding/cohere

**Text Splitting**

- Splitting large text documents into smaller pieces called chunks
- Makes large texts more manageable to process 
- Common splitting approaches:
  - Split by fixed character length 
  - Split at semantic boundaries like sentences or topics
  - Use a sliding window to create overlapping chunks
- Output is a list of text chunks from the original document

**Text Embedding**

- Encoding text into numeric vectors that capture semantic meaning
- Steps:
  1. Turn text into chunks (splitting)
  2. Map chunks to vector embeddings
  3. Aggregate chunks embeddings into a vector database
 

**Goals**
- The goal of splitting is to divide large documents into manageable sizes for processing
- The goal of embedding is to encode semantic meaning in a way that allows for semantic search 
and comparison
- Together, splitting and embedding enable semantic search, QA, and analysis of large text corpora by indexing the vectorized content


### Split document

In [22]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Open text file and read contents into churchill_speech
with open('documents/churchill_speech.txt') as f:
    churchill_speech = f.read()

# Create text splitter instance
# check this video about chunk - https://youtu.be/n0uPzvGTFI0?feature=shared
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, # maximum size of text chunk in number of characters
    chunk_overlap=20, # pecifies the number of overlapping characters between adjacent chunks.
                      # if chunk 1 ends at character 100, chunk 2 will start at character 80
)

# Split the text into chunks
chunks = text_splitter.create_documents([churchill_speech])

# Print specific chunks - you can test it
# print(chunks[2]) 
# print(chunks[10].page_content)
print(f'Now you have {len(chunks)}')

Now you have 300


### Create Embeddings

In [25]:
from langchain.embeddings import CohereEmbeddings

# Create embeddings instance
embeddings = CohereEmbeddings()

# Take first text chunk 
first_chunk = chunks[0]

# Embed the text into a vector 
vector = embeddings.embed_query(first_chunk.page_content)


# Print the chunk
print(first_chunk.page_content)
# Print the vector
print(vector)

Winston Churchill Speech - We Shall Fight on the Beaches
We Shall Fight on the Beaches
June 4, 1940
[0.13000488, 0.69921875, -1.4414062, 1.2167969, 2.3964844, -0.20178223, 0.6435547, -0.22680664, 0.24780273, 0.39575195, -0.20959473, 1.5703125, 0.27563477, -0.6557617, 1.9101562, -0.0067634583, -0.17626953, -0.61328125, -0.07366943, -0.9008789, 0.103393555, 0.35302734, -1.1953125, 0.3125, 0.6191406, -5.078125, 0.43969727, 0.67529297, 1.2851562, -1.0576172, -0.41625977, -0.6635742, -2.3515625, -0.51171875, 1.6914062, -1.7685547, -0.5161133, -1.0488281, 0.84033203, 2.1035156, -0.36132812, -0.02305603, 0.69384766, -1.3710938, 0.9116211, -0.8442383, 0.78466797, 1.5576172, 1.2333984, 0.028137207, 1.9755859, -2.0429688, -0.032714844, 0.5761719, -2.1074219, -0.24072266, -2.3671875, -0.9042969, 0.68847656, 1.4951172, -2.3652344, -1.1542969, 1.0644531, -0.9667969, 1.2587891, -0.73828125, -1.3691406, -0.39990234, 0.5961914, -1.1640625, 4.203125, -0.6489258, -0.16503906, 0.42651367, 2.5546875, -2.0

### Inserting the Embeddings into a Pinecone Index

In [27]:
import os
import pinecone
from langchain.vectorstores import Pinecone

# Initialize Pinecone client 
pinecone.init(api_key=os.environ.get('PINECONE_API_KEY'), environment=os.environ.get('PINECONE_ENV'))

# Delete any existing indexes
indexes = pinecone.list_indexes()
for i in indexes:
  print('Deleting all indexes ... ', end='')
  pinecone.delete_index(i)
  print('Done')

# Create a new index
index_name = 'churchill-speech'
if index_name not in pinecone.list_indexes():
  print(f'Creating index {index_name} ...')
  pinecone.create_index(index_name, dimension=4096, metric='cosine')
  print('Done!')

# Index the text chunks into Pinecone 
vector_store = Pinecone.from_documents(chunks, embeddings, index_name=index_name)
print("Vector store created !")

Deleting all indexes ... Done
Creating index churchill-speech ...
Done!
Vector store created !


## Asking Questions (Similarity Search)

In [31]:
# Query text 
query = 'Where should we fight?'

# Semantic search against indexed chunks
result = vector_store.similarity_search(query)

# Print top result 
print(result)

# Clean output
print('-' * 50)
for r in result:
    print(r.page_content)
    print('-' * 50)

[Document(page_content='front, now on that, fighting'), Document(page_content='end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing'), Document(page_content='shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and'), Document(page_content='streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a')]
--------------------------------------------------
front, now on that, fighting
--------------------------------------------------
end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing
--------------------------------------------------
shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and
--------------------------------------------------
streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a
-----------------------------------------

## Answering in Natural Language using an LLM

In [36]:
# Import RetrievalQA and Chat model
from langchain.chains import RetrievalQA  
from langchain.chat_models import ChatCohere

# Create Cohere model 
llm = ChatCohere(temperature=0.75, cohere_api_key=os.environ.get('COHERE_API_KEY'))

# Create retriever from vector store
retriever = vector_store.as_retriever(
    # specifies to use semantic similarity search against the Pinecone index
    search_type='similarity', 
    # Here we set k=3 to retrieve the top 10 most similar results
    search_kwargs={'k': 10}) 

# Build QA chain with retriever 
chain = RetrievalQA.from_chain_type(llm=llm, 
                                    # builds a "stuff" chain that retrieves context for questions
                                    # you can have more details with - https://chat.langchain.com/
                                    chain_type="stuff", 
                                    retriever=retriever)

# Query the chain
query = 'What is the goal of this speech?'
answer = chain.invoke(query)

print(answer)

{'query': 'What is the goal of this speech?', 'result': "It is not possible for me to ascertain the goal of a speech without further context or details. The purpose of a speech can vary greatly depending on the context in which it is given, including the speaker's intent, the audience, the occasion, and the subject matter. \n\nSome potential goals that a speaker may aim to achieve through a speech include:\n\n- Educating or informing the audience about a particular topic: The speaker may aim to share knowledge, insights, or facts with the audience to increase their understanding of a subject.\n- Influencing or persuading the audience to take a particular action or adopt a certain viewpoint: The speaker may use rhetorical devices and persuasive techniques to encourage the audience to support a particular cause, change their behavior, or believe in a certain idea.\n- Entertaining the audience through the use of humor, stories, or other engaging elements: The speaker may aim to create an 