## LangChain & Vector Stores (Pinecone)
Examples: Pinecone, Chroma, Milvus, Qdrant, FAISS

In [1]:
pip install pinecone-client

[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import os
from dotenv import find_dotenv, load_dotenv

# Load the variables from the .env file located by find_dotenv() into the
# process environment; override=True is passed so values from the file are
# meant to win over variables that are already set.
load_dotenv(find_dotenv(), override=True)

True

In [24]:
from pinecone import Pinecone, ServerlessSpec
import tiktoken

# Client handle used for the index-management calls in this notebook.
# NOTE(review): no api_key is passed, so the key presumably comes from the
# environment loaded above — confirm the expected variable name.
pc = Pinecone()

### Create Index

In [5]:
# Create the demo index only when it is not already listed, so this cell can be
# re-run (or the notebook restarted and run top to bottom) without requesting
# an index that already exists.
if "quickstart" not in pc.list_indexes().names():
    pc.create_index(
        name="quickstart",
        dimension=8, # Replace with your model dimensions
        metric="euclidean", # Replace with your model metric
        spec=ServerlessSpec(
            cloud="aws",
            region="us-east-1"
        )
    )

### List and Check

In [10]:
pc.list_indexes()

{'indexes': [{'dimension': 8,
              'host': 'quickstart-yab0jtx.svc.aped-4627-b74a.pinecone.io',
              'metric': 'euclidean',
              'name': 'quickstart',
              'spec': {'serverless': {'cloud': 'aws', 'region': 'us-east-1'}},
              'status': {'ready': True, 'state': 'Ready'}}]}

In [11]:
pc.list_indexes().names()

['quickstart']

### Upsert

In [12]:
index = pc.Index("quickstart")

In [13]:
# Four toy 8-dimensional records: each repeats one value in every component
# and carries a "genre" tag in its metadata.
toy_rows = [
    ("vec1", 0.1, "drama"),
    ("vec2", 0.2, "action"),
    ("vec3", 0.3, "drama"),
    ("vec4", 0.4, "action"),
]
toy_vectors = [
    {"id": vec_id, "values": [component] * 8, "metadata": {"genre": genre}}
    for vec_id, component, genre in toy_rows
]

# Write all four records into namespace "ns1" with one upsert call.
index.upsert(vectors=toy_vectors, namespace="ns1")

{'upserted_count': 4}

### Query

In [16]:
# Similarity lookup restricted to namespace "ns1".
# The query vector equals vec3's stored values, but vec3 is tagged "drama", so
# the genre filter below excludes it and only "action" records can be returned.
index.query(
    namespace="ns1",
    vector=[0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3, 0.3],
    top_k=2,  # ask for the two best matches
    include_values=True,  # include the stored vector values in each match
    include_metadata=True,  # include the stored metadata in each match
    filter={"genre": {"$eq": "action"}}  # metadata filter: genre must equal "action"
)

{'matches': [{'id': 'vec4',
              'metadata': {'genre': 'action'},
              'score': 0.0799999237,
              'values': [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4]},
             {'id': 'vec2',
              'metadata': {'genre': 'action'},
              'score': 0.0800000429,
              'values': [0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2]}],
 'namespace': 'ns1',
 'usage': {'read_units': 6}}

### Splitting and Embedding Text Using Langchain

In [18]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Read the speech explicitly as UTF-8: the text contains non-ASCII punctuation
# (en dash, curly apostrophe), so relying on the platform's default encoding
# could garble it or raise UnicodeDecodeError on some systems.
with open('vivekananda_speech.txt', 'r', encoding='utf-8') as f:
    vivekandand_speech = f.read()


# Splitter used to cut the speech into small overlapping chunks.
# length_function=len means the sizes below are measured in characters.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=100,  # target maximum length of a chunk
    chunk_overlap=20,  # amount of text allowed to repeat between neighbouring chunks
    length_function=len
)

In [21]:
chunks = text_splitter.create_documents([vivekandand_speech])
chunks[0]

Document(page_content='Swami Vivekananda (1863–1902) is best known in the United States for his groundbreaking speech to')

In [23]:
'Total chunks: {}'.format(len(chunks))

'Total chunks: 42'

#### Embedding Cost

In [31]:
def print_embedding_cost(texts, model='text-embedding-ada-002', price_per_1k_tokens=0.0004):
    """Print the token count and estimated embedding cost for a set of documents.

    Args:
        texts: Iterable of objects exposing a ``page_content`` string
            (for example the chunks produced by the text splitter).
        model: Model name passed to ``tiktoken.encoding_for_model`` to select
            the tokenizer. Defaults to the model this notebook was written for.
        price_per_1k_tokens: Price in USD per 1,000 tokens. The default keeps
            the rate that was originally hard-coded here; provider prices
            change, so pass the current rate for an up-to-date estimate.

    Returns:
        None. Both figures are printed.
    """
    enc = tiktoken.encoding_for_model(model)
    # Generator expression: no intermediate list is needed just to sum the counts.
    total_tokens = sum(len(enc.encode(page.page_content)) for page in texts)
    print(f'Total Tokens: {total_tokens}')
    print(f'Embedding Cost in USD: {total_tokens / 1000 * price_per_1k_tokens:.6f}')
print_embedding_cost(chunks)

Total Tokens: 857
Embedding Cost in USD: 0.000343


In [39]:
# Import from the dedicated `langchain_openai` package, which this notebook
# already uses for ChatOpenAI; the `langchain.embeddings` path is a deprecated
# re-export of the same class name.
from langchain_openai import OpenAIEmbeddings
# No arguments are passed, so the library's default embedding model and the
# API key from the environment are used.
embeddings = OpenAIEmbeddings()

In [35]:
vector = embeddings.embed_query(chunks[0].page_content)
print(vector)

[-0.006536526472445787, -0.027713221976736847, 0.006509033362982826, -0.0007900020975291714, -0.02985769848468715, 0.023809177149930805, -0.03156228151593959, -0.004938479764231301, -0.012873726620716501, -0.018612948262123303, 0.008522916087584802, 0.011320356855663629, 0.012640033560466743, -0.03499894022224611, 0.010825477160039839, 0.0149425955642444, -0.0013119447479798827, -0.004883493079644067, -0.0017526963220586016, -0.01711456378467373, -0.02298437920942886, 0.018021843381871117, -0.004718533491543678, 0.02273693982727828, -0.01011065258871236, 0.01745823114542058, 0.00479414012464346, -0.03686848470424418, 0.010275612176812748, 0.0076568773187351375, -0.01201456194413987, -0.022764432471079926, -0.0373358708247437, -0.02776820912698539, -0.018915374794522433, -0.0008385449026870736, -0.03879301447384555, 0.0006516765901882696, 0.026091118739534604, 0.0003752402744746705, -0.0014451153846079749, -0.014000951093617575, 0.0040690048805677404, -0.023163085119430073, -0.017004591

### Inserting the Embeddings into a Pinecone Index

In [36]:
import pinecone
# NOTE: this import rebinds the name `Pinecone`. Earlier in the notebook it was
# the client class imported from `pinecone`; from this cell on it refers to
# LangChain's vector-store class. Re-running the earlier `pc = Pinecone()` cell
# after this one would therefore pick up the wrong class.
from langchain_community.vectorstores import Pinecone
# The client is re-created through the module-qualified name, which is not
# affected by the rebinding above.
pc = pinecone.Pinecone()

In [37]:
# Remove every index currently listed for this client, one at a time.
names_to_delete = pc.list_indexes().names()
for stale_name in names_to_delete:
    print('Deleting all indexes...', end='')
    pc.delete_index(stale_name)
    print('Done')

Deleting all indexes...Done


In [38]:
index_name = 'vivekananda-speech'
# Only request creation when the name is not already listed, so the cell can be re-run.
if index_name not in pc.list_indexes().names():
    print(f'Creating index {index_name}...')
    pc.create_index(
        name=index_name,
        dimension=1536,  # presumably the length of the vectors returned by `embeddings` — confirm with len(vector)
        metric='cosine',
        spec=pinecone.PodSpec(
            environment='gcp-starter'
        )
    )
    print('Done')

Creating index vivekananda-speech...
Done


In [41]:
vector_store = Pinecone.from_documents(documents=chunks, embedding=embeddings, index_name=index_name)

In [42]:
# Loading the vectors from existing index
vector_store = Pinecone.from_existing_index(index_name='vivekananda-speech', embedding=embeddings)

### Asking Questions (Similarity Search)

In [43]:
query = 'Where Vivekanda gave the speech?'
result = vector_store.similarity_search(query)
print(result)

[Document(page_content='Speech delivered by Swami Vivekananda on September 11, 1893, at the first World’s Parliament of'), Document(page_content='speech to the 1893 World’s Parliament of Religions in which he introduced Hinduism to America and'), Document(page_content='Swami Vivekananda (1863–1902) is best known in the United States for his groundbreaking speech to'), Document(page_content='and the founder of Ramakrishna Mission. Swami Vivekananda is also considered a key figure in the')]


In [44]:
# Show each retrieved chunk followed by a dashed rule and a blank line.
separator = '-' * 50
for doc in result:
    print(doc.page_content, separator, '', sep='\n')

Speech delivered by Swami Vivekananda on September 11, 1893, at the first World’s Parliament of
--------------------------------------------------

speech to the 1893 World’s Parliament of Religions in which he introduced Hinduism to America and
--------------------------------------------------

Swami Vivekananda (1863–1902) is best known in the United States for his groundbreaking speech to
--------------------------------------------------

and the founder of Ramakrishna Mission. Swami Vivekananda is also considered a key figure in the
--------------------------------------------------



In [45]:
from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

# Chat model that writes the final answer.
# NOTE(review): temperature=1 lets answers vary between runs; a lower value is
# the usual choice for factual question answering — confirm this is intended.
llm = ChatOpenAI(model='gpt-3.5-turbo', temperature=1)

# Retriever view over the vector store using similarity search;
# 'k': 3 is presumably the number of chunks fetched per question — confirm.
retriever = vector_store.as_retriever(search_type='similarity', search_kwargs={'k':3})

# Question-answering chain built from the chat model and the retriever.
chain = RetrievalQA.from_chain_type(llm=llm, chain_type='stuff', retriever=retriever)

In [46]:
# `Chain.run` is deprecated in current LangChain releases in favour of `invoke`.
# RetrievalQA takes the question under the 'query' key and returns a dict whose
# 'result' entry holds the answer text.
answer = chain.invoke({'query': query})['result']

In [48]:
print(answer)

Swami Vivekananda gave his famous speech at the first World’s Parliament of Religions in Chicago, on September 11, 1893.


In [49]:
query = 'What was main theme of the speech?'
# `Chain.run` is deprecated in current LangChain releases in favour of `invoke`.
# RetrievalQA takes the question under the 'query' key and returns a dict whose
# 'result' entry holds the answer text.
answer = chain.invoke({'query': query})['result']
print(answer)

The main theme of Swami Vivekananda's speech at the 1893 World's Parliament of Religions was the introduction of Hinduism to America and the promotion of interfaith understanding and harmony among different religions.
