In [None]:
!pip install langchain
!pip install langchain-community
!pip install pinecone-client



Collecting langchain-community
  Downloading langchain_community-0.3.5-py3-none-any.whl.metadata (2.9 kB)
Collecting SQLAlchemy<2.0.36,>=1.4 (from langchain-community)
  Downloading SQLAlchemy-2.0.35-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting httpx-sse<0.5.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting langchain<0.4.0,>=0.3.6 (from langchain-community)
  Downloading langchain-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain-core<0.4.0,>=0.3.15 (from langchain-community)
  Downloading langchain_core-0.3.15-py3-none-any.whl.metadata (6.3 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.6.1-py3-none-any.whl.metadata (3.5 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from datac

In [None]:
from langchain.document_loaders import CSVLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain.embeddings import HuggingFaceEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain import HuggingFaceHub

from pinecone import Pinecone
from pinecone import ServerlessSpec

import os

import warnings
warnings.filterwarnings('ignore')

In [None]:
from getpass import getpass

# Credentials are never written into the notebook: each one is taken from the
# environment when already set (e.g. exported before launching the kernel) and
# otherwise requested through a hidden prompt. Re-running this cell does not
# prompt again, because the value is stored back into os.environ below.
hugging_face_api = os.environ.get("HUGGINGFACEHUB_API_TOKEN") or getpass("Hugging Face API token: ")  # used for hosted Hugging Face models
pinecone_api = os.environ.get('PINECONE_API_TOKEN') or getpass("Pinecone API key: ")

# Later cells read the credentials from these environment variables.
os.environ["HUGGINGFACEHUB_API_TOKEN"] = hugging_face_api
os.environ['PINECONE_API_TOKEN'] = pinecone_api

In [None]:
import pandas as pd
file_path = '/content/drive/MyDrive/LangChainDataset/books_data/books.csv'

# The file is semicolon-delimited. With the default ',' delimiter the whole
# header line is parsed as a single column name and every row becomes one
# opaque string; passing the real delimiter yields one "column: value" line per
# field in each document's page_content.
loader = CSVLoader(file_path, encoding='ISO-8859-1', csv_args={'delimiter': ';'})
documents = loader.load()  # one Document per CSV row
print(documents[:5])

# Split the row documents into chunks of at most 1000 characters, no overlap.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
docs = text_splitter.split_documents(documents)


[Document(metadata={'source': '/content/drive/MyDrive/LangChainDataset/books_data/books.csv', 'row': 0}, page_content='ISBN;"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L": 0195153448;"Classical Mythology";"Mark P. O. Morford";"2002";"Oxford University Press";"http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg";"http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg";"http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg"'), Document(metadata={'source': '/content/drive/MyDrive/LangChainDataset/books_data/books.csv', 'row': 1}, page_content='ISBN;"Book-Title";"Book-Author";"Year-Of-Publication";"Publisher";"Image-URL-S";"Image-URL-M";"Image-URL-L": 0002005018;"Clara Callan";"Richard Bruce Wright";"2001";"HarperFlamingo Canada";"http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg";"http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg";"http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jp

In [None]:
import pinecone

# HuggingFaceEmbeddings() with no arguments uses the library's default
# sentence-transformers model. The printed size is 768, so the index dimension
# is taken from the model itself instead of being hard-coded.
embeddings = HuggingFaceEmbeddings()
embedding_dim = len(embeddings.embed_query("How are you?"))
print('Dimensions of embeddings: ', embedding_dim)

pc = Pinecone(api_key=os.environ['PINECONE_API_TOKEN'])  # Pinecone client
index_name = 'documentembeddings'

# list_indexes() returns index descriptions, not plain strings, so the
# existence check has to compare against the index *names*.
if index_name not in pc.list_indexes().names():
    spec = ServerlessSpec(cloud="aws", region="us-east-1")
    pc.create_index(
        index_name,
        dimension=embedding_dim,  # must match the embedding model's output size
        metric='cosine',
        spec=spec,
    )
else:
    print(f"Index '{index_name}' already exists.")

index = pc.Index(index_name)  # handle to the (new or existing) index

# Embed the chunks in batches and upsert each batch, so that the index actually
# contains the vectors that are queried later. The vector id is the chunk's
# position in `docs`, which is how retrieval maps a match back to its chunk.
# Only the chunk text is embedded (not the Document repr with its metadata).
EMBED_BATCH_SIZE = 100
embeds = []
for start in range(0, len(docs), EMBED_BATCH_SIZE):
    batch = docs[start:start + EMBED_BATCH_SIZE]
    vectors = embeddings.embed_documents([doc.page_content for doc in batch])
    records = [(str(start + offset), vector) for offset, vector in enumerate(vectors)]
    index.upsert(vectors=records)
    embeds.extend(records)

# Show only a preview; a full vector is hundreds of floats.
if embeds:
    first_id, first_vector = embeds[0]
    print(first_id, first_vector[:5], '...')



Dimensions of embeddings:  768
('0', [0.05591369792819023, 0.013298743404448032, -0.04436260461807251, 0.07115311920642853, -0.02017015963792801, 0.06223687157034874, -0.02253447286784649, 0.012157297693192959, -0.06523162126541138, -0.024587251245975494, 0.024844394996762276, 0.05003148317337036, -0.029648711904883385, 0.05613120272755623, -0.06230752915143967, 0.00364079256542027, 0.027569593861699104, 0.02069680206477642, 0.07156318426132202, -0.01605060324072838, -0.09998375177383423, 0.004075629636645317, 0.001826003659516573, -0.05745306611061096, 0.026092765852808952, -0.003170790383592248, -0.01310101430863142, 0.0365825779736042, 0.0039695692248642445, -0.04059067741036415, -0.055810730904340744, -0.008993588387966156, 0.0041031185537576675, 0.0016214149072766304, 1.6835185761010507e-06, -0.0143088698387146, -0.050031762570142746, -0.024328356608748436, 0.029896564781665802, -0.006840182468295097, 0.053198739886283875, 0.031770650297403336, -0.08707490563392639, -0.07209274917

In [None]:
def retrieve_relevant_context(question, index, docs, top_k, embeddings):
    """Return the entries of ``docs`` that best match ``question``.

    Args:
        question: Query text to embed.
        index: Vector index exposing ``query(vector=..., top_k=...,
            include_metadata=...)``; each match id is expected to be the
            stringified position of a chunk in ``docs``.
        docs: Sequence of chunks, indexed by the integer form of a match id.
        top_k: Number of nearest matches to request.
        embeddings: Object exposing ``embed_query(text)`` that returns the
            query vector.

    Returns:
        A list of the matching ``docs`` entries, in the order the index
        returned them.
    """
    question_embedding = embeddings.embed_query(question)
    # The index expects a single flat vector, so the embedding is passed as-is
    # (not wrapped in another list). Metadata is not requested: only the ids
    # are needed to look the chunks up in `docs`.
    results = index.query(vector=question_embedding, top_k=top_k, include_metadata=False)
    return [docs[int(match.id)] for match in results['matches']]

In [None]:
# Hosted FLAN-T5 model used to generate the answer.
llm = HuggingFaceHub(
    repo_id="google/flan-t5-base",
    model_kwargs={
        "temperature": 0.7,
        "max_length": 512,
    },
)
# "stuff" places all retrieved chunks into a single prompt.
chain = load_qa_chain(llm, chain_type="stuff")

question = 'what is the author of Classical Mythology'
context = retrieve_relevant_context(question, index, docs, 5, embeddings)
# `invoke` replaces the deprecated `run`; the generated text is returned under
# the chain's "output_text" key.
answer = chain.invoke({"input_documents": context, "question": question})["output_text"]
print(f'As per your question\nQuestion: {question}\nThe answer is\nAnswer: {answer}')

As per your questionn
Question: what is the author of Classical Mythology
The answer is
Answer: a sage
