## Set Up Environment

In [None]:
from langchain_text_splitters import MarkdownHeaderTextSplitter
from langchain_pinecone import PineconeEmbeddings, PineconeVectorStore
from pinecone import Pinecone, ServerlessSpec

from langchain_openai import ChatOpenAI
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain import hub

from dotenv import load_dotenv
import os
import time

In [26]:
# Get environment variables from .env.local file

load_dotenv(dotenv_path="../.env.local")
PINECONE_API_KEY = os.getenv("PINECONE_API_KEY")
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")

## Store knowledge in Pinecone

In [27]:
# Chunk the markdown content into sections based on the headers

with open('data.md', 'r') as file:
    markdown_document = file.read()

headers_to_split_on = [
    ("##", "Header 2")
]

markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(markdown_document)

print(md_header_splits)
print("\n")


[Document(metadata={'Header 2': 'Contact'}, page_content='## Contact\nmailto:yash.chennawar@gmail.com\n(732) 997-2018\nhttps://www.linkedin.com/in/yashchennawar\nhttps://www.github.com/yashc73080'), Document(metadata={'Header 2': 'Education'}, page_content='## Education  \nRutgers University-New Brunswick Honors College\nGraduating May 2027\nBachelor of Science: Computer Science and Mathematics Double Major, Data Science Minor\nGPA: 3.7/4.0\nCoursework: Artificial Intelligence, Computer Architecture, Data Structures, Data Management, Algorithms, Discrete Structures, Linear Algebra, Differential Equations, Calculus III, Probability Theory, Statistics, Quantum Computing'), Document(metadata={'Header 2': 'Technical/Work Experience'}, page_content='## Technical/Work Experience\nUndergraduate Research Assistant\nMay 2024 - Present\nRutgers University - CABM\nPiscataway, NJ\n- Engineer neural networks inspired by biological brain mechanisms to advance pattern discrimination tasks as solution

In [28]:
# Initialize a LangChain embedding object

model_name = 'multilingual-e5-large'
embeddings = PineconeEmbeddings(
    model=model_name,
    pinecone_api_key=PINECONE_API_KEY
)

In [29]:
# Make a Pinecone index to store document

pc = Pinecone(api_key=PINECONE_API_KEY)

cloud = 'aws'
region = 'us-east-1'
spec = ServerlessSpec(cloud=cloud, region=region)

index_name = "portfolio"

if index_name not in pc.list_indexes().names():
    pc.create_index(
        name=index_name,
        dimension=embeddings.dimension,
        metric="cosine",
        spec=spec
    )
    # Wait for index to be ready
    while not pc.describe_index(index_name).status['ready']:
        time.sleep(1)

# See that it is empty
print("Index before upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")

Index before upsert:
{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'data': {'vector_count': 3}},
 'total_vector_count': 3}




In [30]:
# Embed and upsert each record in a namespace

namespace = "data"

docsearch = PineconeVectorStore.from_documents(
    documents=md_header_splits,
    index_name=index_name,
    embedding=embeddings,
    namespace=namespace
)

time.sleep(5)

# See how many vectors have been upserted
print("Index after upsert:")
print(pc.Index(index_name).describe_index_stats())
print("\n")
time.sleep(2)

Unclosed client session
client_session: <aiohttp.client.ClientSession object at 0x000001C837912970>


Index after upsert:
{'dimension': 1024,
 'index_fullness': 0.0,
 'namespaces': {'data': {'vector_count': 3}},
 'total_vector_count': 3}




In [31]:
# Look at a record

index = pc.Index(index_name)
namespace = "data"

for ids in index.list(namespace=namespace):
    query = index.query(
        id=ids[0], 
        namespace=namespace, 
        top_k=1,
        include_values=True,
        include_metadata=True
    )
    print(query)
    print("\n")

{'matches': [{'id': '0e166c21-0b74-487e-bca8-7b48c2781b55',
              'metadata': {'Header 2': 'Education',
                           'text': '## Education  \n'
                                   'Rutgers University-New Brunswick Honors '
                                   'College\n'
                                   'Graduating May 2027\n'
                                   'Bachelor of Science: Computer Science and '
                                   'Mathematics Double Major, Data Science '
                                   'Minor\n'
                                   'GPA: 3.7/4.0\n'
                                   'Coursework: Artificial Intelligence, '
                                   'Computer Architecture, Data Structures, '
                                   'Data Management, Algorithms, Discrete '
                                   'Structures, Linear Algebra, Differential '
                                   'Equations, Calculus III, Probability '
             

## Use a chatbot

In [33]:
# Initialize LangChain object to chat with OpenAI LLM

retrieval_qa_chat_prompt = hub.pull("langchain-ai/retrieval-qa-chat")
retriever=docsearch.as_retriever()

llm = ChatOpenAI(
    openai_api_key=OPENAI_API_KEY,
    model_name='gpt-4o-mini',
    temperature=0.0
)

combine_docs_chain = create_stuff_documents_chain(
    llm, retrieval_qa_chat_prompt
)
retrieval_chain = create_retrieval_chain(retriever, combine_docs_chain)

  """Verify API key is provided if url not localhost.


In [34]:
# Define query

query1 = "Where did Yash study and what does he study there?"

In [35]:
# Test without using Pinecone knowledge

answer1_without_knowledge = llm.invoke(query1)

print("Query 1:", query1)
print("\nAnswer without knowledge:\n\n", answer1_without_knowledge.content)
print("\n")
time.sleep(2)

Query 1: Where did Yash study and what does he study there?

Answer without knowledge:

 I'm sorry, but I need more context to provide an accurate answer. "Yash" is a common name, and without additional information about which Yash you are referring to, I cannot determine where he studied or what he studied. If you can provide more details, I would be happy to help!




In [36]:
# Now answer using Pinecone knowledge

answer1_with_knowledge = retrieval_chain.invoke({"input": query1})

print("Answer with knowledge:\n\n", answer1_with_knowledge['answer'])
print("\nContext used:\n\n", answer1_with_knowledge['context'])
print("\n")
time.sleep(2)

Answer with knowledge:

 Yash studies at Rutgers University-New Brunswick, where he is pursuing a Bachelor of Science with a double major in Computer Science and Mathematics, along with a minor in Data Science.

Context used:

 [Document(id='97efdceb-c188-42b4-b524-75a5e023e541', metadata={'Header 2': 'Contact'}, page_content='## Contact\nmailto:yash.chennawar@gmail.com\n(732) 997-2018\nhttps://www.linkedin.com/in/yashchennawar\nhttps://www.github.com/yashc73080'), Document(id='47eddf9a-e726-4928-92b3-0d3690a3e798', metadata={'Header 2': 'Technical/Work Experience'}, page_content='## Technical/Work Experience\nUndergraduate Research Assistant\nMay 2024 - Present\nRutgers University - CABM\nPiscataway, NJ\n- Engineer neural networks inspired by biological brain mechanisms to advance pattern discrimination tasks as solutions for the deep learning synaptic credit assignment problem under the guidance of Dr. Aaron Milstein\n- Develop with PyTorch in Python for building and fine-tuning neur