In [51]:
import os
from dotenv import load_dotenv

load_dotenv()

pinecone_api_key = os.environ.get('PINECONE_API_KEY')
openai_api_key = os.environ.get('OPENAI_API_KEY')

In [52]:
import time
from pinecone import Pinecone, ServerlessSpec

pc = Pinecone(api_key=pinecone_api_key)

index_name = "sop-index"

pc.create_index(
    name=index_name,
    dimension=1536,
    metric="cosine",
    spec=ServerlessSpec(
        cloud="aws",
        region="us-east-1"
    )
)

while not pc.describe_index(index_name).status['ready']:
    time.sleep(1)

In [53]:
index = pc.Index(index_name)
index.describe_index_stats

<bound method Index.describe_index_stats of <pinecone.data.index.Index object at 0x13c22b190>>

In [54]:
from langchain_openai import OpenAIEmbeddings

model_name = 'text-embedding-3-small'
embeddings = OpenAIEmbeddings(
    model=model_name,
    openai_api_key=openai_api_key
)

In [55]:
from langchain_pinecone import PineconeVectorStore

text_field = "page_content"
vectorstore = PineconeVectorStore(
    index, embeddings, text_field
)

In [56]:
import json

with open("../data/data.json", "r", encoding="utf-8") as file:
    data = json.load(file)
len(data)

95

In [57]:
from text_to_doc import get_doc_chunks

for site in data:
    document = get_doc_chunks(site["page_content"], site["metadata"])
    vectorstore.add_documents(document)

In [58]:
query = "what are some scholarships that I can apply for in 2024? Can you include links and relevant sources when applicable."
vectorstore.similarity_search(
    query,  # our search query
    k=3  # return 3 most relevant docs
)

[Document(page_content='for Fall 2020... [The Harry S. Truman Scholarship](https://scholars.uci.edu/2022/01/05/the-harry-s-truman-scholarship/) ##  [The Harry S. Truman Scholarship](https://scholars.uci.edu/2022/01/05/the-harry-s-truman-scholarship/) Jan 5, 2022 UCI (Big 7 Pre-Application) Mandatory Advising Deadline: October 29, 2019 at 12:00 noon Pacific Time for Fall 2020... [The Donald A. Strauss Foundation Scholarship](https://scholars.uci.edu/2022/01/05/the-donald-a-strauss-foundation-scholarship/) ##  [The Donald A. Strauss Foundation Scholarship](https://scholars.uci.edu/2022/01/05/the-donald-a-strauss-foundation-scholarship/) Jan 5, 2022 UCI (Big 7 Pre-Application) Mandatory Advising Deadline: November 14, 2019 at 12:00 noon Pacific Time for Fall 2020...', metadata={'description': 'The Knight-Hennessy Scholars | SOP | Scholarship Opportunities Program', 'source': 'https://scholars.uci.edu/2022/01/05/knight-hennessy-scholars/', 'title': 'The Knight-Hennessy Scholars | SOP | Sch

In [59]:
from langchain_openai import ChatOpenAI
from langchain.chains import RetrievalQA

llm = ChatOpenAI(
    openai_api_key=openai_api_key,
    model_name='gpt-4o',
    temperature=0.0
)
qa = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever()
)
qa.run(query)

'Based on the provided context, here are some scholarships you might consider applying for in 2024:\n\n1. **The Harry S. Truman Scholarship**\n   - **Description**: This scholarship is for students who are committed to public service.\n   - **Deadline**: The advising deadline for Fall 2020 was October 29, 2019. Check the official website for updated deadlines for 2024.\n   - **Link**: [The Harry S. Truman Scholarship](https://scholars.uci.edu/2022/01/05/the-harry-s-truman-scholarship/)\n\n2. **The Donald A. Strauss Foundation Scholarship**\n   - **Description**: This scholarship promotes public service among its recipients and provides $15,000, divided into $8,000 for the public service project and $7,000 for educational expenses.\n   - **Deadline**: The advising deadline for Fall 2020 was November 14, 2019. Check the official website for updated deadlines for 2024.\n   - **Link**: [The Donald A. Strauss Foundation Scholarship](https://scholars.uci.edu/2022/01/05/the-donald-a-strauss-f

In [60]:
from langchain.chains import RetrievalQAWithSourcesChain
qa_with_sources = RetrievalQAWithSourcesChain.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectorstore.as_retriever(),
    
)
qa_with_sources(query)

{'question': 'what are some scholarships that I can apply for in 2024? Can you include links and relevant sources when applicable.',
 'answer': 'Here are some scholarships that you can apply for in 2024:\n\n1. **The Harry S. Truman Scholarship**\n   - Description: This scholarship is for students who are committed to public service.\n   - More Information: [The Harry S. Truman Scholarship](https://scholars.uci.edu/2022/01/05/the-harry-s-truman-scholarship/)\n   - ',
 'sources': '[scholars.uci.edu](https://scholars.uci.edu/2022/01/05/the-harry-s-truman-scholarship/)'}

In [50]:
pc.delete_index(index_name)

NotFoundException: (404)
Reason: Not Found
HTTP response headers: HTTPHeaderDict({'content-type': 'text/plain; charset=utf-8', 'x-pinecone-api-version': '2024-04', 'access-control-allow-origin': '*', 'vary': 'origin,access-control-request-method,access-control-request-headers', 'access-control-expose-headers': '*', 'X-Cloud-Trace-Context': '9ec7a5dc495a8337ff842f13a526a8a4', 'Date': 'Sun, 02 Jun 2024 05:26:40 GMT', 'Server': 'Google Frontend', 'Content-Length': '84', 'Via': '1.1 google', 'Alt-Svc': 'h3=":443"; ma=2592000,h3-29=":443"; ma=2592000'})
HTTP response body: {"error":{"code":"NOT_FOUND","message":"Resource sop-index not found"},"status":404}
