In [None]:
import os
from dotenv import load_dotenv
# Load environment variables from the project-level .env file
# (expected to define OPENAI_API_KEY).
load_dotenv("../.env")

# Report only whether the key is present. Printing the value itself would
# store the secret in the notebook's saved output, where it can be committed
# or shared by accident.
print("OPENAI_API_KEY is set:", bool(os.environ.get("OPENAI_API_KEY")))

In [5]:
import getpass
import os

# Ask for the key interactively only when the environment has none (unset or
# empty), so the secret never has to be typed into the notebook source.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain.chat_models import init_chat_model

# Chat model handle; a later cell calls `model.invoke(...)` on it.
model = init_chat_model("gpt-4o-mini", model_provider="openai")

In [None]:
# Smoke test: send one prompt to the chat model. The call's return value is
# the last expression in the cell, so it is displayed as the cell output.
model.invoke("Hello, world!")


In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Path to the source PDF, relative to the notebook's working directory.
file_path = "../data/nke-10k-2023.pdf"
loader = PyPDFLoader(file_path)

# Read the PDF into `docs`; later cells index into it and split it.
# NOTE(review): presumably one document per PDF page — confirm against the loader.
docs = loader.load()

# Sanity check: how many documents were loaded.
print(len(docs))

In [None]:
# Preview the first 200 characters of the first loaded document, followed by
# a blank line, then show that document's metadata.
print(f"{docs[0].page_content[:200]}\n")
print(docs[0].metadata)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Splitter configured for chunks of size 1000 with an overlap of 200.
# NOTE(review): sizes are presumably measured in characters, and
# add_start_index=True presumably records each chunk's start offset in its
# metadata — confirm against the splitter documentation.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap=200, add_start_index=True
)
# Chunk every loaded document; `all_splits` feeds the embedding and
# vector-store cells below.
all_splits = text_splitter.split_documents(docs)

# Last expression of the cell: displays the number of chunks produced.
len(all_splits)

In [13]:
import getpass
import os

# Same key guard as the chat-model cell, repeated here so this cell can also
# be run on its own: prompt only when the key is unset or empty.
if not os.getenv("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass.getpass("Enter API key for OpenAI: ")

from langchain_openai import OpenAIEmbeddings

# Embedding model used by the vector store and the direct embed_query calls
# in later cells.
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

In [None]:
# Embed the text of the first two chunks to check that the embedding model
# works end to end.
vector_1 = embeddings.embed_query(all_splits[0].page_content)
vector_2 = embeddings.embed_query(all_splits[1].page_content)

# Both embeddings must have the same length; fail loudly if they do not.
assert len(vector_1) == len(vector_2)
print(f"Generated vectors of length {len(vector_1)}\n")
# Show only the first 10 components rather than the whole vector.
print(vector_1[:10])

In [16]:
from langchain_core.vectorstores import InMemoryVectorStore

# Vector store built around the `embeddings` model defined earlier; the cells
# below add the chunks to it and query it.
vector_store = InMemoryVectorStore(embeddings)

In [None]:
# Add every chunk to the vector store and keep the returned ids.
# NOTE(review): re-running this cell presumably inserts the same chunks a
# second time (no explicit ids are passed) — confirm, and re-create
# `vector_store` before re-running if duplicates matter.
ids = vector_store.add_documents(documents=all_splits)

print(f"Added {len(ids)} documents to the vector store")

In [None]:
# Query the store with a natural-language question (synchronous API).
results = vector_store.similarity_search(
  "How many distribution centers does Nike have in the US?"
)

# Show only the first returned result.
print(results[0])

In [None]:
# Async variant of the similarity search. The bare top-level `await` relies
# on the notebook kernel's support for awaiting inside a cell; in a plain
# Python script this would have to live inside an async function.
# Note: this rebinds `results` from the previous cell.
results = await vector_store.asimilarity_search("When was Nike incorporated?")

# Show only the first returned result.
print(results[0])

In [None]:
# Embed the question explicitly, then search the store with the raw vector
# instead of the query text.
embedding = embeddings.embed_query("How were Nike's margins impacted in 2023?")

# Note: this rebinds `results` from the previous cells.
results = vector_store.similarity_search_by_vector(embedding)
# Show only the first returned result.
print(results[0])

In [None]:
from typing import List

from langchain_core.documents import Document
from langchain_core.runnables import chain


@chain
def retriever(query: str) -> List[Document]:
    """Look up the closest match for ``query`` in the notebook's vector store.

    Wrapped with ``@chain`` so it can be driven through the runnable
    interface (the next cell calls ``retriever.batch``).

    Args:
        query: Natural-language question to search for.

    Returns:
        The result of ``vector_store.similarity_search`` limited to ``k=1``,
        i.e. at most one document.
    """
    # Reads the module-level `vector_store` defined in an earlier cell.
    top_match = vector_store.similarity_search(query, k=1)
    return top_match


# Run the retriever over both questions in a single batch call. The call is
# the last expression in the cell, so its return value is displayed.
# NOTE(review): presumably one result list per input question, in input
# order — confirm against the runnable `batch` documentation.
retriever.batch(
  [
    "How many distribution centers does Nike have in the US?",
    "When was Nike incorporated?",
  ],
)