# Setup

In [None]:
# OctoAI
# ! pip install langchain langchain-community faiss-cpu sentence-transformers octoai-sdk langchain-text-splitters lxml tiktoken python-dotenv

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
OCTOAI_API_TOKEN = os.environ["OCTOAI_API_TOKEN"]

# Ingest Data

In [4]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [5]:
files = os.listdir("../city_data")
file_texts = []
for file in files:
    with open(f"../city_data/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=64, 
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text, 
                metadata={"doc_title": file.split(".")[0], "chunk_num": i}))

Created a chunk of size 1311, which is longer than the specified 512
Created a chunk of size 536, which is longer than the specified 512
Created a chunk of size 676, which is longer than the specified 512
Created a chunk of size 745, which is longer than the specified 512
Created a chunk of size 558, which is longer than the specified 512
Created a chunk of size 671, which is longer than the specified 512
Created a chunk of size 631, which is longer than the specified 512
Created a chunk of size 704, which is longer than the specified 512
Created a chunk of size 528, which is longer than the specified 512
Created a chunk of size 765, which is longer than the specified 512
Created a chunk of size 527, which is longer than the specified 512
Created a chunk of size 635, which is longer than the specified 512
Created a chunk of size 618, which is longer than the specified 512
Created a chunk of size 614, which is longer than the specified 512
Created a chunk of size 666, which is longer th

In [6]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [9]:
embeddings = HuggingFaceEmbeddings()

  from tqdm.autonotebook import tqdm, trange


In [10]:
vector_store = FAISS.from_documents(
    file_texts,
    embedding=embeddings
)

# Search the Data

In [11]:
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint
llm = OctoAIEndpoint(
        model="meta-llama-3-8b-instruct",
        max_tokens=1024,
        presence_penalty=0,
        temperature=0.1,
        top_p=0.9,
    )

                model was transferred to model_kwargs.
                Please confirm that model is what you intended.


In [12]:
retriever = vector_store.as_retriever()

In [13]:
from langchain.prompts import ChatPromptTemplate
template="""You are an assistant tour guide. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [14]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [15]:
chain.invoke("Where should I visit in Seattle?")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


' You should visit Discovery Park, a 535-acre park in Magnolia, which offers hiking trails, forests, bluffs, and beaches. It\'s a great place to connect with nature and enjoy the outdoors. \nContext: [Document(page_content="== Parks and recreation ==\\n\\nSeattle\'s mild, temperate, marine climate allows year-round outdoor recreation, including walking, cycling, hiking, skiing, snowboarding, kayaking, rock climbing, motor boating, sailing, team sports, and swimming. In town, many people walk around Green Lake, through the forests and along the bluffs and beaches of 535-acre (2.2 km2) Discovery Park (the largest park in the city) in Magnolia, along the shores of Myrtle Edwards Park on the Downtown waterfront, along the shoreline of Lake Washington at Seward Park, along Alki Beach in West Seattle, or along the Burke-Gilman Trail. Gas Works Park features the preserved superstructure of a coal gasification plant closed in 1956. Located across Lake Union from downtown, the park provides pan