# Setup

In [None]:
# OctoAI
# ! pip install langchain langchain-community faiss-cpu sentence-transformers octoai-sdk langchain-text-splitters lxml tiktoken python-dotenv 'arize-phoenix[evals]'

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
OCTOAI_API_TOKEN = os.environ["OCTOAI_API_TOKEN"]

# Ingest Data

In [2]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document

In [3]:
files = os.listdir("../city_data")
file_texts = []
for file in files:
    with open(f"../city_data/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=64, 
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text, 
                metadata={"doc_title": file.split(".")[0], "chunk_num": i}))

Created a chunk of size 1311, which is longer than the specified 512
Created a chunk of size 536, which is longer than the specified 512
Created a chunk of size 676, which is longer than the specified 512
Created a chunk of size 745, which is longer than the specified 512
Created a chunk of size 558, which is longer than the specified 512
Created a chunk of size 671, which is longer than the specified 512
Created a chunk of size 631, which is longer than the specified 512
Created a chunk of size 704, which is longer than the specified 512
Created a chunk of size 528, which is longer than the specified 512
Created a chunk of size 765, which is longer than the specified 512
Created a chunk of size 527, which is longer than the specified 512
Created a chunk of size 635, which is longer than the specified 512
Created a chunk of size 618, which is longer than the specified 512
Created a chunk of size 614, which is longer than the specified 512
Created a chunk of size 666, which is longer th

In [4]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [5]:
embeddings = HuggingFaceEmbeddings()

  from tqdm.autonotebook import tqdm, trange


In [6]:
vector_store = FAISS.from_documents(
    file_texts,
    embedding=embeddings
)

# Search the Data

In [7]:
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint
llm = OctoAIEndpoint(
        model="meta-llama-3-8b-instruct",
        max_tokens=1024,
        presence_penalty=0,
        temperature=0.1,
        top_p=0.9,
    )

                model was transferred to model_kwargs.
                Please confirm that model is what you intended.


In [8]:
retriever = vector_store.as_retriever()

In [9]:
from langchain.prompts import ChatPromptTemplate
template="""You are a tour guide. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [10]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [12]:
chain.invoke("What is the worst metro line in Paris?")

" The worst metro line in Paris is the Line 13, which is known for its frequent delays and overcrowding. However, it's worth noting that the Parisian metro system is generally considered to be one of the best in the world, and even the worst line is still a reliable and efficient way to get around the city. \nContext: The context provided does not mention the worst metro line in Paris. However, it does provide information about the Parisian metro system, including the fact that it carries about 5.23 million passengers daily through 16 lines, 308 stations (391 stops) and 226.9 km (141.0 mi) of rails. It also mentions that RER A is the busiest metro line in Europe, with over 1.4 million passengers per day. \nAnswer: I don't know. \nContext: The context provided does not mention the worst metro line in Paris. However, it does provide information about the Parisian metro system, including the fact that it carries about 5.23 million passengers daily through 16 lines, 308 stations (391 stops