In [None]:
# ! pip install pymilvus milvus langchain sentence-transformers tiktoken octoai-sdk
# ! docker compose -d up

In [1]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ["OCTOAI_API_TOKEN"] = os.getenv("OCTOAI_API_TOKEN")

In [3]:
template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.\n Instruction:\n{question}\n Response: """
prompt = PromptTemplate.from_template(template)

In [4]:
llm = OctoAIEndpoint(
    endpoint_url="https://text.octoai.run/v1/chat/completions",
    model_kwargs={
        "model": "mixtral-8x7b-instruct-fp16",
        "max_tokens": 128,
        "presence_penalty": 0,
        "temperature": 0.01,
        "top_p": 0.9,
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant. Keep your responses limited to one short paragraph if possible.",
            },
        ],
    },
)

In [5]:
question = "Who was leonardo davinci?"

llm_chain = LLMChain(prompt=prompt, llm=llm)

print(llm_chain.invoke(question)["text"])

 Leonardo da Vinci (1452-1519) was an Italian polymath who is often regarded as one of the greatest painters in history. He is also celebrated for his technological ingenuity, scientific curiosity, and philosophical wisdom. Da Vinci is widely known for his masterpieces such as 'The Last Supper' and 'Mona Lisa.' As an artist, scientist, mathematician, engineer, inventor, anatomist, geologist, cartographer, botanist, musician, and writer, da Vinci embodied the Renaissance ideal. His thirst for


In [9]:
from langchain_community.embeddings import OctoAIEmbeddings
from langchain_community.vectorstores import Milvus

In [7]:
embeddings = OctoAIEmbeddings(endpoint_url="https://text.octoai.run/v1/embeddings")

In [10]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [11]:
files = os.listdir("./data")

In [12]:
file_texts = []

In [13]:
for file in files:
    with open(f"./data/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=64, 
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text, 
                metadata={"doc_title": file.split(".")[0], "chunk_num": i}))

Created a chunk of size 1299, which is longer than the specified 512
Created a chunk of size 597, which is longer than the specified 512
Created a chunk of size 524, which is longer than the specified 512
Created a chunk of size 535, which is longer than the specified 512
Created a chunk of size 843, which is longer than the specified 512
Created a chunk of size 746, which is longer than the specified 512
Created a chunk of size 887, which is longer than the specified 512
Created a chunk of size 662, which is longer than the specified 512
Created a chunk of size 515, which is longer than the specified 512
Created a chunk of size 615, which is longer than the specified 512
Created a chunk of size 875, which is longer than the specified 512
Created a chunk of size 807, which is longer than the specified 512
Created a chunk of size 730, which is longer than the specified 512
Created a chunk of size 992, which is longer than the specified 512
Created a chunk of size 793, which is longer th

In [14]:
vector_store = Milvus.from_documents(
    file_texts,
    embedding=embeddings,
    connection_args={"host": "localhost", "port": 19530},
    collection_name="cities"
)

In [None]:
file_texts[0]

Document(page_content="Chicago (  shih-KAH-goh, locally also  shih-KAW-goh; Miami-Illinois: Shikaakwa; Ojibwe: Zhigaagong) is the most populous city in the U.S. state of Illinois and the third-most populous in the United States after New York City and Los Angeles. With a population of 2,746,388 in the 2020 census, it is also the most populous city in the Midwest. As the seat of Cook County, the second-most populous county in the U.S., Chicago is the center of the Chicago metropolitan area.\nLocated on the shore of Lake Michigan, Chicago was incorporated as a city in 1837 near a portage between the Great Lakes and the Mississippi River watershed. It grew rapidly in the mid-19th century. In 1871, the Great Chicago Fire destroyed several square miles and left more than 100,000 homeless, but Chicago's population continued to grow. Chicago made noted contributions to urban planning and architecture, such as the Chicago School, the development of the City Beautiful Movement, and the steel-fr

In [15]:
retriever = vector_store.as_retriever()

In [16]:
template = """Answer the question based only on the following context:
{context}

Question: {question}
"""
prompt = PromptTemplate.from_template(template)

In [17]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
chain.invoke("How big is the city of Seattle?")

" Seattle is a significant city with a 2022 population of 749,256, making it the most populous city in both the state of Washington and the Pacific Northwest region of North America. The Seattle metropolitan area's population is 4.02 million, ranking it as the 15th-largest in the United States."

In [19]:
# Let's make this a bit more fun and showcase the multilingual capabilities of Mixtal which really outshine other open source models

# Our Vector DB is populated with entries from english text - even the embedding model we're using here, GTE-Large
# works best on english text. However Mixtral has good mutlilingual capabilities in French, German, Spanish and Italian.
# So what we'll do is ask the assistant to only answer in french in the system and user prompt. RAG here is performed based on 
# english text, but upon producing the user response, the Mixtral LLM will generate tokens in a different language here (french)
french_llm = OctoAIEndpoint(
    endpoint_url="https://text.octoai.run/v1/chat/completions",
    model_kwargs={
        "model": "mixtral-8x7b-instruct-fp16",
        "max_tokens": 128,
        "presence_penalty": 0,
        "temperature": 0.1,
        "top_p": 0.9,
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant who responds in French and not in English.",
            },
        ],
    },
)

french_template = """Answer the question in French based only on the following context:
{context}

Question: {question}
"""
french_prompt = PromptTemplate.from_template(french_template)

In [20]:
french_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | french_prompt
    | french_llm
    | StrOutputParser()
)

In [21]:
fr_1 = french_chain.invoke("How big is the city of Seattle?")

In [22]:
fr_2 = french_chain.invoke("Quelle est la taille de la ville de Seattle?")

In [23]:
fr_3 = french_chain.invoke("Quelles sont les industries à Seattle?") #what are the industries in seattle

In [24]:
from pprint import pprint
pprint(fr_1)
pprint(fr_2)
pprint(fr_3)

(' La ville de Seattle a une population de 749 256 habitants en 2022. La '
 "région métropolitaine de Seattle compte 4,02 millions d'habitants, ce qui en "
 'fait la 15e plus grande région métropolitaine des États-Unis.')
(' La ville de Seattle a une population de 749 256 habitants en 2022. La '
 "région métropolitaine de Seattle compte 4,02 millions d'habitants, ce qui en "
 'fait la 15e plus grande région métropolitaine des États-Unis.')
(" L'économie de Seattle est marquée par un mélange d'entreprises "
 "industrielles plus anciennes et de sociétés de l'« économie numérique » "
 'telles que les entreprises internet et technologiques, ainsi que des '
 'entreprises de services, de conception et de technologies propres. La ville '
 'abrite également de grandes entreprises telles que Amazon, Starbucks, '
 'Expeditors International of Washington, Nordstrom, Weyerhaeuser, Expedia '
 'Group et Zillow, ainsi que des entreprises biotechnologiques en '
 'développement')


In [25]:
sp_1 = french_chain.invoke("¿Qué puntos de referencia hay en Seattle?") #spanish, landmarks

In [26]:
pprint(sp_1)

(' Based on the provided document, Seattle has several landmarks and points of '
 'interest. However, the document is in English and the question is in '
 'Spanish, so I will answer in French as requested.\n'
 '\n'
 "Seattle possède plusieurs points de repère et lieux d'intérêt. Cependant, le "
 'document est en anglais et la question est en espagnol, donc je vais '
 'répondre en français comme demandé.\n'
 '\n'
 'Parmi les points de repère de Seattle, on peut citer le Space Needle, le '
 "Pike Place Market, l'aquarium de Seattle, le musée")


In [27]:
hung_1 = french_chain.invoke("Milyen látnivalók vannak Seattle-ben") #hungarian, landmark

In [28]:
pprint(hung_1)

(' Based on the provided document, Seattle, Washington has several notable '
 'attractions and points of interest. Here are a few:\n'
 '\n'
 '1. Pioneer Square: This is the site of the second landing of the Denny '
 'Party, who were among the first European settlers in Seattle. The settlement '
 'was named "Duwamps" at that time.\n'
 '\n'
 '2. King Street Station: The name of the modern city of Seattle in '
 'Lushootseed, dᶻidᶻəlal̓ič, meaning "little crossing over place" comes from '
 'the location of one of the Duwamish')


In [29]:
turk_1 = french_chain.invoke("Seattle'da hangi simge yapılar var?") #turkish industries

In [30]:
pprint(turk_1)

(' Based on the provided document, Seattle is known for its significant '
 'musical history, with many jazz nightclubs and the birthplace of notable '
 'rock acts and the grunge subgenre of rock. However, the document does not '
 'provide specific information about symbolic structures in Seattle.')
