In [None]:
# ! pip install pymilvus milvus langchain sentence-transformers tiktoken octoai-sdk

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate
from langchain_community.llms.octoai_endpoint import OctoAIEndpoint

In [None]:
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if present) into the process environment.
load_dotenv()

# Fail early with an actionable message when the token is missing. Assigning the
# result of os.getenv() straight into os.environ raises an opaque
# "TypeError: str expected, not NoneType" when the variable is unset.
octoai_api_token = os.getenv("OCTOAI_API_TOKEN")
if not octoai_api_token:
    raise RuntimeError(
        "OCTOAI_API_TOKEN is not set. Export it in your shell or add it to a .env file "
        "next to this notebook before running the remaining cells."
    )
os.environ["OCTOAI_API_TOKEN"] = octoai_api_token

In [None]:
# Instruction-style prompt with a single placeholder, {question}.
# Written as adjacent string literals so every line break in the prompt is explicit.
template = (
    "Below is an instruction that describes a task. "
    "Write a response that appropriately completes the request.\n"
    " Instruction:\n"
    "{question}\n"
    " Response: "
)
prompt = PromptTemplate.from_template(template)

In [None]:
# System message placed in the request's `messages` list.
llm_system_message = {
    "role": "system",
    "content": "You are a helpful assistant. Keep your responses limited to one short paragraph if possible.",
}

# Request parameters handed to the endpoint through `model_kwargs`.
llm_model_kwargs = dict(
    model="mixtral-8x7b-instruct-fp16",
    max_tokens=128,
    presence_penalty=0,
    temperature=0.01,
    top_p=0.9,
    messages=[llm_system_message],
)

# LLM client pointed at OctoAI's chat-completions endpoint.
llm = OctoAIEndpoint(
    endpoint_url="https://text.octoai.run/v1/chat/completions",
    model_kwargs=llm_model_kwargs,
)

In [None]:
# Plain prompt -> LLM chain (no retrieval yet), used as a smoke test of the endpoint.
llm_chain = LLMChain(llm=llm, prompt=prompt)

question = "Who was leonardo davinci?"

# The chain result is a mapping; the generated completion is stored under "text".
llm_response = llm_chain.invoke(question)
print(llm_response["text"])

 Leonardo da Vinci was a renowned Italian polymath who lived from 1452 to 1519. He is often considered one of the greatest painters in history, best known for his works like the Mona Lisa and The Last Supper. However, da Vinci's talents extended far beyond painting. He was also a scientist, mathematician, engineer, inventor, anatomist, geologist, cartographer, botanist, and musician. His notebooks, filled with detailed sketches and observations, are a testament to his insatiable curiosity and broad range


In [None]:
from langchain_community.embeddings import OctoAIEmbeddings
from langchain_community.vectorstores import Milvus

In [None]:
# Embedding client passed to the vector store below; it targets OctoAI's embeddings endpoint.
embeddings = OctoAIEmbeddings(
    endpoint_url="https://text.octoai.run/v1/embeddings",
)

In [None]:
from milvus import default_server

In [None]:
# Start the Milvus server object that the `milvus` package exposes as `default_server`;
# its `listen_port` is read later when the vector store connects to it.
# NOTE(review): no matching stop call is visible in this notebook — consider stopping
# the server once the demo is finished (confirm the API against the milvus package docs).
default_server.start()

In [None]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [None]:
# Collect the source documents in a deterministic order.
# os.listdir() returns entries in arbitrary order and also yields sub-directories and
# hidden entries (e.g. ".ipynb_checkpoints", ".DS_Store") that cannot be read as text
# documents, so keep only visible regular files and sort them.
files = sorted(
    name
    for name in os.listdir("./data")
    if not name.startswith(".") and os.path.isfile(os.path.join("./data", name))
)

In [None]:
# Accumulator for the chunked documents; the loading loop in the next cell fills it.
file_texts = []

In [None]:
# Read every file listed in `files`, split its text into token-bounded chunks and wrap
# each chunk in a Document tagged with the source title and the chunk's position.

# Start from an empty list so that re-running this cell never appends duplicate chunks.
file_texts = []

# The splitter configuration is the same for every file, so build it once.
# NOTE(review): the recorded output shows CharacterTextSplitter emitting chunks longer
# than 512 tokens ("Created a chunk of size 1299 ..."). If the embedding model enforces
# a hard token limit, a recursive splitter may be needed — confirm before changing.
text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
    chunk_size=512,
    chunk_overlap=64,
)

for file in files:
    # Explicit UTF-8: the platform default encoding may fail on non-ASCII text.
    with open(os.path.join("./data", file), encoding="utf-8") as f:
        file_text = f.read()

    # splitext() removes only the final extension, so titles that contain dots
    # (e.g. "St. Louis.txt") are preserved in full.
    doc_title = os.path.splitext(file)[0]

    file_texts.extend(
        Document(
            page_content=chunked_text,
            metadata={"doc_title": doc_title, "chunk_num": i},
        )
        for i, chunked_text in enumerate(text_splitter.split_text(file_text))
    )

Created a chunk of size 1299, which is longer than the specified 512
Created a chunk of size 597, which is longer than the specified 512
Created a chunk of size 524, which is longer than the specified 512
Created a chunk of size 535, which is longer than the specified 512
Created a chunk of size 843, which is longer than the specified 512
Created a chunk of size 746, which is longer than the specified 512
Created a chunk of size 887, which is longer than the specified 512
Created a chunk of size 662, which is longer than the specified 512
Created a chunk of size 515, which is longer than the specified 512
Created a chunk of size 615, which is longer than the specified 512
Created a chunk of size 875, which is longer than the specified 512
Created a chunk of size 807, which is longer than the specified 512
Created a chunk of size 730, which is longer than the specified 512
Created a chunk of size 992, which is longer than the specified 512
Created a chunk of size 793, which is longer th

In [None]:
# Embed every chunk and store it in the "cities" collection of the local Milvus server.
# drop_old=True recreates the collection on each run; without it, re-running this cell
# inserts the same chunks again and the retriever starts returning duplicates.
vector_store = Milvus.from_documents(
    file_texts,
    embedding=embeddings,
    connection_args={"host": "localhost", "port": default_server.listen_port},
    collection_name="cities",
    drop_old=True,
)

In [None]:
# Display the first chunked Document (bare expression, so the notebook renders its repr)
# to check what the stored text and metadata look like.
file_texts[0]

Document(page_content="Chicago (  shih-KAH-goh, locally also  shih-KAW-goh; Miami-Illinois: Shikaakwa; Ojibwe: Zhigaagong) is the most populous city in the U.S. state of Illinois and the third-most populous in the United States after New York City and Los Angeles. With a population of 2,746,388 in the 2020 census, it is also the most populous city in the Midwest. As the seat of Cook County, the second-most populous county in the U.S., Chicago is the center of the Chicago metropolitan area.\nLocated on the shore of Lake Michigan, Chicago was incorporated as a city in 1837 near a portage between the Great Lakes and the Mississippi River watershed. It grew rapidly in the mid-19th century. In 1871, the Great Chicago Fire destroyed several square miles and left more than 100,000 homeless, but Chicago's population continued to grow. Chicago made noted contributions to urban planning and architecture, such as the Chicago School, the development of the City Beautiful Movement, and the steel-fr

In [None]:
# Expose the vector store as a retriever so it can be used as a step in the chains below.
# No search options are passed, so the library defaults apply.
retriever = vector_store.as_retriever()

In [None]:
# RAG prompt: the model is told to answer from the retrieved {context} only.
# Note that this rebinds `template` and `prompt` from the earlier instruction-style prompt.
template = (
    "Answer the question based only on the following context:\n"
    "{context}\n"
    "\n"
    "Question: {question}\n"
)
prompt = PromptTemplate.from_template(template)

In [None]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
# Retrieval-augmented chain: fetch the relevant chunks, fill the prompt, call the LLM
# and return the completion as a plain string.
chain = (
    {
        # Join the text of the retrieved chunks. Passing the retriever output straight
        # into the prompt would interpolate the Python repr of a list of Document
        # objects (escaped newlines, metadata dicts) instead of readable context.
        "context": retriever | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        # The user's question is forwarded unchanged.
        "question": RunnablePassthrough(),
    }
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
# Ask a question that is answered from the retrieved city documents; the bare
# expression makes the notebook display the returned string.
chain.invoke("How big is the city of Seattle?")

" Based on the provided document, as of 2022, the city of Seattle has a population of 749,256. The Seattle metropolitan area's population is 4.02 million, making it the 15th-largest in the United States. The city has been experiencing steady population growth, but it had its first population decline in 50 years in 2021."

In [None]:
# Let's make this a bit more fun and showcase the multilingual capabilities of Mixtral, which really outshine other open-source models.

# Our vector DB is populated with entries from English text - even the embedding model we're using here, GTE-Large,
# works best on English text. However, Mixtral has good multilingual capabilities in French, German, Spanish and Italian.
# So what we'll do is ask the assistant to answer only in French, in both the system prompt and the user prompt. RAG here is
# performed on English text, but when producing the response, the Mixtral LLM will generate tokens in a different language (French).
# System message instructing the model to reply in French.
french_system_message = {
    "role": "system",
    "content": "You are a helpful assistant who responds in French and not in English.",
}

# Same request parameters as the English LLM except for the temperature and the system message.
french_model_kwargs = dict(
    model="mixtral-8x7b-instruct-fp16",
    max_tokens=128,
    presence_penalty=0,
    temperature=0.1,
    top_p=0.9,
    messages=[french_system_message],
)

french_llm = OctoAIEndpoint(
    endpoint_url="https://text.octoai.run/v1/chat/completions",
    model_kwargs=french_model_kwargs,
)

# RAG prompt that repeats the "answer in French" instruction on the user side.
french_template = """Answer the question in French based only on the following context:
{context}

Question: {question}
"""
french_prompt = PromptTemplate.from_template(french_template)

In [None]:
# Same retrieval-augmented pipeline as the English chain, but with the French prompt
# and the French-configured LLM.
french_chain = (
    {
        # Join the text of the retrieved chunks. Passing the retriever output straight
        # into the prompt would interpolate the Python repr of a list of Document
        # objects (escaped newlines, metadata dicts) instead of readable context.
        "context": retriever | (lambda docs: "\n\n".join(doc.page_content for doc in docs)),
        # The user's question is forwarded unchanged.
        "question": RunnablePassthrough(),
    }
    | french_prompt
    | french_llm
    | StrOutputParser()
)

In [None]:
# English question sent through the French chain; the prompt and system message ask
# for a French answer regardless of the question's language.
fr_1 = french_chain.invoke("How big is the city of Seattle?")

In [None]:
# The same question, this time asked in French ("How big is the city of Seattle?").
fr_2 = french_chain.invoke("Quelle est la taille de la ville de Seattle?")

In [None]:
# A second French question ("What are the industries in Seattle?").
fr_3 = french_chain.invoke("Quelles sont les industries à Seattle?")

In [None]:
from pprint import pprint

# Pretty-print the three answers in the order the questions were asked.
for french_answer in (fr_1, fr_2, fr_3):
    pprint(french_answer)

(' La ville de Seattle est assez grande avec une population de 749 256 '
 "habitants en 2022. C'est la ville la plus peuplée de l'État de Washington et "
 "de la région du Nord-Ouest Pacifique de l'Amérique du Nord. L'aire "
 "métropolitaine de Seattle compte 4,02 millions d'habitants, ce qui en fait "
 'la 15e plus grande aire métropolitaine des États-Unis. La croissance de la '
 'population de Seattle a été rapide, avec une augmentation de')
(" La ville de Seattle a une population de 749 256 habitants en 2022. L'aire "
 "métropolitaine de Seattle compte 4,02 millions d'habitants, ce qui en fait "
 'la 15ème plus grande région métropolitaine des États-Unis.')
(" Les industries à Seattle comprennent l'aviation, avec la présence de "
 'Boeing, le commerce de détail, avec des entreprises comme Amazon, Starbucks '
 'et Nordstrom, la fabrication, avec des sociétés comme Paccar et Nintendo of '
 'America, et les soins de santé, avec des organisations comme le Bill & '
 'Melinda Gates Founda

In [None]:
# Spanish question ("What landmarks are there in Seattle?") sent through the French
# chain to see how the model copes with a third language.
sp_1 = french_chain.invoke("¿Qué puntos de referencia hay en Seattle?")

In [None]:
# Show the answer to the Spanish question. In the recorded run the reply came back in
# English rather than French.
pprint(sp_1)

(" Based on the provided document, I don't see a direct list of landmarks in "
 'Seattle. However, I can tell you that Seattle is known for several '
 'landmarks, including the Space Needle, Pike Place Market, and the Gum Wall. '
 'The Port of Seattle is also a significant point of interest as one of the '
 'largest ports in North America.')


In [None]:
# Hungarian question ("What sights are there in Seattle") sent through the French chain.
hung_1 = french_chain.invoke("Milyen látnivalók vannak Seattle-ben")

In [None]:
# Show the answer to the Hungarian question. In the recorded run the reply is in
# Hungarian and stops mid-word — presumably cut off by the max_tokens limit (verify).
pprint(hung_1)

(' Seattle, a város gazdag kulturális és történelmi látnivalókban. A '
 'belvárosban található a Pioneer Square, ahol a város alapítói letelepedtek. '
 'Itt található a Seattle-i Történelmi Társaság Múzeuma is, ahol a város '
 'történetét ismerheted meg.\n'
 '\n'
 'A városban sok zenei eseményt is rendeznek, különösen a jazz és a rock z')


In [None]:
# Turkish question ("Which landmark structures are there in Seattle?") sent through the French chain.
turk_1 = french_chain.invoke("Seattle'da hangi simge yapılar var?")

In [None]:
# Show the answer to the Turkish question. In the recorded run the reply came back in
# English rather than French.
pprint(turk_1)

(' Based on the provided document, Seattle is known for having several '
 'landmarks and symbols. One of the most famous is the Space Needle, a tower '
 'with an observation deck that offers panoramic views of the city. Another '
 'iconic symbol is the Pike Place Market, a public market overlooking the '
 'Elliott Bay waterfront, which is famous for its flying fish and fresh '
 'produce. The city is also associated with the image of the majestic Mount '
 'Rainier, a stratovolcano that can be seen from many parts of Seattle on a '
 "clear day. Additionally, the city's official bird is the Great Blue Her")
