In [None]:
# ! pip install -qU pymilvus langchain sentence-transformers tiktoken octoai-sdk openai 
# ! zsh ../standalone_embed.sh start

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [2]:
from langchain_openai import OpenAI
llm = OpenAI()

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

In [4]:
embeddings = HuggingFaceEmbeddings()

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [6]:
files = os.listdir("./french_city_data")

In [7]:
files

['Chicago.txt',
 'Lisbon.txt',
 'Berlin.txt',
 'Moscow.txt',
 'Copenhagen.txt',
 'Karachi.txt',
 'Paris.txt',
 'Houston.txt',
 'Seattle.txt',
 'Munich.txt',
 'Shanghai.txt',
 'Pékin.txt',
 'Beijing.txt',
 'London.txt',
 'Toronto.txt',
 'San Francisco.txt',
 'Atlanta.txt',
 'Boston.txt',
 'Tokyo.txt',
 'Cairo.txt']

In [8]:
file_texts = []

In [9]:
for file in files:
    with open(f"./french_city_data/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=64, 
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text, 
                metadata={"doc_title": file.split(".")[0], "chunk_num": i}))

Created a chunk of size 1335, which is longer than the specified 512
Created a chunk of size 701, which is longer than the specified 512
Created a chunk of size 620, which is longer than the specified 512
Created a chunk of size 600, which is longer than the specified 512
Created a chunk of size 725, which is longer than the specified 512
Created a chunk of size 828, which is longer than the specified 512
Created a chunk of size 552, which is longer than the specified 512
Created a chunk of size 611, which is longer than the specified 512
Created a chunk of size 673, which is longer than the specified 512
Created a chunk of size 571, which is longer than the specified 512
Created a chunk of size 802, which is longer than the specified 512
Created a chunk of size 907, which is longer than the specified 512
Created a chunk of size 607, which is longer than the specified 512
Created a chunk of size 876, which is longer than the specified 512
Created a chunk of size 768, which is longer th

In [10]:
# For the first run
# 
vector_store = Milvus.from_documents(
    file_texts,
    embedding=embeddings,
    connection_args={"host": "localhost", "port": 19530},
    collection_name="french_cities"
)

# if you already have the data you need stored in Milvus
# vector_store = Milvus(
#     embedding_function=embeddings,
#     connection_args={"host": "localhost", "port": 19530},
#     collection_name="french cities"
# )

In [11]:
retriever = vector_store.as_retriever()

In [12]:
from langchain.prompts import ChatPromptTemplate
template="""You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Answer in French.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [13]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [14]:
response = chain.invoke("Tell me a historical fact about Karachi.")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
response

" Karachi a été mentionnée pour la première fois dans l'ouvrage Histoire des plantes de Théophraste au IIIe siècle av. J.-C. Elle a été occupée par les Britanniques au début du XIXe siècle et est devenue la capitale du Sind en 1839. En 1876, le futur fondateur du Pakistan, Muhammad Ali Jinnah, est né et enterré à Karachi."

In [16]:
response_2 = chain.invoke("Racontez-moi un fait historique sur Karachi.")

In [17]:
response_2

" Karachi est une ville qui a été fondée par les Britanniques au début du XIXe siècle et qui est devenue la capitale du Sind. Elle a été un important centre économique et a connu une croissance rapide, notamment grâce à son port. Depuis les années 1980, la ville a été le théâtre de conflits ethniques et religieux, et en 2012, elle a été le site de l'incendie industriel le plus meurtrier de l'histoire."