In [None]:
# ! pip install -qU pymilvus langchain sentence-transformers tiktoken octoai-sdk openai 
# ! zsh ../standalone_embed.sh start

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [2]:
from langchain_openai import OpenAI
llm = OpenAI()

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

In [4]:
embeddings = HuggingFaceEmbeddings(model_name="sdadas/mmlw-roberta-base")

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [6]:
files = os.listdir("./polish_city_data")

In [7]:
files

['Chicago.txt',
 'Berlin.txt',
 'Lizbona.txt',
 'Paryż.txt',
 'Szanghaj.txt',
 'Houston.txt',
 'Seattle.txt',
 'Karaczi.txt',
 'Kopenhaga.txt',
 'Monachium.txt',
 'Tokio.txt',
 'Toronto.txt',
 'Londyn.txt',
 'San Francisco.txt',
 'Atlanta.txt',
 'Boston.txt',
 'Pekin.txt',
 'Moskwa.txt',
 'Kair.txt']

In [8]:
file_texts = []

In [9]:
for file in files:
    with open(f"./polish_city_data/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=64, 
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text, 
                metadata={"doc_title": file.split(".")[0], "chunk_num": i}))

Created a chunk of size 729, which is longer than the specified 512
Created a chunk of size 835, which is longer than the specified 512
Created a chunk of size 1448, which is longer than the specified 512
Created a chunk of size 633, which is longer than the specified 512
Created a chunk of size 922, which is longer than the specified 512
Created a chunk of size 1077, which is longer than the specified 512
Created a chunk of size 542, which is longer than the specified 512
Created a chunk of size 547, which is longer than the specified 512
Created a chunk of size 561, which is longer than the specified 512
Created a chunk of size 761, which is longer than the specified 512
Created a chunk of size 609, which is longer than the specified 512
Created a chunk of size 816, which is longer than the specified 512
Created a chunk of size 721, which is longer than the specified 512
Created a chunk of size 633, which is longer than the specified 512
Created a chunk of size 3028, which is longer 

In [10]:
# For the first run
# 
vector_store = Milvus.from_documents(
    file_texts,
    embedding=embeddings,
    connection_args={"host": "localhost", "port": 19530},
    collection_name="polish_cities"
)

# if you already have the data you need stored in Milvus
# vector_store = Milvus(
#     embedding_function=embeddings,
#     connection_args={"host": "localhost", "port": 19530},
#     collection_name="french cities"
# )

In [11]:
retriever = vector_store.as_retriever()

In [12]:
from langchain.prompts import ChatPromptTemplate
template="""You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Answer in Polish.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [13]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [14]:
response = chain.invoke("Which sports teams are in Chicago?")

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
response

' Drużyny sportowe w Chicago to Chicago Cubs, Chicago White Sox, Chicago Blackhawks, Chicago Bulls, Chicago Bears i Chicago Fire.'

In [16]:
response_2 = chain.invoke("Które drużyny sportowe są w Chicago?")

In [17]:
response_2

' W Chicago znajdują się drużyny sportowe Chicago Cubs, Chicago White Sox, Chicago Blackhawks, Chicago Bulls, Chicago Bears i Chicago Fire.'