In [None]:
# ! pip install -qU pymilvus langchain sentence-transformers tiktoken openai 
# ! zsh ../standalone_embed.sh start

In [1]:
from dotenv import load_dotenv
import os

load_dotenv()
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY")

In [2]:
from langchain_openai import OpenAI
llm = OpenAI()

In [3]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import Milvus

In [5]:
embeddings = HuggingFaceEmbeddings(model_name="TownsWu/PEG")

No sentence-transformers model found with name TownsWu/PEG. Creating a new one with MEAN pooling.


In [6]:
from langchain.text_splitter import CharacterTextSplitter
from langchain.schema import Document
import os

In [7]:
files = os.listdir("./chinese_city_data")

In [8]:
files

['开罗.txt',
 '多伦多.txt',
 '莫斯科.txt',
 '芝加哥.txt',
 '柏林.txt',
 '里斯本.txt',
 '休斯敦.txt',
 '亚特兰大.txt',
 '卡拉奇.txt',
 '上海市_(中華民國).txt',
 '波士顿.txt',
 '巴黎.txt',
 '哥本哈根.txt',
 '慕尼黑.txt',
 '西雅圖.txt',
 '东京.txt',
 '北京市.txt',
 '伦敦.txt',
 '旧金山.txt']

In [9]:
file_texts = []

In [10]:
for file in files:
    with open(f"./chinese_city_data/{file}") as f:
        file_text = f.read()
    text_splitter = CharacterTextSplitter.from_tiktoken_encoder(
        chunk_size=512, chunk_overlap=64, 
    )
    texts = text_splitter.split_text(file_text)
    for i, chunked_text in enumerate(texts):
        file_texts.append(Document(page_content=chunked_text, 
                metadata={"doc_title": file.split(".")[0], "chunk_num": i}))

Created a chunk of size 1343, which is longer than the specified 512
Created a chunk of size 614, which is longer than the specified 512
Created a chunk of size 1809, which is longer than the specified 512
Created a chunk of size 1798, which is longer than the specified 512
Created a chunk of size 1239, which is longer than the specified 512
Created a chunk of size 852, which is longer than the specified 512
Created a chunk of size 1497, which is longer than the specified 512
Created a chunk of size 584, which is longer than the specified 512
Created a chunk of size 585, which is longer than the specified 512
Created a chunk of size 567, which is longer than the specified 512
Created a chunk of size 1481, which is longer than the specified 512
Created a chunk of size 1752, which is longer than the specified 512
Created a chunk of size 1745, which is longer than the specified 512
Created a chunk of size 1046, which is longer than the specified 512
Created a chunk of size 1077, which is 

In [11]:
# For the first run
# 
vector_store = Milvus.from_documents(
    file_texts,
    embedding=embeddings,
    connection_args={"host": "localhost", "port": 19530},
    collection_name="chinese_cities"
)

# if you already have the data you need stored in Milvus
# vector_store = Milvus(
#     embedding_function=embeddings,
#     connection_args={"host": "localhost", "port": 19530},
#     collection_name="chinese_cities"
# )

In [12]:
retriever = vector_store.as_retriever()

In [13]:
from langchain.prompts import ChatPromptTemplate
template="""You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use three sentences maximum and keep the answer concise.
Answer in Chinese.
Question: {question} 
Context: {context} 
Answer:"""
prompt = ChatPromptTemplate.from_template(template)

In [14]:
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser
chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [18]:
response = chain.invoke("东京 应该参观哪些地标？")

In [19]:
response

' 东京塔，皇居，浅草寺'

In [20]:
response_2 = chain.invoke("What landmarks should I visit in Tokyo?")

In [21]:
response_2

' 你应该去东京的富士箱根伊豆国立公园和东京晴空塔。'