In [1]:
# Load the exported Notion page as a markdown string
from langchain.document_loaders import NotionDirectoryLoader
path='./notion_docs'
loader = NotionDirectoryLoader(path)
docs = loader.load()
# Fail with a clear message instead of a bare IndexError if the loader returned no documents
if not docs:
    raise FileNotFoundError(f"No Notion markdown documents found under {path!r}")
# Only the first loaded document is used by the rest of the notebook
md_file=docs[0].page_content

In [2]:
# Display the raw markdown text taken from the first loaded document
md_file

'# Engineering Role Definitions\n\n> Note:\xa0Although the levels may be different, these tend to map pretty cleanly to similar hierarchies at Google and Microsoft, but they have no correlation to hierarchies outside of the "club" of elite companies (Microsoft/Amazon/Google/Facebook, etc. on the west coast, Bloomberg/Goldman/HFT firms, etc. on the east coast).\n> \n> \n> People with 10-15 years of software engineering experience interview\xa0*all the time*\xa0at these companies, but they may not be fit for much more than a junior engineering role (SDE I or SDE II, depending on which company you\'re looking at).\n> \n\n## Junior Engineer/Intern:\n\n- Is essentially a new and inexperienced (below the level of Amazon/Microsoft/Google engineers) junior engineer.\n- Attending college for computer science, or really good at teaching themselves from books and online tutorials.\n- Can probably code something but not design it.\n- Needs a lot of hand holding (from other software engineers) to c

In [3]:
# Group the page by its level-2 ("##") headers; "Section" is the name paired with that header level
from langchain.text_splitter import MarkdownHeaderTextSplitter
headers_to_split_on = [("##", "Section")]
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
)
md_header_splits = markdown_splitter.split_text(md_file)

In [4]:
# Number of groups produced by the header split
len(md_header_splits)

10

In [5]:
# Break every header group into small chunks; both sizes are passed straight to the splitter
from langchain.text_splitter import RecursiveCharacterTextSplitter
chunk_size, chunk_overlap = 64, 8
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
)
all_splits = text_splitter.split_documents(md_header_splits)

In [6]:
# Inspect the chunked Document objects produced by the text splitter
all_splits

[Document(page_content='# Engineering Role Definitions', metadata={}),
 Document(page_content='> Note:\xa0Although the levels may be different, these tend to map', metadata={}),
 Document(page_content='to map pretty cleanly to similar hierarchies at Google and', metadata={}),
 Document(page_content='and Microsoft, but they have no correlation to hierarchies', metadata={}),
 Document(page_content='outside of the "club" of elite companies', metadata={}),
 Document(page_content='(Microsoft/Amazon/Google/Facebook, etc. on the west coast,', metadata={}),
 Document(page_content='coast, Bloomberg/Goldman/HFT firms, etc. on the east coast).', metadata={}),
 Document(page_content='>\n>', metadata={}),
 Document(page_content='> People with 10-15 years of software engineering experience', metadata={}),
 Document(page_content='interview\xa0*all the time*\xa0at these companies, but they may not', metadata={}),
 Document(page_content='may not be fit for much more than a junior engineering role', met

In [7]:
# Total number of chunks that will be handed to the vector store
len(all_splits)

72

In [8]:
from milvus import default_server

In [9]:
# Start the local Milvus server; later cells read its port from default_server.listen_port
default_server.start()

In [10]:
from langchain.vectorstores import Milvus
from langchain.embeddings import OpenAIEmbeddings

In [11]:
# Build the Milvus vector store from the chunks, connecting to the local
# server on the port reported by default_server
vectordb = Milvus.from_documents(
    documents=all_splits,
    embedding=OpenAIEmbeddings(),
    connection_args={
        "host": "127.0.0.1",
        "port": default_server.listen_port,
    },
    collection_name="EngineeringNotionDoc",
)

In [12]:
from langchain.llms import OpenAI
from langchain.retrievers.self_query.base import SelfQueryRetriever
from langchain.chains.query_constructor.base import AttributeInfo

# The single metadata field ("Section") described to the self-query retriever
metadata_fields_info = [
    AttributeInfo(
        name="Section",
        type="string or list[string]",
        description="Part of the document that the text comes from",
    ),
]

# temperature=0 is passed to the LLM that the retriever is built from
llm = OpenAI(temperature=0)
document_content_description = "Major sections of the document"

retriever = SelfQueryRetriever.from_llm(
    llm,
    vectordb,
    document_content_description,
    metadata_fields_info,
    verbose=True,
)

In [16]:
# Ask a natural-language question through the self-query retriever
retriever.get_relevant_documents("What makes a distinguished engineer?")

query='distinguished engineer' filter=None limit=None


RPC error: [search], <MilvusException: (code=1, message=syncTimestamp Failed:err: failed to connect 172.19.0.186:53100, reason: context deadline exceeded
, /Users/jibin/Working/milvus-lite/milvus_binary/milvus/pkg/tracer/stack_trace.go:51 github.com/milvus-io/milvus/pkg/tracer.StackTrace
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/util/grpcclient/client.go:405 github.com/milvus-io/milvus/internal/util/grpcclient.(*ClientBase[...]).ReCall
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/distributed/rootcoord/client/client.go:120 github.com/milvus-io/milvus/internal/distributed/rootcoord/client.wrapGrpcCall[...]
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/distributed/rootcoord/client/client.go:321 github.com/milvus-io/milvus/internal/distributed/rootcoord/client.(*Client).AllocTimestamp
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/proxy/timestamp.go:61 github.com/milvus-io/milvus/internal/proxy.(*timestampAllocator).al

MilvusException: <MilvusException: (code=1, message=syncTimestamp Failed:err: failed to connect 172.19.0.186:53100, reason: context deadline exceeded
, /Users/jibin/Working/milvus-lite/milvus_binary/milvus/pkg/tracer/stack_trace.go:51 github.com/milvus-io/milvus/pkg/tracer.StackTrace
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/util/grpcclient/client.go:405 github.com/milvus-io/milvus/internal/util/grpcclient.(*ClientBase[...]).ReCall
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/distributed/rootcoord/client/client.go:120 github.com/milvus-io/milvus/internal/distributed/rootcoord/client.wrapGrpcCall[...]
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/distributed/rootcoord/client/client.go:321 github.com/milvus-io/milvus/internal/distributed/rootcoord/client.(*Client).AllocTimestamp
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/proxy/timestamp.go:61 github.com/milvus-io/milvus/internal/proxy.(*timestampAllocator).alloc
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/proxy/timestamp.go:83 github.com/milvus-io/milvus/internal/proxy.(*timestampAllocator).AllocOne
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/proxy/task_scheduler.go:170 github.com/milvus-io/milvus/internal/proxy.(*baseTaskQueue).Enqueue
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/proxy/impl.go:2667 github.com/milvus-io/milvus/internal/proxy.(*Proxy).Search
/Users/jibin/Working/milvus-lite/milvus_binary/milvus/internal/distributed/proxy/service.go:830 github.com/milvus-io/milvus/internal/distributed/proxy.(*Server).Search
/Users/jibin/go/pkg/mod/github.com/milvus-io/milvus-proto/go-api/v2@v2.3.0/milvuspb/milvus.pb.go:11836 github.com/milvus-io/milvus-proto/go-api/v2/milvuspb._MilvusService_Search_Handler.func1
)>

In [1]:
# Stop the local Milvus server.
# NOTE(review): the recorded NameError comes from this cell being run as In [1] of a
# fresh kernel, before `from milvus import default_server` had been executed. Cells
# further down still use default_server.listen_port — confirm the intended run order.
default_server.stop()

NameError: name 'default_server' is not defined

In [28]:
from pymilvus import utility, connections

In [29]:
# Open a pymilvus connection to the local server on default_server's listen port
connections.connect(host="127.0.0.1", port=default_server.listen_port)

In [36]:
# Clean up the collection created by Milvus.from_documents above; the name must
# match the collection_name used there ("EngineeringNotionDoc")
utility.drop_collection("EngineeringNotionDoc")