In [1]:
from langchain.document_loaders import PDFPlumberLoader, DirectoryLoader

In [7]:
# Walk .DEMO/ recursively ("**") and hand every *.pdf file to PDFPlumberLoader.
loader = DirectoryLoader(
    ".DEMO/",
    glob="**/*.pdf",
    loader_cls=PDFPlumberLoader,
)

In [12]:
# Run the loader. The metadata printed further down shows one entry per PDF page
# (note the 'page' / 'total_pages' keys).
docs = loader.load()

In [15]:
# Number of Document entries the loader returned.
len(docs)

5

In [13]:
# Inspect the provenance of each loaded entry (source path, page index, PDF properties).
for doc in docs:
    print(doc.metadata)

{'source': '.DEMO/DEK Technologies Vietnam Environmental Policy.pdf', 'file_path': '.DEMO/DEK Technologies Vietnam Environmental Policy.pdf', 'page': 0, 'total_pages': 3, 'Author': 'cuong.n.truong', 'CreationDate': "D:20201029145236+07'00'", 'ModDate': "D:20201029145236+07'00'", 'Producer': 'Microsoft: Print To PDF', 'Title': 'Microsoft Word - DEK Technologies Vietnam Environmental Policy A'}
{'source': '.DEMO/DEK Technologies Vietnam Environmental Policy.pdf', 'file_path': '.DEMO/DEK Technologies Vietnam Environmental Policy.pdf', 'page': 1, 'total_pages': 3, 'Author': 'cuong.n.truong', 'CreationDate': "D:20201029145236+07'00'", 'ModDate': "D:20201029145236+07'00'", 'Producer': 'Microsoft: Print To PDF', 'Title': 'Microsoft Word - DEK Technologies Vietnam Environmental Policy A'}
{'source': '.DEMO/DEK Technologies Vietnam Environmental Policy.pdf', 'file_path': '.DEMO/DEK Technologies Vietnam Environmental Policy.pdf', 'page': 2, 'total_pages': 3, 'Author': 'cuong.n.truong', 'Creation

In [10]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Target chunk size of 1000 with 200 shared between neighbouring chunks, so text
# cut at a chunk boundary still appears intact in one of the two chunks.
splitter = RecursiveCharacterTextSplitter(chunk_overlap=200, chunk_size=1000)

In [14]:
# Apply the splitter to the loaded documents, then display how many chunks resulted.
chunks = splitter.split_documents(docs)
len(chunks)

10

In [31]:
# Spot-check: character count of the first chunk, to compare against the configured chunk_size.
len(chunks[0].page_content)

970

In [16]:
from langchain.embeddings import HuggingFaceInstructEmbeddings

# Instructor embedding model from the Hugging Face hub, explicitly placed on the CPU.
embeddings = HuggingFaceInstructEmbeddings(
    model_kwargs=dict(device="cpu"),
    model_name="hkunlp/instructor-large",
)

  from tqdm.autonotebook import trange


load INSTRUCTOR_Transformer
max_seq_length  512


In [26]:
# Smoke test: embed the text of the first chunk and display the resulting vector.
# NOTE(review): this displays every component of the vector; showing only its
# length or a short slice would keep the saved notebook much smaller.
embeddings.embed_query(chunks[0].page_content)

[-0.019714122638106346,
 -0.01188711728900671,
 -0.0008289535762742162,
 -0.029276864603161812,
 0.05548582226037979,
 0.01794351078569889,
 -0.002873651683330536,
 -0.010323842987418175,
 -0.0390128567814827,
 0.030891532078385353,
 0.02300182543694973,
 -0.02910073846578598,
 0.03372665122151375,
 0.039224397391080856,
 -0.05285296589136124,
 -0.056836191564798355,
 -0.02882777340710163,
 -0.0001753516698954627,
 -0.10293170809745789,
 -0.0010982127860188484,
 0.046982310712337494,
 -0.004307083785533905,
 -0.01969161257147789,
 0.015154782682657242,
 0.00045917759416624904,
 0.008493468165397644,
 0.015808895230293274,
 -0.00647988636046648,
 0.03462284803390503,
 -0.054446954280138016,
 -0.009126508608460426,
 -0.057950761169195175,
 -0.038231395184993744,
 -0.017411144450306892,
 -0.013174974359571934,
 0.006909512914717197,
 0.038878608494997025,
 0.018431726843118668,
 0.024555524811148643,
 0.040673088282346725,
 0.005754735320806503,
 0.009767132811248302,
 -0.0042665628716349

In [20]:
from langchain_community.vectorstores.chroma import Chroma

# Embed every chunk with `embeddings` and index the result in a Chroma vector store.
# No persist_directory is passed here.
store = Chroma.from_documents(documents=chunks, embedding=embeddings)

In [21]:
# Query used for both the retrieval check and the final chain run.
# NOTE(review): "anual" is presumably a typo for "annual". The recorded debug trace
# below echoes this exact string, so fix it and re-run the notebook together.
question = "What is anual trip"

In [22]:
# Expose the vector store through the retriever interface, using as_retriever()'s defaults.
retriever = store.as_retriever()

In [24]:
# Retrieval-only check: fetch the chunks the retriever returns for the question,
# before any LLM is involved.
retrieved_docs = retriever.invoke(question)

In [25]:
# Dump each retrieved chunk, preceded by a '----' separator line.
for doc in retrieved_docs:
    print('----', doc.page_content, sep='\n')

----
Prepared: Support Function Document no: DEK-14:0123
Approved: Daniel Tedesco Revision: C
Security Class: Internal Date: 28/07/2020
Document Type: Policy
DEK Technologies Vietnam Annual Trip Policy
1 Purpose
To clearly define the expectations and behavior which allows members to have an enjoyable annual trip.
2 Effective Date
This version of the policy is effective from the 1st of August 2020.
3 Principles
The annual trip is:
 A benefit for the members, a way to show appreciation to all members for their contribution
during the year.
 An opportunity to get to know everyone better in order to build closer relationships and team
work.
4 Policy
All members are encouraged to attend the annual trip; however, it is not a mandatory event.
Some events at the annual trip are mandatory, while others are voluntary to participate in. Members
must be involved in all mandatory events. If a member is missing any mandatory event during the trip
----
must be involved in all mandatory events. If a

In [27]:
from langchain.prompts import PromptTemplate

# Grounded-QA prompt with two input variables: {question} and {context}.
#
# Changes from the previous wording:
# - Fixed the typos and grammar in the instruction ("assitant", "answer question",
#   "Use only in the context").
# - Dropped the literal "<s> [INST] ... [/INST] </s>" markers. Those are
#   Llama/Mistral-style control tags; the model used in this notebook is phi3
#   behind ChatOllama, so they would only reach the model as stray text.
# - Built the template from adjacent string literals so that source indentation
#   and trailing spaces no longer leak into the prompt.
prompt = PromptTemplate.from_template(
    "You are a helpful assistant that answers questions about the ingested "
    "policy documents. Use only the context below; do not use any information "
    "from outside this context. If you don't know the answer, just say that "
    "you don't know.\n\n"
    "Question: {question}\n"
    "Context: {context}\n"
    "Answer:"
)

In [28]:
from langchain_community.chat_models import ChatOllama

# Chat model wrapper for the "phi3" model served through Ollama.
# NOTE(review): presumably needs a running Ollama instance with phi3 already pulled — confirm.
model = ChatOllama(model="phi3")

In [32]:
from langchain.globals import set_debug

# Turn on LangChain's global debug mode; the [chain/start] / [chain/end] trace
# recorded under the final cell comes from this setting.
set_debug(True)

In [29]:
from langchain.schema.runnable import RunnableConfig, RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

def format_docs(docs):
    """Join the text of retrieved documents into a single context string.

    Args:
        docs: Iterable of retrieved documents; each must expose ``page_content``.

    Returns:
        The ``page_content`` of every document, separated by blank lines.
        An empty input yields an empty string.
    """
    return "\n\n".join(doc.page_content for doc in docs)


# RAG pipeline: retrieve -> fill prompt -> chat model -> plain string.
# The retriever output is piped through format_docs so that {context} receives the
# chunk text only. Without it, the list of Document objects is interpolated as its
# Python repr, metadata included.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | model
    | StrOutputParser()
)

In [33]:
# Run the full pipeline on the question; the parsed string answer is displayed as
# the cell result (after the debug trace).
chain.invoke(question)

[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence] Entering Chain run with input:
[0m{
  "input": "What is anual trip"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question>] Entering Chain run with input:
[0m{
  "input": "What is anual trip"
}
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question> > 3:chain:RunnablePassthrough] Entering Chain run with input:
[0m{
  "input": "What is anual trip"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question> > 3:chain:RunnablePassthrough] [1ms] Exiting Chain run with output:
[0m{
  "output": "What is anual trip"
}
[36;1m[1;3m[chain/end][0m [1m[1:chain:RunnableSequence > 2:chain:RunnableParallel<context,question>] [156ms] Exiting Chain run with output:
[0m[outputs]
[32;1m[1;3m[chain/start][0m [1m[1:chain:RunnableSequence > 5:prompt:PromptTemplate] Entering Prompt run wi

'An annual trip is a benefit for the members of DEK Technologies Vietnam and serves as an opportunity to strengthen relationships among team members by getting to know everyone better during their time off work. Members are encouraged to attend this event, though it is not mandatory; however, they must be involved in all required events if attending. Failure to attend any mandatory event without prior written approval results in the member being responsible for the unrecoverable costs of that absence. The trip lasts three days, with two weekend days and one weekday. For those who cannot attend, their office will be operational as usual on the weekday they miss out. Any additional expenses paid by the company are expected to be repaited by the member if chosen options for travel are changed after registration.'