In [259]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
import os

In [260]:
def data_loading(DATA_PATH):
    loader = PyPDFDirectoryLoader(DATA_PATH)
    return loader.load()

def documents_splitter(docs:list , chunk_size:int=500 ,chunk_overlap:int=100):
    splitter = RecursiveCharacterTextSplitter(
    chunk_size=chunk_size,
    chunk_overlap=chunk_overlap,
    length_function=len,
    )
    return splitter.split_documents(docs)

def generate_chunk_id(docs_chunks):
    current_page = 0
    doc_idx = 0
    
    for doc in docs_chunks:
        page = doc.metadata['page']
        source = doc.metadata['source']
        filename = os.path.basename(source)

        #page didnt change
        if page == current_page:
            
            #chunks_id.append(f'{page}-{doc_idx}')
            doc.metadata['id'] = f'{filename}:{page}-{doc_idx}'
            doc_idx += 1   
        #page changed    
        else:
            #source changed
            if page == 0 :
                current_page = 0
                doc_idx = 0
                doc.metadata['id'] = f'{filename}:{page}-{doc_idx}'
                doc_idx += 1
            #source didnt change but page 
            else:
                current_page += 1
                doc_idx = 0
                #chunks_id.append(f'{page}-{doc_idx}')
                doc.metadata['id'] = f'{filename}:{page}-{doc_idx}'
                doc_idx += 1
    return docs_chunks


In [261]:
from langchain.embeddings import OllamaEmbeddings
def embedding_model(model = "all-minilm"):
    embed_model = OllamaEmbeddings(model=model)
    return embed_model
    
    #embeddings.embed_documents(docs)

In [262]:
DATA_PATH = "/Users/nachanon/projects/rag_llm/data"
docs = data_loading(DATA_PATH)
docs_chunks = documents_splitter(docs)
embed_model = embedding_model()
docs_chunks_with_id = generate_chunk_id(docs_chunks)

In [204]:
## INITIALIZE DATABASE 

import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
FAISS_PATH = '/Users/nachanon/projects/rag_llm/faiss'

#this line of code below use to set dimension of embed vector that we want to store into vec db
index = faiss.IndexFlatL2(len(embed_model.embed_query("hello world")))

vector_store = FAISS(
    embedding_function=embedding_model(),
    index=index,
    docstore= InMemoryDocstore(),
    index_to_docstore_id={}
)


vector_store.save_local(FAISS_PATH)

In [265]:
## LOAD DATABASE
FAISS_PATH = '/Users/nachanon/projects/rag_llm/faiss'
v2 = FAISS.load_local(FAISS_PATH,embedding_model(),allow_dangerous_deserialization=True)



In [263]:
docs_chunks_with_id

[Document(metadata={'source': '/Users/nachanon/projects/rag_llm/data/monopoly.pdf', 'page': 0, 'id': 'monopoly.pdf:0-0'}, page_content='MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic'),
 Document(metadata={'source': '/Users/nachanon/projects/rag_llm/data/monopoly.pdf', 'page': 0, 'id': 'monopoly.pdf:0-1'}, page_content="Speed Die to get into the action faster. If you've never played the classic \nMONOPOLY game, refer to the Classic Rules beginning on the next page. \nIf you already know how to play and want to use the Speed Die, just \nread the section below for the additional Speed Die rules.

In [275]:
#this section of code below is to update chunk of text that doest exist in my db

not_found_docs = []
for doc in docs_chunks_with_id:
    if type(v2.docstore.search(doc.metadata['id'])) == str:
        not_found_docs.append(doc)

not_found_docs
######check point for today : update db : next-> more functional coding
#vector_store.add_documents(documents=docs_chunks_with_id, ids= [doc.metadata['id'] for doc in docs_chunks_with_id])

[Document(metadata={'source': '/Users/nachanon/projects/rag_llm/data/sandwich.pdf', 'page': 0, 'id': 'sandwich.pdf:0-0'}, page_content='Note: The pork chop can be either frozen and heated up or cooked from scratch. (See my pork chop recipe for more info)  HOW TO MAKE A SANDWICH  \nHello everyone, My name is Alson and I’ll be guiding you on how to make the  perfect sandwich  that \ncan be ate at anytime and anywhere. For those who don’t know,the best feature for a sandwich  is \nthat you can freely customize them  in any way you want, but for this  instruction, I will guide you on'),
 Document(metadata={'source': '/Users/nachanon/projects/rag_llm/data/sandwich.pdf', 'page': 0, 'id': 'sandwich.pdf:0-1'}, page_content='how to make my personal favourite,the pork chop sandwich.  This sandwich is one of my recent \ndiscovery  and it was astonishing even with the most simplistic ingrediants. Let’s  get started by  \nwashing your hands and  prep for the ingrediants listed below the picture.  \

In [273]:
type(v2.docstore.search('sandwich.pdf:3-1')) == str

True

In [205]:
vector_store.add_documents(documents=docs_chunks_with_id, ids= [doc.metadata['id'] for doc in docs_chunks_with_id])

['monopoly.pdf:0-0',
 'monopoly.pdf:0-1',
 'monopoly.pdf:0-2',
 'monopoly.pdf:0-3',
 'monopoly.pdf:1-0',
 'monopoly.pdf:1-1',
 'monopoly.pdf:1-2',
 'monopoly.pdf:1-3',
 'monopoly.pdf:1-4',
 'monopoly.pdf:2-0',
 'monopoly.pdf:2-1',
 'monopoly.pdf:2-2',
 'monopoly.pdf:2-3',
 'monopoly.pdf:2-4',
 'monopoly.pdf:2-5',
 'monopoly.pdf:3-0',
 'monopoly.pdf:3-1',
 'monopoly.pdf:3-2',
 'monopoly.pdf:3-3',
 'monopoly.pdf:3-4',
 'monopoly.pdf:3-5',
 'monopoly.pdf:4-0',
 'monopoly.pdf:4-1',
 'monopoly.pdf:4-2',
 'monopoly.pdf:4-3',
 'monopoly.pdf:4-4',
 'monopoly.pdf:5-0',
 'monopoly.pdf:5-1',
 'monopoly.pdf:5-2',
 'monopoly.pdf:5-3',
 'monopoly.pdf:5-4',
 'monopoly.pdf:5-5',
 'monopoly.pdf:6-0',
 'monopoly.pdf:6-1',
 'monopoly.pdf:6-2',
 'monopoly.pdf:6-3',
 'monopoly.pdf:6-4',
 'monopoly.pdf:7-0',
 'monopoly.pdf:7-1',
 'monopoly.pdf:7-2',
 'monopoly.pdf:7-3',
 'monopoly.pdf:7-4',
 'monopoly.pdf:7-5',
 'pokers.pdf:0-0',
 'pokers.pdf:0-1',
 'pokers.pdf:1-0',
 'pokers.pdf:1-1',
 'pokers.pdf:1-2',
 '

In [205]:
vector_store.add_documents(documents=docs_chunks_with_id, ids= [doc.metadata['id'] for doc in docs_chunks_with_id])

['monopoly.pdf:0-0',
 'monopoly.pdf:0-1',
 'monopoly.pdf:0-2',
 'monopoly.pdf:0-3',
 'monopoly.pdf:1-0',
 'monopoly.pdf:1-1',
 'monopoly.pdf:1-2',
 'monopoly.pdf:1-3',
 'monopoly.pdf:1-4',
 'monopoly.pdf:2-0',
 'monopoly.pdf:2-1',
 'monopoly.pdf:2-2',
 'monopoly.pdf:2-3',
 'monopoly.pdf:2-4',
 'monopoly.pdf:2-5',
 'monopoly.pdf:3-0',
 'monopoly.pdf:3-1',
 'monopoly.pdf:3-2',
 'monopoly.pdf:3-3',
 'monopoly.pdf:3-4',
 'monopoly.pdf:3-5',
 'monopoly.pdf:4-0',
 'monopoly.pdf:4-1',
 'monopoly.pdf:4-2',
 'monopoly.pdf:4-3',
 'monopoly.pdf:4-4',
 'monopoly.pdf:5-0',
 'monopoly.pdf:5-1',
 'monopoly.pdf:5-2',
 'monopoly.pdf:5-3',
 'monopoly.pdf:5-4',
 'monopoly.pdf:5-5',
 'monopoly.pdf:6-0',
 'monopoly.pdf:6-1',
 'monopoly.pdf:6-2',
 'monopoly.pdf:6-3',
 'monopoly.pdf:6-4',
 'monopoly.pdf:7-0',
 'monopoly.pdf:7-1',
 'monopoly.pdf:7-2',
 'monopoly.pdf:7-3',
 'monopoly.pdf:7-4',
 'monopoly.pdf:7-5',
 'pokers.pdf:0-0',
 'pokers.pdf:0-1',
 'pokers.pdf:1-0',
 'pokers.pdf:1-1',
 'pokers.pdf:1-2',
 '

In [221]:
v2.similarity_search_with_relevance_scores('how much money in start game')



[(Document(metadata={'source': '/Users/nachanon/projects/rag_llm/data/monopoly.pdf', 'page': 2, 'id': 'monopoly.pdf:2-0'}, page_content="Each player is given $1,500 divided as follows: P each of $500s, \n$100~ and $50~; 6 $40~; 5 each of $105, $5~ and $Is. \nAll remaining money and other equipment go to the Bank. Stack the .. \nBank's money on edge in the compartments in the plastic Banker's tray. \nBANKER. Select as Banker a player who will also \nmake a good Auctioneer A Banker who plays \n~n the game must keep hislher personal funds \nseparate from those of the Bank. When more than"),
  -19.846157658683055),
 (Document(metadata={'source': '/Users/nachanon/projects/rag_llm/data/monopoly.pdf', 'page': 0, 'id': 'monopoly.pdf:0-2'}, page_content="/ \nfast as playing with i't. \n1. When starting the game, hand out an extra $1,000 to each player \n(two $5005 should work). The game moves fast and you'll need \nthe extra cash to buy and build. \n2. Do not use the Speed Die until you've land