In [43]:
from langchain.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters.character import RecursiveCharacterTextSplitter
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.docstore.in_memory import InMemoryDocstore
import os

In [44]:
def data_loading(DATA_PATH):
    """Load every PDF found under a directory.

    Args:
        DATA_PATH: Directory handed to ``PyPDFDirectoryLoader``.

    Returns:
        Whatever ``PyPDFDirectoryLoader(DATA_PATH).load()`` returns (the
        loaded documents).
    """
    return PyPDFDirectoryLoader(DATA_PATH).load()

def documents_splitter(docs: list, chunk_size: int = 500, chunk_overlap: int = 100):
    """Split documents into overlapping chunks.

    Args:
        docs: Documents to split.
        chunk_size: Maximum chunk length, measured with ``len``.
        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.

    Returns:
        The result of ``RecursiveCharacterTextSplitter.split_documents(docs)``.
    """
    splitter_options = {
        "chunk_size": chunk_size,
        "chunk_overlap": chunk_overlap,
        "length_function": len,
    }
    return RecursiveCharacterTextSplitter(**splitter_options).split_documents(docs)

def generate_chunk_id(docs_chunks):
    """Attach an id of the form ``'<filename>:<page>-<index>'`` to every chunk.

    ``<filename>`` is the basename of ``metadata['source']``, ``<page>`` is
    ``metadata['page']`` and ``<index>`` counts the chunks already seen for
    that (source, page) pair, starting at 0.

    The counter is keyed on ``(source, page)`` rather than on "the page number
    went up by one", so ids stay unique when:

    * page numbers skip (e.g. a page that produced no chunks),
    * a file's first chunk is not on page 0,
    * consecutive files share a page number (e.g. several one-page PDFs); each
      file then starts again at index 0 instead of continuing the previous
      file's counter.

    Args:
        docs_chunks: Iterable of objects exposing a ``metadata`` dict with
            ``'page'`` and ``'source'`` keys.

    Returns:
        The same ``docs_chunks`` object; each element's ``metadata`` is
        mutated in place to include the new ``'id'`` key.

    Raises:
        KeyError: If a chunk's metadata lacks ``'page'`` or ``'source'``.
    """
    # Next free index for every (source, page) pair encountered so far.
    next_index = {}

    for doc in docs_chunks:
        page = doc.metadata['page']
        source = doc.metadata['source']
        filename = os.path.basename(source)

        key = (source, page)
        chunk_idx = next_index.get(key, 0)
        next_index[key] = chunk_idx + 1

        doc.metadata['id'] = f'{filename}:{page}-{chunk_idx}'
    return docs_chunks


In [45]:
from langchain.embeddings import OllamaEmbeddings
def embedding_model(model = "all-minilm"):
    """Build an ``OllamaEmbeddings`` instance for the given model name.

    Args:
        model: Model name forwarded to ``OllamaEmbeddings``; defaults to
            ``"all-minilm"``.

    Returns:
        A newly constructed ``OllamaEmbeddings`` object (a fresh instance on
        every call).
    """
    return OllamaEmbeddings(model=model)
    
    #embeddings.embed_documents(docs)

In [46]:
# Ingestion pipeline: load PDFs -> split into chunks -> tag each chunk with an id.
# NOTE(review): DATA_PATH is a machine-specific absolute path; the notebook
# will not run elsewhere without editing it.
DATA_PATH = "/Users/nachanon/projects/rag_llm/data"
# Load every PDF found under DATA_PATH.
docs = data_loading(DATA_PATH)
# Split with the function's defaults (chunk_size=500, chunk_overlap=100).
docs_chunks = documents_splitter(docs)
# Embedding client using the default model name ("all-minilm").
embed_model = embedding_model()
# generate_chunk_id mutates the chunks' metadata in place and returns the same
# list, so docs_chunks_with_id and docs_chunks refer to the same object.
docs_chunks_with_id = generate_chunk_id(docs_chunks)

In [47]:
## INITIALIZE DATABASE

# Directory the FAISS index and docstore are written to.
FAISS_PATH = '/Users/nachanon/projects/rag_llm/faiss'

# A flat L2 index has to be created with the embedding dimensionality up
# front, so embed a throwaway query and measure the length of the vector.
embedding_dim = len(embed_model.embed_query("hello world"))
index = faiss.IndexFlatL2(embedding_dim)

# Reuse `embed_model` -- the very object that sized the index -- instead of
# building a second client with embedding_model(). This guarantees that the
# vectors the store produces always match the index dimension, even if
# `embed_model` is later switched to a different model.
vector_store = FAISS(
    embedding_function=embed_model,
    index=index,
    docstore=InMemoryDocstore(),
    index_to_docstore_id={},
)

# Store every chunk under its generated id so it can be looked up by id later.
vector_store.add_documents(
    documents=docs_chunks_with_id,
    ids=[doc.metadata['id'] for doc in docs_chunks_with_id],
)

# Persist the index and docstore to FAISS_PATH.
vector_store.save_local(FAISS_PATH)

In [50]:
## LOAD DATABASE
# Same path literal as the FAISS_PATH the store was saved to; it is repeated
# here, so the two assignments must be kept in sync by hand.
FAISS_PATH = '/Users/nachanon/projects/rag_llm/faiss'
# A new embedding_model() instance (default model name) is passed as the
# embedding function for the reloaded store.
# NOTE(review): allow_dangerous_deserialization=True opts in to unpickling the
# saved docstore; unpickling can execute arbitrary code, so only load index
# directories that this notebook itself produced.
loaded_vector_db = FAISS.load_local(FAISS_PATH,embedding_model(),allow_dangerous_deserialization=True)



In [51]:
# Sanity check: look up the first chunk by its generated id in the reloaded
# docstore; the cell output displays whatever the lookup returns.
loaded_vector_db.docstore.search(docs_chunks_with_id[0].metadata['id'])

Document(metadata={'source': '/Users/nachanon/projects/rag_llm/data/monopoly.pdf', 'page': 0, 'id': 'monopoly.pdf:0-0'}, page_content='MONOPOLY \nProperty Trading Game from Parker Brothers" \nAGES 8+ \n2 to 8 Players \nContents: Gameboard, 3 dice, tokens, 32 houses, I2 hotels, Chance \nand Community Chest cards, Title Deed cards, play money and a Banker\'s tray. \nNow there\'s a faster way to play MONOPOLY. Choose to play by \nthe classic rules for buying, renting and selling properties or use the \nSpeed Die to get into the action faster. If you\'ve never played the classic')

In [52]:
# Incremental update: add to the loaded DB only those chunks whose id is not
# already present in its docstore.

# The docstore lookup yields a plain string (an error message) when the id is
# unknown, so a `str` result marks a chunk that still has to be added.
not_found_docs = [
    doc
    for doc in docs_chunks_with_id
    if isinstance(loaded_vector_db.docstore.search(doc.metadata['id']), str)
]

###### checkpoint for today: update db; next -> more functional coding
if not_found_docs:
    loaded_vector_db.add_documents(
        documents=not_found_docs,
        ids=[doc.metadata['id'] for doc in not_found_docs],
    )
else:
    print('not_found_docs = 0')

# `not_found_docs` is intentionally left as a list (it used to be rebound to
# the int 0 here); the list is rebuilt from scratch every time this cell runs,
# so no manual reset is needed.
# NOTE(review): the chunks added above exist only in memory -- nothing in this
# cell writes the updated store back to FAISS_PATH. Confirm whether a
# save_local() call is wanted here.


In [53]:
#loaded_vector_db.similarity_search_with_relevance_scores('step 3')

In [67]:
from langchain.prompts import ChatPromptTemplate
# Raw template text, kept under its own name so that `prompt_template` below
# only ever refers to the ChatPromptTemplate object (previously the same name
# was bound first to this string and then to the template object).
PROMPT_TEMPLATE = """
This is query from user: {question}
And use this context to answer query : {context}

"""
query = "how much money each player got in starting"

# Retrieve (document, relevance score) pairs for the query.
results = loaded_vector_db.similarity_search_with_relevance_scores(query)

# Only the chunk text goes into the prompt; the scores are not used.
context_text = "\n\n-------------\n\n".join([doc.page_content for doc, _score in results])
prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
prompt = prompt_template.format(context=context_text, question=query)

In [71]:
from langchain_ollama.llms import OllamaLLM
# Instantiate the Ollama LLM wrapper for the "llama3" model.
model = OllamaLLM(model="llama3")
# Send the fully formatted prompt; the returned text is shown as the cell output.
model.invoke(prompt)

'According to the text, each player is given $1,500 divided as follows:\n\n* P (I assume this means "plurals", not a specific amount): 3 x $500 = $1,500\n* $100: 6\n* $50: 5\n* $40: 6\n* $105: 5\n* $5: 5\n* $15: 5\n\nThis adds up to a total of $4,115. The remaining money and other equipment go to the Bank.\n\nSo, each player starts with $1,500 - $3,995 (total value of all the bills) = $505.'

In [72]:
# Display the exact prompt text that was sent to the model. The original cell
# evaluated `p`, which is not defined in any cell of this notebook and would
# raise NameError on a fresh kernel.
prompt

'Human: \nThis is query from user: how much money each player got in starting\nAnd use this context to answer query : Each player is given $1,500 divided as follows: P each of $500s, \n$100~ and $50~; 6 $40~; 5 each of $105, $5~ and $Is. \nAll remaining money and other equipment go to the Bank. Stack the .. \nBank\'s money on edge in the compartments in the plastic Banker\'s tray. \nBANKER. Select as Banker a player who will also \nmake a good Auctioneer A Banker who plays \n~n the game must keep hislher personal funds \nseparate from those of the Bank. When more than\n\n-------------\n\nit sells houses and hotels to the players and loans money when \nrequired on mortgages. \nThe Bank collects all taxes, fines, loans and interest, and the price of \nall properties which it sells and auctions. \nThe Bank nwer "goes broke." If the Bank runs out of money, the Banker \nmay issue as much more as needed by writing on any ordinary paper. \nTHE PLAY: Starting with the Banker, each player in tu