In [1]:
from langchain_huggingface.embeddings import HuggingFaceEmbeddings

# Embedding model handed to the PGVector store cells further down.
huggingface_embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2",
)

  from tqdm.autonotebook import tqdm, trange


In [2]:
import os

from langchain_core.documents import Document
from langchain_postgres import PGVector

# Requires a running Postgres instance with pgvector enabled (the original
# note points at a docker command earlier in the notebook).
# The "postgresql+psycopg" scheme uses psycopg3.
#
# Credentials should not be baked into a shared notebook: set the
# PGVECTOR_CONNECTION environment variable to override the local-development
# default below.
connection = os.environ.get(
    "PGVECTOR_CONNECTION",
    "postgresql+psycopg://admin:admin@localhost:6024/langchain",
)
collection_name = "my_docs"

# Vector store bound to `collection_name`, embedding with the model created
# in the first cell.
vector_store = PGVector(
    embeddings=huggingface_embeddings,
    collection_name=collection_name,
    connection=connection,
    use_jsonb=True,
)

In [52]:
from langchain_community.document_loaders import PyPDFLoader

dict_path = r"D:\Thinkture\tcm_backend\temp_sync"
import os

# Get the list of PDF files in the directory
pdf_files = [f for f in os.listdir(dict_path) if f.endswith('.pdf')]

print("pdf_files",pdf_files)

if not pdf_files:
    print("No PDF files found in the directory.")
else:
    pages = []
    for pdf_file in pdf_files:
        file_path = os.path.join(dict_path, pdf_file)
        print(f"Loading PDF: {file_path}")
        loader = PyPDFLoader(file_path)
        async for page in loader.alazy_load():
            pages.append(page)

    print(f"Total pages loaded: {len(pages)}")

print(type(pages[0]))
print(f"{pages[8].metadata}\n")

pdf_files ['paul (1).pdf', 'Resume_OngYuanQin (5).pdf', 'SQL-Cheat-Sheet-PDF (4).pdf']
Loading PDF: D:\Thinkture\tcm_backend\temp_sync\paul (1).pdf
Loading PDF: D:\Thinkture\tcm_backend\temp_sync\Resume_OngYuanQin (5).pdf
Loading PDF: D:\Thinkture\tcm_backend\temp_sync\SQL-Cheat-Sheet-PDF (4).pdf
Total pages loaded: 11
<class 'langchain_core.documents.base.Document'>
{'source': 'D:\\Thinkture\\tcm_backend\\temp_sync\\paul (1).pdf', 'page': 8}



In [None]:
from langchain_community.document_loaders import DirectoryLoader

# Alternative loading route: hand the whole folder to DirectoryLoader and
# show how many documents come back.
dict_path = r"D:\Thinkture\tcm_backend\temp_sync"
loader = DirectoryLoader(path=dict_path)
docs = loader.load()
len(docs)

In [None]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Split the loaded pages into chunks of up to 1500 characters with a
# 100-character overlap between neighbouring chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
)
chunks = splitter.split_documents(pages)

print(len(chunks))

In [68]:
# Print a single chunk (index 32) to inspect the extracted text.
# NOTE(review): the index is hard-coded and raises IndexError when fewer than
# 33 chunks were produced.
print(chunks[32])

page_content='OngYuanQinAddress:JohorBahru,Malaysia|Mobile:(+60)13-7698311|Email:yuanqin1108@gmail.comLinkedIn|PersonalWebsite|GitHub
SUMMARYAnAIsoftwareengineerwithtwoyearsofexperience,IampassionateaboutleveragingmyexpertiseinLargeLanguageModel,NaturalLanguageProcessing,ComputerVision,andsoftwaredevelopment.Committedtoinnovation,IseekachallengingroletocontributetoimpactfulprojectsandfurtheradvancemyskillsinAIandsoftwaredevelopment.
WORKEXPERIENCEIntelMicroelectronics(M)Sdn.Bhd.(Oct2022-Present)ArtificialIntelligenceEngineer•DevelopedachatbotusingLangChainAgentandOpenAIAPIforengineerstoreadandupdatetickets.•DevelopedadataminingpipelineusingNLPandPython,andvisualizedthedataanalysisresultswithPowerBI.•CreatedanddesignedGenerativeAIplatformswithseamlessAPIintegrationusingReact.jsandRedux.•BuiltasmartticketinganalysismicroservicesinC#forasmartchatbotplatform.•DevelopedanautomatedFAQgeneratorusingPythonandNLP,increasingtheFAQgenerationrateperweek.•CreatedpipelineforLLMstocomprehendcomplexdi

In [71]:
import uuid

# The `huggingface_embeddings` instance from the first cell is reused here;
# constructing it again only reloads the same model.
#
# Each chunk gets a deterministic id derived from its source file, page number
# and position in `chunks`. Without explicit ids every re-run of this cell
# inserts another full copy of the chunks into the collection.
# NOTE(review): relies on the store matching rows by id on insert — confirm
# against the installed langchain_postgres version.
chunk_ids = [
    str(
        uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"{chunk.metadata.get('source')}:{chunk.metadata.get('page')}:{position}",
        )
    )
    for position, chunk in enumerate(chunks)
]

db = PGVector.from_documents(
    documents=chunks,
    embedding=huggingface_embeddings,
    connection=connection,
    collection_name=collection_name,
    ids=chunk_ids,
)
print("db",db)






db <langchain_postgres.vectorstores.PGVector object at 0x0000017C9B064C40>


In [74]:
# Retrieve the 3 closest chunks for a sample question (the default k is 4).
query = "Do you have any experience in Artificial Intelligence Engineer"
retriever = db.as_retriever(search_kwargs={"k": 3})
retriever.invoke(query)

[Document(id='98a9c90d-957b-49bb-894e-77c90321a2c2', metadata={'page': 0, 'source': 'D:\\Thinkture\\tcm_backend\\temp_sync\\Resume_OngYuanQin (5).pdf'}, page_content='OngYuanQinAddress:JohorBahru,Malaysia|Mobile:(+60)13-7698311|Email:yuanqin1108@gmail.comLinkedIn|PersonalWebsite|GitHub\nSUMMARYAnAIsoftwareengineerwithtwoyearsofexperience,IampassionateaboutleveragingmyexpertiseinLargeLanguageModel,NaturalLanguageProcessing,ComputerVision,andsoftwaredevelopment.Committedtoinnovation,IseekachallengingroletocontributetoimpactfulprojectsandfurtheradvancemyskillsinAIandsoftwaredevelopment.\nWORKEXPERIENCEIntelMicroelectronics(M)Sdn.Bhd.(Oct2022-Present)ArtificialIntelligenceEngineer•DevelopedachatbotusingLangChainAgentandOpenAIAPIforengineerstoreadandupdatetickets.•DevelopedadataminingpipelineusingNLPandPython,andvisualizedthedataanalysisresultswithPowerBI.•CreatedanddesignedGenerativeAIplatformswithseamlessAPIintegrationusingReact.jsandRedux.•BuiltasmartticketinganalysismicroservicesinC#for

In [14]:
from langchain_ollama import ChatOllama
from langchain.schema.output_parser import StrOutputParser
from langchain.prompts import PromptTemplate
from langchain.schema.runnable import RunnablePassthrough

def setup_conversational_chain(retriever, model="llama3.1", temperature=0):
    """Build a retrieval-augmented question-answering chain.

    The chain pipes the incoming question through ``retriever``, joins the
    ``page_content`` of the returned documents into one context string, fills
    the prompt template, sends it to a ``ChatOllama`` model and passes the
    reply through ``StrOutputParser``.

    Args:
        retriever: Retriever runnable whose output is a list of documents.
        model: Ollama model name. Defaults to "llama3.1", the value that was
            previously hard-coded.
        temperature: Temperature passed to ``ChatOllama``. Defaults to 0.

    Returns:
        The composed chain; call ``.invoke(question)`` on it.

    Raises:
        Exception: Any error raised while composing the chain is printed and
            re-raised. Previously it was swallowed and ``None`` was returned,
            which only failed later when the caller used the result.
    """
    llm = ChatOllama(model=model, temperature=temperature)
    parser = StrOutputParser()

    # The continuation lines keep their original leading whitespace so the
    # prompt text sent to the model is unchanged.
    template = """Answer the question based ONLY on the following context:
        {context}
        Question: {question}
        """
    prompt = PromptTemplate.from_template(template)

    def format_docs(docs):
        """Join the text of the retrieved documents, separated by blank lines."""
        return "\n\n".join(d.page_content for d in docs)

    try:
        chain = (
            {"context": retriever | format_docs, "question": RunnablePassthrough()}
            | prompt
            | llm
            | parser
        )
    except Exception as e:
        print(f"Error in setup_conversational_chain: {str(e)}")
        raise

    print("Chain setup completed successfully")
    return chain

        

       
            

The `MultiQueryRetriever` automates the process of prompt tuning by using an LLM to generate multiple queries, each from a different perspective, for a given user input query. For each generated query, it retrieves a set of relevant documents, then takes the unique union across all queries to obtain a larger set of potentially relevant documents. By generating multiple perspectives on the same question, the `MultiQueryRetriever` can mitigate some of the limitations of distance-based retrieval and return a richer set of results.

In [24]:
from langchain.retrievers.multi_query import MultiQueryRetriever

# Prompt asking the LLM to rephrase the user's question five different ways.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

# LLM that writes the alternative questions.
MODEL = "llama3.1"
llm = ChatOllama(model=MODEL, temperature=0)

# Wrap the plain vector-store retriever (k=4 per generated query) in a
# MultiQueryRetriever, then build the answer chain on top of it.
base_retriever = vector_store.as_retriever(search_kwargs={"k": 4})
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)
chain = setup_conversational_chain(retriever)





Chain setup completed successfully


In [25]:
# Run the chain on a sample question and display the answer string.
# NOTE(review): the project name is written without spaces, which appears to
# mirror the space-less text seen in the extracted chunks — confirm this is
# intentional before "fixing" the query.
response = chain.invoke(" do you knowCowCrossingDetectionAlertSystem")
response

"Yes, I'm familiar with the Cow Crossing Detection Alert System. According to the context provided, it's a research project developed by Ong Yuan Qin in 2022.\n\nHere are some details about the project:\n\n* It uses Python, YOLOV3, deep learning, and Raspberry Pi to detect and identify cows.\n* The system recognizes distinct cow sounds for enhanced detection.\n* When cows are detected on roadways, it sends alerts to subscribed devices (computers and smartphones).\n\nLet me know if you'd like more information!"

In [4]:
from datetime import datetime
import pytz


# Print the current timezone-aware time in the Asia/Singapore zone.
sg_time = pytz.timezone("Asia/Singapore")
time = datetime.now(tz=sg_time)
print(time)


2024-09-26 16:54:06.363456+08:00
