In [30]:
# Step 1: Install required libraries
# Run these commands in your terminal or command prompt:
# pip install PyPDF2
# pip install langchain

import PyPDF2
import re
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Step 2: Function to extract text from a PDF
def extract_text_from_pdf(pdf_path):
    """
    Extracts text from a PDF file.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted text from the PDF.
    """
    text = ""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        for page in reader.pages:
            page_text = page.extract_text() or ""
            text += page_text
    return text

# Step 3: Clean the text (optional, based on your document)
def clean_text(text):
    """
    Cleans the extracted text by removing extra spaces and unwanted special characters.

    Args:
        text (str): Raw text extracted from the PDF.

    Returns:
        str: Cleaned text.
    """
    # Remove extra spaces and newlines
    text = " ".join(text.split())
    # Remove unwanted special characters; customize this regex as needed
    text = re.sub(r"[^a-zA-Z0-9\s.,!?]", "", text)
    return text

# Step 4: Split the text into smaller chunks using RecursiveCharacterTextSplitter
def split_text_into_chunks(text, chunk_size=2000, chunk_overlap=300):
    """
    Splits the text into smaller chunks using LangChain's RecursiveCharacterTextSplitter.
    It uses a list of separators to attempt to break the text at natural boundaries.

    Args:
        text (str): The text to split.
        chunk_size (int): Maximum size of each chunk (default: 2000 characters).
        chunk_overlap (int): Overlap between consecutive chunks (default: 300 characters).

    Returns:
        list: List of text chunks.
    """
    # Define a list of separators to use in a recursive fashion
    separators = ["\n\n", "\n", ". ", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=separators,
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len
    )
    chunks = text_splitter.split_text(text)
    return chunks

# Step 5: Main function to process the document
def process_document(pdf_path):
    """
    Processes a PDF document: extracts text, cleans it, and splits it into manageable chunks.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        list: List of cleaned and split text chunks.
    """
    # Extract text from the PDF
    text = extract_text_from_pdf(pdf_path)
    print(f"Extracted text length: {len(text)} characters")

    # Clean the text
    cleaned_text = clean_text(text)
    print(f"Cleaned text length: {len(cleaned_text)} characters")

    # Split the cleaned text into chunks
    chunks = split_text_into_chunks(cleaned_text)
    print(f"Number of chunks: {len(chunks)}")
    return chunks

# Step 6: Run the script
if __name__ == "__main__":
    # Path to your PDF file (update with your actual path)
    pdf_path = r"C:\Users\kingl\OneDrive\Desktop\mlops_project\chatbot\chatbot\Rag\256_page_document.pdf"
    
    # Process the document to obtain chunks
    chunks = process_document(pdf_path)
    
    # Print the first chunk as an example
    print("\nFirst chunk:")
    print(chunks[0])


Extracted text length: 328901 characters
Cleaned text length: 309634 characters
Number of chunks: 187

First chunk:
Neural NetworkDesign 2nd EditionHagan DemuthBealeDe JessNeural Network Design 2nd Edtion Martin T. Hagan Oklahoma State University Stillwater, Oklahoma Howard B. Demuth University of Colorado Boulder, Colorado Mark Hudson Beale MHB Inc. Hayden, Idaho Orlando De Jess Consultant Frisco, Texas Copyright by Martin T. Hagan and Howard B. Demu th. All rights reserved. No part of the book may be reproduced, stored in a retrieval system, or transcribed in any form or by any means  electronic, mechanical, photocopying, recording or otherwise  without the prior permission of Hagan and Demuth. MTH To Janet, Thomas, Daniel, Mom and Dad HBD To Hal, Katherine, Kimberly and Mary MHB To Leah, Valerie, Asia, Drake, Coral and Morgan ODJ To Marisela, Mara Victor ia , Manuel, Mam y Pap


In [32]:
# Step 1: Import required libraries
from langchain_community.embeddings import OllamaEmbeddings

# Step 2: Initialize OpenAI Embeddings
def initialize_embeddings():
    """
    Initializes the OpenAI Embeddings model.

    Returns:
        OpenAIEmbeddings: Embeddings model.
    """
    embeddings = OllamaEmbeddings(model="llama3.2:1b")
    print("OpenAI Embeddings initialized successfully!")
    return embeddings

# Step 3: Generate embeddings for text chunks
def generate_embeddings(chunks, embeddings):
    """
    Generates embeddings for a list of text chunks.

    Args:
        chunks (list): List of text chunks.
        embeddings (OpenAIEmbeddings): Embeddings model.

    Returns:
        list: List of embeddings (vectors) for each chunk.
    """
    # Generate embeddings for each chunk
    chunk_embeddings = embeddings.embed_documents(chunks)
    print(f"Generated embeddings for {len(chunk_embeddings)} chunks.")
    return chunk_embeddings



In [33]:

embeddings = initialize_embeddings()

# Generate embeddings for the chunks
chunk_embeddings = generate_embeddings(chunks, embeddings)

# Print the first embedding as an example
print("\nFirst chunk embedding (first 10 dimensions):")
print(chunk_embeddings[0][:10])

OpenAI Embeddings initialized successfully!
Generated embeddings for 187 chunks.

First chunk embedding (first 10 dimensions):
[0.6292080283164978, 2.6260859966278076, 0.5753267407417297, 0.39746174216270447, 1.8612641096115112, -2.4867329597473145, 1.3653322458267212, 0.5735061168670654, -1.4694424867630005, -0.35355550050735474]


In [34]:
import pickle

# Save embeddings to a pickle file
with open('chunk_embeddings.pkl', 'wb') as f:
    pickle.dump(chunk_embeddings, f)

print("Embeddings saved!")


Embeddings saved!


In [35]:
from langchain.docstore.document import Document
from langchain_community.vectorstores import Chroma

# Define a dummy embeddings wrapper that implements embed_documents.
class DummyEmbedding:
    def __init__(self, precomputed):
        self.precomputed = precomputed

    def embed_documents(self, texts):
        # Ensure the number of texts matches the number of precomputed embeddings.
        if len(texts) != len(self.precomputed):
            raise ValueError("Mismatch between number of texts and precomputed embeddings")
        return self.precomputed

# Wrap your precomputed embeddings.
dummy_embedding = DummyEmbedding(chunk_embeddings)

# Convert your chunks (plain text strings) to Document objects.
documents = [Document(page_content=chunk) for chunk in chunks]

# Create the Chroma vector store using the Document objects and the dummy embeddings.
vector_store = Chroma.from_documents(documents, dummy_embedding)




below is for llms usage 


In [10]:
from langchain_core.prompts import ChatPromptTemplate

prompt=ChatPromptTemplate.from_template("""
Answer the following question based on the provided context.
Think step by step before providing a detailed answer.
I will tip you $1000 if user finds your answer helpful.                      
<context> 
{context}  
</context>                                    
                                        
                                        """)

In [12]:
from langchain_community.llms import Ollama


llms = Ollama(model="llama3.2:1b")
llms


  llms = Ollama(model="llama3.2:1b")


Ollama(model='llama3.2:1b')

In [36]:
from langchain.chains.combine_documents import create_stuff_documents_chain
doc_chain=create_stuff_documents_chain(llms,prompt)
doc_chain


RunnableBinding(bound=RunnableBinding(bound=RunnableAssign(mapper={
  context: RunnableLambda(format_docs)
}), kwargs={}, config={'run_name': 'format_inputs'}, config_factories=[])
| ChatPromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=['context'], input_types={}, partial_variables={}, template='\nAnswer the following question based on the provided context.\nThink step by step before providing a detailed answer.\nI will tip you $1000 if user finds your answer helpful.                      \n<context> \n{context}  \n</context>                                    \n                                        \n                                        '), additional_kwargs={})])
| Ollama(model='llama3.2:1b')
| StrOutputParser(), kwargs={}, config={'run_name': 'stuff_documents_chain'}, config_factories=[])