In [None]:
from dotenv import load_dotenv
import os

# Load variables from the project-level .env file into the process environment.
load_dotenv("../.env")

# os.environ only accepts str values, so copying a missing key (None) back into
# it raises TypeError. When the key is present it is already in os.environ, so
# there is nothing to copy; tracing is switched on only in that case.
if os.getenv("LANGSMITH_API_KEY"):
    os.environ['LANGSMITH_TRACING'] = "true"
else:
    print("LANGSMITH_API_KEY is not set; LangSmith tracing is not enabled.")

In [None]:
# creating sample documents
from langchain_core.documents import Document

# Two hand-written example texts; every resulting Document gets the same
# "source" tag in its metadata.
_pet_facts = (
    "Dogs are great companions, known for their loyalty and friendliness.",
    "Cats are independent pets that often enjoy their own space.",
)

documents = [
    Document(page_content = fact, metadata = {"source": "mammal-pets-doc"})
    for fact in _pet_facts
]

In [None]:
# **************
#* loading pdf
# **************
from langchain_community.document_loaders import PyPDFLoader

# Location of the book, relative to the directory the notebook runs from.
pdf_path = "../data/Deep_Learning_A_Visual_Approach.pdf"
loader = PyPDFLoader(file_path = pdf_path)

# load() returns a list of Document objects; print how many were loaded
pages = loader.load()
print(len(pages))

771


In [None]:
# Inspect one loaded page: its extracted text, then its metadata dict.
sample_page = pages[33]
print(sample_page.page_content)
print(sample_page.metadata)

4   Chapter 1
Our goal is to discover meaningful information, where it’s up to us to 
decide what’s meaningful. We usually want to find patterns that help us 
understand the data or use past measurements to predict future events. For 
example, we might want to predict a movie someone would like based on 
movies they’ve already rated, read the handwriting on a note, or identify a 
song from just a few notes.
We generally find the information we’re after in three steps: we iden -
tify the information that we want to find, we collect data that we hope will 
hold that information, and then we design and run algorithms to extract as 
much of that information as possible from that data. 
In this chapter, we’ll cover some of the major movements in machine 
learning. We’ll begin by discussing an early approach to machine learning 
called an expert system. We’ll then discuss three of the major approaches 
to learning: supervised learning, unsupervised learning, and reinforce -
ment learning. We

In [None]:
# ***********
#* Splitting
# ***********
# so that the relevant portion of the doc is not washed out by the surrounding text
from langchain_text_splitters import RecursiveCharacterTextSplitter

# Chunking parameters passed to the splitter.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 250

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size = CHUNK_SIZE,
    chunk_overlap = CHUNK_OVERLAP,
    add_start_index = True,  # adds 'start_index' to each chunk's metadata
)

# split every loaded page into overlapping chunks
all_splits = text_splitter.split_documents(pages)

In [None]:
# number of chunks produced by splitting all pages
print(len(all_splits))

2037


In [None]:
# Inspect one chunk: its text, then the metadata it carries.
sample_split = all_splits[330]
print(sample_split.page_content)
print(sample_split.metadata)

fair coin? We can tighten that up by using the information that we got back 
heads. We’ll see later that the best way to phrase our question is in the form 
of a template that asks, “What is the probability that (something1) is true, 
given that (something2) is true?” In this case that becomes, “What is the 
probability that we have the fair coin, given that we saw heads?”
We can diagram this in pictures. It’s the area of the Fair heads region 
compared to the total area that could have given us heads, which is the sum 
of Fair heads and Rigged heads. Figure 4-4 shows this ratio.
Fair
heads
Fair
heads Rigged
heads
Figure 4-4: If the coin comes up heads, how likely is it that we had the fair coin? 
It’s the size of the region where a fair coin gives us heads divided by all the 
areas combined that would give us heads.
{'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Macintosh)', 'creationdate': '2021-05-20T11:31:26-07:00', 'author': 'Andrew Glassner', 'moddate': '

In [None]:
# start_index = character position at which the chunk starts within its source page
# doc --> pages --> splits
# each page is split on its own, and its characters are counted from 0 up to the page length
# so chunks from different pages (different 'page' in metadata) can share the same start_index
# while chunks from the same page each get a unique start_index
# Original (one page from pages) = 
# 00000000000000000000000000000000000000000000000000000000000000000000000000000000
# after splitting (chunk_size = 20, chunk_overlap = 5) = 
# 00000000000000000000
#                00000000000000000000
#                               00000000000000000000
#                                              00000000000000000000
#                                                             00000000000000000000

In [None]:
# ***********
#* Embedding
# ***********
# vector representation of words, phrases or text
# map words, phrases or text to dense vector in lower dimensional space
# where semantically (meaning and context same) similar words are closer to each other
from langchain_google_genai import GoogleGenerativeAIEmbeddings

# Embedding client; this same object is handed to the vector stores created later.
embedding = GoogleGenerativeAIEmbeddings(model = "models/embedding-001")

In [None]:
# Embed the first two chunks as a quick example.
vector_0 = embedding.embed_query(all_splits[0].page_content)
vector_1 = embedding.embed_query(all_splits[1].page_content)

# Show only the leading components: printing whole vectors floods the cell
# output with numbers without adding information.
PREVIEW_LEN = 5
print(vector_0[:PREVIEW_LEN])
print(vector_1[:PREVIEW_LEN])

# dimensionality of each vector
print(len(vector_0))
print(len(vector_1))

# check that the two vectors have the same length
print(len(vector_0) == len(vector_1))

[0.024646690115332603, -0.013153929263353348, -0.013030963018536568, 0.0048489258624613285, 0.03913797810673714, -0.03946133702993393, -0.02268364280462265, -0.015573874115943909, 0.027147268876433372, -0.011456996202468872, -0.0015806421870365739, 0.044523514807224274, 0.0024603051133453846, -0.03562241420149803, 0.052474938333034515, -0.013904137536883354, 0.04434680938720703, -0.0071497573517262936, 0.03296810761094093, -0.03517112508416176, 0.0205229502171278, 0.012799547985196114, 0.059259045869112015, -0.07170335948467255, -0.031072275713086128, -0.005411963909864426, 0.05223638936877251, -0.0446898452937603, 0.007593974471092224, 0.05128737539052963, -0.0907960757613182, 0.01740247942507267, -0.025876745581626892, 0.026278840377926826, 0.037362273782491684, -0.07004192471504211, 0.01694502681493759, -0.004798019304871559, -0.00450379541143775, 0.017398720607161522, 0.04509361460804939, -0.04567670449614525, -0.044450536370277405, -0.01000504195690155, -0.0069931624457240105, 0.0

In [None]:
# ***************
#* Vector Store
# ***************
# After embedding, each text chunk is represented as a vector
# The vector store runs a similarity search between the embedded input query and the stored vectors
# vector similarity (e.g. cosine similarity)

In [None]:
# in-memory
# Vector store built on the same embedding client as the MongoDB store.
# NOTE(review): nothing is added to or queried from this store in the cells
# visible here; the searches use vector_store_mongodb instead.
from langchain_core.vectorstores import InMemoryVectorStore
vector_store_inmemory = InMemoryVectorStore(embedding = embedding)

In [None]:
# mongodb
from langchain_mongodb import MongoDBAtlasVectorSearch
from pymongo import MongoClient

# LangSmith tracing is already configured in the first cell, so it is not
# repeated here: re-assigning os.environ from os.getenv raises TypeError when
# the key is missing and changes nothing when it is present.

MONGODB_ATLAS_CLUSTER_URI = os.getenv("MONGODB_ATLAS_CLUSTER_URI")
if not MONGODB_ATLAS_CLUSTER_URI:
    # MongoClient(None) would silently target a local server instead of the
    # cluster, and the problem would only surface later on the first query.
    raise RuntimeError(
        "MONGODB_ATLAS_CLUSTER_URI is not set; define it in the environment or in ../.env"
    )

client = MongoClient(MONGODB_ATLAS_CLUSTER_URI)

# Database, collection and search-index names used by the vector store.
DB_NAME = "langchain_test_db"
COLLECTION_NAME = "langchain_test_vectorstores"
ATLAS_VECTOR_SEARCH_INDEX_NAME = "langchain_test_index"
MONGODB_COLLECTION = client[DB_NAME][COLLECTION_NAME]

# Vector store backed by the collection above, using the shared embedding client.
vector_store_mongodb = MongoDBAtlasVectorSearch(
    embedding = embedding,
    collection = MONGODB_COLLECTION,
    index_name = ATLAS_VECTOR_SEARCH_INDEX_NAME,
    relevance_score_fn = "cosine"
)

In [None]:
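# using mongodb: embed every chunk and insert it into the collection
# (left commented out; the searches below rely on the collection already holding the chunks)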
# ids = vector_store_mongodb.add_documents(documents = all_splits)

In [None]:
# len(ids)
# len(all_splits)

# print(ids[0])

In [None]:
# Similarity search: fetch the 5 stored chunks closest to the query text.
search_query = "deep"
response = vector_store_mongodb.similarity_search(query = search_query, k = 5)
print(response)

[Document(id='6827ac7947f2b0b86d91ee09', metadata={'_id': '6827ac7947f2b0b86d91ee09', 'producer': 'Adobe PDF Library 15.0', 'creator': 'Adobe InDesign 14.0 (Macintosh)', 'creationdate': '2021-05-20T11:31:26-07:00', 'author': 'Andrew Glassner', 'moddate': '2021-08-18T16:00:59-07:00', 'title': 'Deep Learning: A Visual Approach', 'trapped': '/False', 'source': '../data/Deep_Learning_A_Visual_Approach.pdf', 'total_pages': 771, 'page': 39, 'page_label': '10', 'start_index': 2210}, page_content='only because of their appearance when we draw them stacked up vertically. \nThey’re not deep in the sense of having profound understanding or pen -\netrating insights. When a deep learning system attaches a name to a face in \na photo, it has no knowledge of what faces are, or what people are, or even \nthat people exist. The computer just measures pixels and, using the pat -\nterns it learned from the training data, produces the most likely label. \nLet’s jump many chapters ahead and take a quick lo

In [None]:
# Show each retrieved document under a numbered header: text first, then metadata.
for rank, hit in enumerate(response, start = 1):
    print(f"\n--- Result {rank} ---")
    print(hit.page_content)
    print(hit.metadata)


--- Result 1 ---
only because of their appearance when we draw them stacked up vertically. 
They’re not deep in the sense of having profound understanding or pen -
etrating insights. When a deep learning system attaches a name to a face in 
a photo, it has no knowledge of what faces are, or what people are, or even 
that people exist. The computer just measures pixels and, using the pat -
terns it learned from the training data, produces the most likely label. 
Let’s jump many chapters ahead and take a quick look at a deep net-
work, pictured in Figure 1-6. In this simple network, we start with four input 
numbers, shown at the bottom of the figure. These might be the values of 
the four pixels in a 2 by 2 grayscale image, the closing price of a stock over 
four sequential days, or four samples from a snippet of voice data. Each 
input value is just a floating-point number, such as –2.982 or 3.1142.
{'_id': '6827ac7947f2b0b86d91ee09', 'producer': 'Adobe PDF Library 15.0', 'creator': '

In [None]:
# combining the page_content from each doc to one
from utils import clean_text
# One cleaned text string per retrieved document, in retrieval order.
doc_content = [clean_text(hit.page_content) for hit in response]
print(doc_content)

['only because of their appearance when we draw them stacked up vertically Theyre not deep in the sense of having profound understanding or pen etrating insights When a deep learning system attaches a name to a face in a photo it has no knowledge of what faces are or what people are or even that people exist The computer just measures pixels and using the pat terns it learned from the training data produces the most likely label Lets jump many chapters ahead and take a quick look at a deep network pictured in Figure16 In this simple network we start with four input numbers shown at the bottom of the figure These might be the values of the four pixels in a 2 by 2 grayscale image the closing price of a stock over four sequential days or four samples from a snippet of voice data Each input value is just a floatingpoint number such as 2982 or 31142', '326Chapter 13We dont need to get into the math behind these algorithms Happily modern deep learning libraries offer each of these schemes pl

In [None]:
# Model
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model that answers the prompt built in the following cells.
llm = ChatGoogleGenerativeAI(model = "gemini-2.0-flash")

In [None]:
# Prompt
from langchain_core.prompts import PromptTemplate
# Prompt with two placeholders: {question} and {doc_content}.
# The final section header previously read "AMSWER"; the header is part of the
# text sent to the model, so the misspelling is corrected.
prompt = PromptTemplate.from_template(
    '''
    ## QUESTION:
    {question}
    
    ## CONTENT:
    {doc_content}
    
    ## INSTRUCTION:
    Based on the content provided in 'CONTENT' section. Answer the question mentioned in 'QUESTION' section.
    Your answer should be based on the content provided in 'CONTENT' section only.
    If you are not able to get the answer then return the answer as 'No information in the given documents.'
    Do not return preamble in answer.
    
    ## ANSWER (NO PREAMBLE):
    '''
)

In [None]:
# chain
# Pipe the prompt template into the chat model. The chain is invoked later
# with a dict holding 'question' and 'doc_content', and the reply's `.content`
# is printed.
chain = prompt | llm

In [None]:
# defining question
# This string fills the {question} placeholder of the prompt when the chain is invoked.
question = "What is mean by Neural Network"

In [None]:
# running the chain
# Retrieve context for this specific question. Reusing the chunks fetched
# earlier for the fixed query "deep" would ground every answer in text that
# was never selected for the question being asked.
retrieved_docs = vector_store_mongodb.similarity_search(query = question, k = 5)
doc_content = [clean_text(doc.page_content) for doc in retrieved_docs]

# NOTE: `response` is rebound here from the earlier list of retrieved
# documents to the model's reply.
response = chain.invoke(
    input = {
        'question': question,
        # join the chunks into plain text so the prompt does not contain a Python list repr
        'doc_content': "\n\n".join(doc_content)
    }
)
print(response.content)

A neural network is a network made of a series of layers. Organizing neurons in layers allows data to be analyzed hierarchically. The early layers process the raw input data, and each subsequent layer uses information from neurons on the previous layer to process larger chunks of data. The phrase "deep learning" comes from this structure.


In [None]:
# defining question
# A second question, passed to the same chain in the next cell.
question = "dogs"

In [None]:
# running the chain
# Retrieve context for this specific question instead of reusing chunks that
# were fetched for a different query, so the answer is grounded in text
# selected for what is actually being asked.
retrieved_docs = vector_store_mongodb.similarity_search(query = question, k = 5)
doc_content = [clean_text(doc.page_content) for doc in retrieved_docs]

response = chain.invoke(
    input = {
        'question': question,
        # join the chunks into plain text so the prompt does not contain a Python list repr
        'doc_content': "\n\n".join(doc_content)
    }
)
print(response.content)

No information in the given documents.
