In [14]:
from pathlib import Path
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from langchain_chroma import Chroma

In [15]:

# Read key/value pairs from a local .env file into the process environment.
# NOTE(review): presumably this is where the Google API key for the embedding
# and chat models used later comes from — confirm.
# The bare call is left as the last expression so the cell echoes its result.
from dotenv import load_dotenv
load_dotenv() 

True

In [25]:
# Source document for the RAG pipeline, resolved relative to the notebook's
# working directory.
pdf_path: Path = Path(
    "./Data Analysis With Python & Pandas.pdf"
)

In [26]:

# Parse the PDF into LangChain Document objects.
# NOTE(review): per the LangChain docs PyPDFLoader yields one Document per
# page — confirm against the installed version.
load = PyPDFLoader(pdf_path)
doc = load.load()


In [27]:

# Chunking configuration: pieces of at most 1000 characters, with 200
# characters shared between neighbouring chunks so that text straddling a
# boundary appears in both.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)

In [28]:


# Break the loaded documents into overlapping chunks ready for embedding.
# The variable name (sic) is kept because the vector-store cell reads it.
spited = text_splitter.split_documents(doc)

In [None]:
# Open (or create) the persisted Chroma collection and embed the PDF chunks
# only when the collection is still empty.
#
# Why: Chroma.from_documents() with a persist_directory *appends* to whatever
# is already stored there. Re-running this cell therefore used to re-embed the
# whole PDF (slow, billed API calls) and store duplicate chunks, which then
# crowd the top-k retrieval results with repeats.
#
# To force a rebuild (e.g. after changing the PDF, the chunking parameters or
# the embedding model), delete the persist directory and re-run this cell.
persist_directory = "qdrant_store"  # historical name; the backend is Chroma, not Qdrant
embeddings = GoogleGenerativeAIEmbeddings(model="models/embedding-001")

store = Chroma(
    persist_directory=persist_directory,
    embedding_function=embeddings,
)

# get(limit=1) is a cheap "is there anything stored?" probe.
if not store.get(limit=1)["ids"]:
    # First run (or a previous run failed before anything was written):
    # embed every chunk and persist the result.
    store = Chroma.from_documents(
        documents=spited,
        embedding=embeddings,
        persist_directory=persist_directory,
    )

In [31]:
# Expose the vector store as a retriever: plain similarity search returning
# the 10 closest chunks for each query.
retriever = store.as_retriever(search_type="similarity", search_kwargs={"k": 10})

In [32]:
# Manual probe of the retriever on its own, outside the chain.
# NOTE(review): the query is about Node.js while the indexed PDF is about
# pandas — presumably a deliberate off-topic check; confirm.
retrieved_docs = retriever.invoke(input="What is node js")

In [33]:
from langchain_google_genai import ChatGoogleGenerativeAI

# Chat model that writes the final answer. A low temperature keeps answers
# close to the retrieved material.
# NOTE(review): max_tokens=500 caps the length of each answer, so long
# answers may be cut off — confirm the limit is intended.
llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-pro",
    temperature=0.3,
    max_tokens=500,
)

In [34]:
from langchain.chains import create_retrieval_chain
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain_core.prompts import ChatPromptTemplate

In [35]:
# System message for the RAG chain. {context} is the only template variable;
# the documents chain fills it with the retrieved chunks.
#
# The context sits in its own paragraph after an explicit instruction.
# Previously it was glued straight onto "...about Data Science", so the first
# retrieved chunk read as a continuation of the instruction sentence and the
# model was never told to use it.
system_prompt = (
    "You are a helpful assistant that answers questions about Data Science. "
    "Use the retrieved context below to answer the question; "
    "if the context is not sufficient, say so."
    "\n\n"
    "{context}"
)

# Two-message chat template: the system message (which carries the {context}
# placeholder) followed by the user's question, supplied as {input}.
prompt = ChatPromptTemplate.from_messages(
    [("system", system_prompt), ("user", "{input}")]
)

In [36]:
# Answering step: passes the retrieved documents to the LLM through `prompt`.
question_answer_chain = create_stuff_documents_chain(llm=llm, prompt=prompt)

# Full pipeline: retrieve chunks for the question, then run the answering step.
rag_chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=question_answer_chain,
)
    

In [37]:
# Run one question through the whole RAG pipeline and print the generated
# answer; the chain's result is a mapping that holds it under "answer".
response = rag_chain.invoke(
    {
        "input": "Can you explain Data Frame in Pandas?",
    }
)
print(response["answer"])

A Pandas DataFrame is a two-dimensional, size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns).  Think of it like a spreadsheet, a SQL table, or a dictionary of Series objects.  It's the most commonly used Pandas object for data manipulation and analysis in Python.

Here's a breakdown of key aspects of DataFrames:

**Key Features:**

* **Two-Dimensional:** DataFrames are organized in rows and columns, allowing you to store and access data in a grid-like format.
* **Size-Mutable:** You can easily add or remove rows and columns from a DataFrame after it's created.
* **Potentially Heterogeneous:**  Unlike NumPy arrays, DataFrames can hold columns of different data types (e.g., integers, floats, strings, booleans, dates).
* **Labeled Axes:**  Both rows and columns have labels (indexes), making it easier to access and manipulate specific parts of the data.  These labels don't have to be just numbers; they can be strings or other data types.

**S