In [23]:
import os
import re
import nltk
import bs4
from langchain import hub
from langchain_core.output_parsers import StrOutputParser
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_core.runnables import RunnablePassthrough
from langchain_community.vectorstores import Chroma
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langchain_community.vectorstores.utils import filter_complex_metadata
import google.generativeai as genai

from langchain_google_genai import ChatGoogleGenerativeAI
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain.document_loaders import PyPDFLoader
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
import langchain.vectorstores as vectorstores
from langchain_community.document_loaders.pdf import PyPDFLoader

In [36]:
def load_dot_env():
    """Load ``../keys.env`` and return the value of the ``Gemini_key`` variable.

    Returns:
        The string stored under ``Gemini_key`` in the process environment after
        the env file has been loaded, or ``None`` when the variable is not set.
    """
    load_dotenv("../keys.env")
    return os.getenv("Gemini_key")
    
def load_model(llm_api_key):
    """Create the Gemini chat model and the embedding client for one API key.

    Args:
        llm_api_key: Google API key passed to both clients as ``google_api_key``.

    Returns:
        A ``(model, embeddings)`` tuple: a ``ChatGoogleGenerativeAI`` instance
        followed by a ``GoogleGenerativeAIEmbeddings`` instance.
    """
    chat_model = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro-latest",
        google_api_key=llm_api_key,
        temperature=0.2,
        convert_system_message_to_human=True,
    )
    embedder = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=llm_api_key,
    )
    return chat_model, embedder


In [37]:
# Read the Gemini API key from the env file, then build the chat model (`llm`)
# and the embedding client (`embeddings`) that the later cells use.
llm_api_key = load_dot_env()
llm,embeddings = load_model(llm_api_key)

In [30]:
import re
def load_document(document_path):
    """Load a PDF with ``PyPDFLoader`` and return what ``load_and_split()`` yields.

    Args:
        document_path: Path of the PDF file handed to ``PyPDFLoader``.

    Returns:
        The pages produced by ``PyPDFLoader(document_path).load_and_split()``.
    """
    return PyPDFLoader(document_path).load_and_split()
def split_text(pages, chunk_size=10000, chunk_overlap=1000):
    """Merge the pages into one string and cut it into overlapping chunks.

    Args:
        pages: Iterable of objects exposing a ``page_content`` attribute.
        chunk_size: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_size``. Defaults to the previously hard-coded 10000.
        chunk_overlap: Forwarded to ``RecursiveCharacterTextSplitter`` as
            ``chunk_overlap``. Defaults to the previously hard-coded 1000.

    Returns:
        The list of text chunks returned by the splitter's ``split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    # The loop variable is deliberately not called `p`: that name is this
    # notebook's alias for pathlib.Path.
    context = "\n\n".join(str(page.page_content) for page in pages)
    return text_splitter.split_text(context)

# Compiled once at definition time instead of on every call.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0001F200-\U0001F251"  # enclosed ideographic supplement
    "]+"
)


def remove_emojis(string):
    """Return ``string`` with every run of emoji characters removed.

    The character class used to contain the single range U+24C2..U+1F251.
    That range covers everything in between, so it also deleted CJK text,
    Hangul and Latin ligatures such as U+FB01 ("fi"), which PDF text
    extraction commonly produces. Only the two ends of that range are kept
    now: U+24C2 itself and the block U+1F200..U+1F251.

    Args:
        string: Text to clean.

    Returns:
        A new string without the matched characters; other text is untouched.
    """
    return _EMOJI_PATTERN.sub("", string)


def format_docs(docs):
    """Concatenate the ``page_content`` of each document, separated by a blank line.

    Args:
        docs: Iterable of objects exposing a ``page_content`` string.

    Returns:
        One string containing every ``page_content`` joined with ``"\\n\\n"``.
    """
    page_texts = [doc.page_content for doc in docs]
    separator = "\n\n"
    return separator.join(page_texts)


In [33]:
# Load the book, pass its pages through filter_complex_metadata, split the
# text into chunks, and strip emoji characters from every chunk.
document = load_document("../docs/Practical Statistics for Data Scientists.pdf")
filtered_docs = filter_complex_metadata(document)
splits = [remove_emojis(chunk) for chunk in split_text(filtered_docs)]



In [40]:
def vector_index(texts,embeddings,persist_directory="./Database"):
    """Embed ``texts`` once into a Chroma store, persist it, and return it.

    This function used to call ``Chroma.from_texts`` twice: once for an
    in-memory store that was returned, and once for an on-disk store that was
    discarded. Every chunk was therefore embedded twice against the embedding
    API. A single store is now built and serves both purposes.

    Args:
        texts: List of text chunks to embed.
        embeddings: Embedding client passed to ``Chroma.from_texts``.
        persist_directory: Directory of the on-disk Chroma database.

    Returns:
        The Chroma store built from ``texts``.
    """
    # Position-based ids make a re-run over the same chunks overwrite entries
    # instead of appending duplicates to an existing database.
    # NOTE(review): relies on Chroma upserting by id - confirm against the
    # installed langchain_community version. Entries already stored under other
    # ids in `persist_directory` (e.g. from earlier runs) are not removed.
    chunk_ids = [f"chunk-{position}" for position in range(len(texts))]
    store = Chroma.from_texts(
        texts,
        embeddings,
        ids=chunk_ids,
        persist_directory=persist_directory,
    )
    # Only flush to disk when a directory was requested.
    if persist_directory:
        store.persist()
    return store
# NOTE: this assignment rebinds the name `vector_index` from the builder
# function to the store it returns. Later cells use `vector_index` as the
# store, so the `def` has to be executed again before this line can be re-run.
vector_index = vector_index(splits, embeddings)


In [51]:
def get_retriever(vector_index, k=5):
    """Wrap a vector store in a retriever configured with ``{"k": k}``.

    Args:
        vector_index: Object exposing ``as_retriever(search_kwargs=...)``.
        k: Value stored under the ``"k"`` key of ``search_kwargs``.

    Returns:
        Whatever ``vector_index.as_retriever`` returns.
    """
    search_options = {"k": k}
    return vector_index.as_retriever(search_kwargs=search_options)
# Build the retriever over the vector store using get_retriever's default k (5).
retriever = get_retriever(vector_index)

In [48]:
def define_chain(retriever, format_docs, llm, prompt=None):
    """Assemble the retrieval-augmented generation chain.

    The chain builds a ``{"context", "question"}`` mapping, renders it through
    ``prompt``, sends the result to ``llm`` and parses the reply to a string.
    The prompt step was missing before, so the mapping was piped straight into
    the chat model and the pulled prompt was never used.

    Args:
        retriever: Runnable that returns documents for a question.
        format_docs: Callable that turns the retrieved documents into one string.
        llm: Chat model that answers the rendered prompt.
        prompt: Prompt template taking ``context`` and ``question``. When
            omitted, ``rlm/rag-prompt`` is pulled from the hub.

    Returns:
        The composed runnable; invoke it with the question string.
    """
    if prompt is None:
        prompt = hub.pull("rlm/rag-prompt")
    rag_chain = (
        {"context": retriever | format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain


# Prompt
prompt = hub.pull("rlm/rag-prompt")
# Chain
rag_chain = define_chain(retriever, format_docs, llm, prompt=prompt)
# Question
answer = rag_chain.invoke("What is data science and define its steps")

Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 4.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..
Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 8.0 seconds as it raised ResourceExhausted: 429 Resource has been exhausted (e.g. check quota)..


In [52]:
# Print the chain's answer as plain text.
print(answer)

Data science is the applied science of analyzing and modeling data. While a comprehensive definition of its steps is not provided in the context, the text emphasizes that the first step in any data science project is **exploratory data analysis (EDA)**. EDA involves summarizing and visualizing data to gain insights and understanding. 

