## Installation

In [1]:
! pip install -q --upgrade google-generativeai langchain-google-genai chromadb pypdf

## Import Libraries

In [3]:

import google.generativeai as genai
import os

from langchain_google_genai import ChatGoogleGenerativeAI
import urllib
import warnings
from pathlib import Path as p
from pprint import pprint

import pandas as pd
from langchain import PromptTemplate
from langchain.chains.question_answering import load_qa_chain
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.chains import RetrievalQA
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv

  from .autonotebook import tqdm as notebook_tqdm


## Setup Model

In [15]:

# Load variables from the local env file, then read the Gemini key from the
# process environment (None when the variable is not set).
load_dotenv("./keys.env")
llm_api_key = os.environ.get('Gemini_key')
def load_model(llm_api_key=llm_api_key):
    """Create the Gemini chat model and the embedding client.

    Args:
        llm_api_key: API key handed to both clients as ``google_api_key``.
            The default is the value the module-level ``llm_api_key`` had
            when this function was defined.

    Returns:
        tuple: ``(model, embeddings)`` — the chat model followed by the
        embedding client.
    """
    chat_model = ChatGoogleGenerativeAI(
        model="gemini-1.5-pro-latest",
        google_api_key=llm_api_key,
        temperature=0.2,
        convert_system_message_to_human=True,
    )
    embedder = GoogleGenerativeAIEmbeddings(
        model="models/embedding-001",
        google_api_key=llm_api_key,
    )
    return chat_model, embedder


In [17]:
# Build both clients once; the indexing and QA cells below reuse them.
model, embeddings = load_model()

## Load document & Split

In [10]:
def load_document(document_path):
    """Load a PDF through ``PyPDFLoader`` and return its split documents.

    Args:
        document_path: Location of the PDF file to read.

    Returns:
        Whatever ``PyPDFLoader(document_path).load_and_split()`` yields.
    """
    return PyPDFLoader(document_path).load_and_split()
# Source PDF for the knowledge base.
document = load_document(
    "./Documents/Practical Statistics for Data Scientists.pdf"
)

In [11]:
def split_text(pages, chunk_size=10000, chunk_overlap=1000):
    """Merge page contents into one string and cut it into overlapping chunks.

    Args:
        pages: Iterable of objects exposing a ``page_content`` attribute.
        chunk_size: Value passed to the splitter as ``chunk_size``
            (default 10000).
        chunk_overlap: Value passed to the splitter as ``chunk_overlap``
            (default 1000).

    Returns:
        The list of text chunks produced by
        ``RecursiveCharacterTextSplitter.split_text``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    # Pages are joined with a blank line between them before splitting.
    # The loop variable is named `page` so it cannot be confused with the
    # `Path as p` alias imported at the top of the notebook.
    context = "\n\n".join(str(page.page_content) for page in pages)
    return text_splitter.split_text(context)
# Chunk the loaded document with the default splitter settings.
texts = split_text(document)

### Remove Symbols and Emojis

In [12]:
import re
# Compiled once when the cell runs. Only emoji-bearing ranges are listed: a
# single span from U+24C2 to U+1F251 would also swallow CJK, Hangul, kana and
# the "fi"/"fl" ligatures (U+FB01/U+FB02) that PDF text extraction commonly
# emits, silently corrupting ordinary words.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed characters, flags
    "\U00002600-\U000027BF"  # miscellaneous symbols and dingbats
    "\U000024C2"             # circled latin capital letter M
    "\U0000FE0F"             # emoji variation selector
    "]+"
)


def remove_emojis(string):
    """Return ``string`` with emoji and pictographic symbols stripped out.

    Args:
        string: Text to clean.

    Returns:
        str: A copy of ``string`` in which every run of characters matched
        by ``_EMOJI_PATTERN`` has been deleted. Letters of any script,
        digits, punctuation and typographic ligatures are left untouched.
    """
    return _EMOJI_PATTERN.sub('', string)
# Clean each chunk in place so `texts` keeps referring to the same list object.
for idx, chunk in enumerate(texts):
    texts[idx] = remove_emojis(chunk)

### Get Vector

In [34]:
import langchain.vectorstores as vectorstores

In [42]:
def vector_index(texts, embeddings, persist_directory="./Database", k=5):
    """Embed ``texts`` into a persisted Chroma store and return a retriever.

    The texts are embedded exactly once: the same store that is written to
    ``persist_directory`` also backs the returned retriever, so no second
    embedding pass is spent on a throwaway store.

    Args:
        texts: Text chunks to embed and index.
        embeddings: Embedding client handed to ``Chroma.from_texts``.
        persist_directory: Directory in which the Chroma store is saved.
        k: Number of chunks the retriever returns per query (default 5).

    Returns:
        The retriever obtained from ``as_retriever`` on the persisted store.
    """
    # NOTE(review): from_texts presumably appends to whatever already lives in
    # persist_directory, so re-running may store duplicate chunks — confirm.
    db = Chroma.from_texts(texts, embeddings, persist_directory=persist_directory)
    db.persist()
    return db.as_retriever(search_kwargs={"k": k})
# NOTE: this rebinds the name `vector_index` from the function to the value it
# returns; the function is no longer callable until its `def` is executed again.
vector_index = vector_index(texts, embeddings)

In [43]:
# Open the Chroma store located in ./Database with the same embedding client.
vectordb = Chroma(
    embedding_function=embeddings,
    persist_directory="./Database",
)


In [49]:
# Retriever over the reopened store, configured with k = 5.
retriever = vectordb.as_retriever(
    search_kwargs={"k": 5},
)

### QA Retriever

In [51]:
def qa_chain(vector_index, model):
    """Assemble a ``RetrievalQA`` chain of type "stuff".

    Args:
        vector_index: Object passed to the chain as its ``retriever``.
        model: Language model passed as the first positional argument.

    Returns:
        The chain built by ``RetrievalQA.from_chain_type``, configured with
        ``return_source_documents=False``.
    """
    return RetrievalQA.from_chain_type(
        model,
        chain_type="stuff",
        retriever=vector_index,
        return_source_documents=False,
    )
# NOTE: this rebinds the name `qa_chain` from the builder function to the chain
# it returns; the function is no longer callable until its `def` is executed again.
qa_chain = qa_chain(retriever, model)

In [26]:
from IPython.display import display
from IPython.display import Markdown
import textwrap

def ask_question(question, qa_chain):
    """Run ``question`` through ``qa_chain`` and return the answer.

    Args:
        question: Query text, sent to the chain under the ``"query"`` key.
        qa_chain: Callable chain; it is invoked with the input mapping and
            ``return_only_outputs=True``.

    Returns:
        The value stored under ``'result'`` in the chain's output.
    """
    chain_input = {"query": question}
    outputs = qa_chain(chain_input, return_only_outputs=True)
    return outputs['result']

In [52]:
# Sample query sent through the QA chain.
question = "What is the difference between a histogram and a bar chart?"
ans = ask_question(question, qa_chain)




In [53]:
# Print the chain's answer as plain text.
print(ans)

A histogram groups values for a single numeric variable into bins and displays the bins as contiguous bars.  A bar chart displays the frequency or proportion of categorical data as separate bars. 



: 