#### Install Packages

In [None]:
!pip install langchain_google_genai
!pip install langchain
!pip install unstructured
!pip install chromadb

In [None]:
# unzip dir of text files
!unzip "/content/data.zip"

In [40]:
# Standard-library packages
import os
from typing import List
import warnings
warnings.filterwarnings("ignore")

# Third party packages
from tqdm import tqdm
import google.generativeai as genai
from langchain_google_genai import GoogleGenerativeAIEmbeddings, GoogleGenerativeAI
from langchain.chains.retrieval_qa.base import RetrievalQA
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import TextLoader, DirectoryLoader

#### Load and process all files

In [13]:
# Gather every .txt file found anywhere under ./data (recursive glob)
# and load them all into a single list of documents.
loader = DirectoryLoader(path = './data', glob = "**/*.txt")
docs = loader.load()

#### Convert text data into chunks

In [14]:
# Split the loaded documents into small overlapping chunks:
# chunk_size = 100 and chunk_overlap = 50, so neighbouring chunks share context.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap = 50,
    chunk_size = 100,
)
texts = text_splitter.split_documents(documents = docs)

#### Store all text chunks in the database as embeddings

In [27]:
# Directory (relative to the working directory) where Chroma stores its data.
persist_directory = "db"

# Read the Google API key from the environment rather than embedding it in the
# notebook: a key saved in a notebook is exposed to everyone the file is shared
# with. Any key that has ever been pasted into a notebook should be rotated.
google_api_key = os.environ.get("GOOGLE_API_KEY")
if not google_api_key:
    raise RuntimeError(
        "GOOGLE_API_KEY is not set. Set the environment variable "
        "(e.g. os.environ['GOOGLE_API_KEY'] = getpass.getpass()) before running this cell."
    )

# Embedding model used both to index the chunks and, later, to embed queries.
embedding_model = GoogleGenerativeAIEmbeddings(model = "models/embedding-001", google_api_key = google_api_key)

# Embed every chunk and write the resulting vectors into the Chroma store.
vector_db = Chroma.from_documents(documents = texts,
                                  embedding = embedding_model,
                                  persist_directory = persist_directory)

In [30]:
# Write the vector store to disk, then drop the in-memory reference.
vector_db.persist()
vector_db = None

# Re-open the store from the persisted directory; from here on it is used
# exactly like the freshly built one.
vector_db = Chroma(
    embedding_function = embedding_model,
    persist_directory = persist_directory,
)
vector_db

<langchain_community.vectorstores.chroma.Chroma at 0x78eeea520460>

#### Retrieve data from the database

In [65]:
# Expose the vector store as a retriever that returns 2 chunks per query (k = 2).
retriever = vector_db.as_retriever(search_kwargs = dict(k = 2))

#### Make a chain for Q & A

In [66]:
# Create the chain that answers questions from the retrieved chunks.
# The API key is read from the GOOGLE_API_KEY environment variable so that no
# credential is stored in the notebook (raises KeyError if it is not set).
# return_source_documents=True keeps the retrieved chunks in the chain's result
# alongside the answer.
qa_chain = RetrievalQA.from_chain_type(llm = GoogleGenerativeAI(model = "gemini-pro", google_api_key = os.environ["GOOGLE_API_KEY"]),
                                  chain_type="stuff",
                                  retriever=retriever,
                                  return_source_documents=True)

In [67]:
def process_llm_response(llm_response, show_sources=False):
    """Print the answer from a QA chain response and, optionally, its sources.

    Args:
        llm_response: Mapping returned by the QA chain. Must contain the key
            'result' (the answer text). When ``show_sources`` is true, the
            optional key 'source_documents' is read as an iterable of objects
            exposing a ``metadata`` dict.
        show_sources: If true, also print the ``metadata['source']`` value of
            each source document. Defaults to False, which prints only the
            answer.

    Returns:
        None. The output is written to stdout.

    Raises:
        KeyError: If ``llm_response`` has no 'result' key.
    """
    print("Answer : \n")
    print(llm_response['result'])

    if show_sources:
        print("\nSources :")
        # A missing or empty 'source_documents' entry simply lists nothing.
        for doc in llm_response.get("source_documents") or []:
            print(doc.metadata.get("source", "<unknown>"))

In [68]:
# Negative example: the data contains nothing about this question, so the
# chain is expected to reply that it does not know.
query = "How much money did Pando raise?"
# Use .invoke() with the chain's "query" input key; calling the chain object
# directly (qa_chain(query)) is deprecated in LangChain.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response)

Answer : 

I don't know. The provided text does not mention how much money Pando raised.


In [69]:
# Positive example: ask about the "Hotspot users gain free net calls" news item.
query = "What is the news of Hotspot users gain free net calls?"
# Use .invoke() with the chain's "query" input key; calling the chain object
# directly (qa_chain(query)) is deprecated in LangChain.
llm_response = qa_chain.invoke({"query": query})
process_llm_response(llm_response)

Answer : 

People using wireless net hotspots will soon be able to make free phone calls as well as surf the internet.
