In [1]:
import os
from dotenv import load_dotenv
import json
import pandas as pd
import re
import chromadb
from chromadb import Documents, EmbeddingFunction, Embeddings
from chromadb.config import Settings
from tqdm import tqdm
# Load environment variables from .env file
load_dotenv()
gemini_api_key = os.getenv("GEMINI_API_KEY")

In [2]:
import google.generativeai as genai

genai.configure(api_key=gemini_api_key)

model = genai.GenerativeModel('gemini-1.5-pro')

In [3]:
class GeminiEmbeddingFunction(EmbeddingFunction):
  def __call__(self, input: Documents) -> Embeddings:
    model = 'models/embedding-001'
    title = "Custom query"
    return genai.embed_content(model=model,
                                content=input,
                                task_type="retrieval_document",
                                title=title)["embedding"]

In [4]:
def create_chromadb(list_of_words):


    client = chromadb.PersistentClient(path="vectorSearchForFarmGenie")
    client.delete_collection(name='embeddings_for_farm_book')
    chroma_db = client.create_collection('embeddings_for_farm_book', metadata={"hnsw:space": "cosine"}, embedding_function=GeminiEmbeddingFunction())

    for i, d in tqdm(enumerate(list_of_words)):

        chroma_db.add(
            documents=d,
            ids = str(i),
            # metadatas={'Timestamp_{}'.format(i): str(timestamps[i][0]) + '_' + str(timestamps[i][1])}
            # metadatas = {'question': d}
        )

    return chroma_db

In [5]:
def CreateEmbeddings(query):
    model = 'models/embedding-001'
    embedding = genai.embed_content(model=model,
                                content=query,
                                task_type="retrieval_document",
)
    return embedding

In [6]:
def load_chromadb(collection_name):
    client = chromadb.PersistentClient(path="vectorSearchForFarmGenie")
    chroma_db = client.get_collection(collection_name)


    return chroma_db 


In [47]:
import fitz 
import re

doc = fitz.open('data/Books/farmerbook.pdf') 
text = "" 
for i, page in enumerate(doc):
    if( i>=8 and i<=138):
        blocks = page.get_text("blocks")
    
        # Sort blocks by their top-left y-coordinate
        blocks.sort(key=lambda b: b[1])

        # Concatenate the text from the sorted blocks
        for block in blocks:
            text += block[4] + "\n"

# Remove extra spaces and newlines
# text = re.sub(r'\s+', ' ', text).strip()

# # Fix spaces within words and around colons
# text = re.sub(r'(\w)\s+(\w)', r'\1 \2', text)
# text = re.sub(r'\s*:\s*', ': ', text)

# # Fix cases where colons are separated by newlines
# text = re.sub(r'(\w)\s*:\s*(\w)', r'\1: \2', text)
# text = re.sub(r'(\w)\s*\n\s*:\s*(\w)', r'\1: \2', text)

# print(text)

In [10]:

from pypdf import PdfReader 
import re
reader = PdfReader('../data/Books/farmerbook.pdf') 

# print(len(reader.pages)) 

# page = reader.pages[143] 

# print(page.extract_text()) 
text = ""
for i, doc in enumerate(reader.pages):
    if(i >= 8 and i < 144):
        page = reader.pages[i]
        data = page.extract_text()
        text += data
# Remove extra spaces and newlines
text = re.sub(r'\s+', ' ', text).strip()

# Fix spaces within words and around colons
text = re.sub(r'(\w)\s+(\w)', r'\1 \2', text)
text = re.sub(r'\s*:\s*', ': ', text)

# Fix cases where colons are separated by newlines
text = re.sub(r'(\w)\s*:\s*(\w)', r'\1: \2', text)
text = re.sub(r'(\w)\s*\n\s*:\s*(\w)', r'\1: \2', text)


In [11]:
print(text)



In [12]:
from langchain_text_splitters import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder(
    model_name="gpt-4",
    chunk_size= 400,
    chunk_overlap=20,
)
chunks = text_splitter.split_text(text)

In [13]:
len(chunks)

156

In [14]:
import pickle
pickle.dump(chunks, open("chunks.pkl", "wb"))

In [46]:
# len(texts)
for i in chunks:
    print(i)
    print('--------------------------------------------------------------------')

General Conditions for Cultivation of Crops Farmer’s Handbook on Basic Agriculture 1Agricultural Universities, Research Institutes, Krishi Vigyan Kendras have been generat- ing ample technologies to improve the productiv-ity and profitability of the farmers. How many of these technologies are reaching the farmers? Base-line Situation Assessment conducted by Partner - ship farming in India, in Gujarat and Maharashtra, clearly indicated that farmers with access to techni-cal knowledge on agriculture realized better income compared to others. Fifty one percent of sample farmers who were part of partnership farming In-dia had knowledge of soil testing compared to only 28% of control group. Mulching and intercropping as a practice were not widely adopted by control group of farmers. They were less aware about other organic fertilizers. Fertigation as a method of appli-cation of fertilizers was not widely practiced by con-trol group. Farmers who accessed information from agricultural univers

In [57]:
list_of_words = chunks

In [58]:
db = create_chromadb(list_of_words=list_of_words)

156it [01:15,  2.08it/s]


In [7]:

collection_name = 'embeddings_for_farm_book'
db = load_chromadb(collection_name=collection_name)

In [8]:
query = "What kind of bookworms and catepillars appear due to climate change?"
query_embeddings = CreateEmbeddings(query)
query_embeddings = query_embeddings['embedding']
# query_embeddings = create_embeddings(query)
result = db.query(query_embeddings=query_embeddings, n_results=1)
result

{'ids': [['6']],
 'distances': [[0.360317587852478]],
 'metadatas': [[None]],
 'embeddings': None,
 'documents': [['due to climate change • Due to increase in rainfall: Pests like bollworm, red hairy caterpillar and leaf spot diseases may increase.Due to increase in temperature: Suck-ing pests such as mites and leaf miner may in-crease. • Due to variation in rainfall and temperature: Pest and diseases of crops to be altered because of more enhanced pathogen and vector devel-opment, rapid pathogen transmission and in-creased host susceptibility. Sometimes a minor pest may become a major pest. • Agricultural biodiversity is also threatened by decreased rainfall and increased temperature, sea level rise and increased frequency and se- verity of drought, cyclone and flood. Quality of farm products such as fruits, vegetables, tea, coffee, aromatic and medicinal plants may be affected. Water • Demand for irrigation to increase with in-creased temperature and higher amount of evapo-transpirat