In [13]:
import os
import re
import json
from pathlib import Path


class RAGDataIngestion:
    """Convert a folder tree of markdown disease notes into structured records.

    The directory walk treats every sub-directory of ``base_dir`` as a crop and
    every ``*.md`` file inside it as one disease; the file stem becomes the
    disease name. From each file the ``### Symptoms``, ``### Cause`` and
    ``### Management`` sections are extracted.
    """

    def __init__(self, base_dir: str, output_path: str):
        """Remember where the knowledge base lives and where output should go.

        Args:
            base_dir: Root directory containing one sub-directory per crop.
            output_path: Destination path for the JSON export. It is only
                stored here; ``save_to_json`` takes its path as an argument.
        """
        self.base_dir = base_dir
        self.output_path = output_path

    # Function to extract raw text from a markdown file
    def _extract_text(self, md_path: Path) -> str:
        """Return the full UTF-8 decoded content of ``md_path``."""
        with open(md_path, "r", encoding="utf-8") as file:
            return file.read()

    # Function to format markdown content into structured JSON
    def _structure_entry(self, crop: str, disease: str, content: str) -> dict:
        """Build one record from the markdown text of a disease file.

        Args:
            crop: Crop name stored under the ``"crop"`` key.
            disease: Disease name stored under the ``"disease"`` key.
            content: Raw markdown text to pull the sections from.

        Returns:
            A dict with the keys ``crop``, ``disease``, ``symptoms``, ``cause``
            and ``management``. A section that is missing or empty yields ``""``.
        """
        flags = re.DOTALL | re.IGNORECASE | re.MULTILINE

        def _extract_section(text: str, header: str) -> str:
            # The header line may carry trailing blanks (or a stray "\r").
            # The body then runs lazily up to the next line that starts with
            # "### " or to the end of the text. Anchoring the terminator with
            # "^" instead of consuming newlines keeps an empty section from
            # swallowing the header and body of the section that follows it.
            pattern = rf"### {re.escape(header)}[ \t\r]*$(.*?)(?=^### |\Z)"
            match = re.search(pattern, text, flags)
            return match.group(1).strip() if match else ""

        return {
            "crop": crop,
            "disease": disease,
            "symptoms": _extract_section(content, "Symptoms"),
            "cause": _extract_section(content, "Cause"),
            "management": _extract_section(content, "Management"),
        }

    # Walk through the knowledge base directory and process markdown files
    def ingest_knowledge_base(self) -> list:
        """Parse every ``<base_dir>/<crop>/<disease>.md`` file into a record.

        Entries directly under ``base_dir`` that are not directories are
        skipped. Crop folders and the files inside them are visited in sorted
        order so the result does not depend on filesystem listing order.

        Returns:
            A list of dicts as produced by ``_structure_entry``.
        """
        data = []

        for crop_folder in sorted(Path(self.base_dir).iterdir()):
            if not crop_folder.is_dir():
                continue

            crop_name = crop_folder.name
            for md_file in sorted(crop_folder.glob("*.md")):
                raw_content = self._extract_text(md_file)
                data.append(self._structure_entry(crop_name, md_file.stem, raw_content))

        return data

    # Save data to a JSON file
    @staticmethod
    def save_to_json(data: list, output_path: str):
        """Write ``data`` to ``output_path`` as indented, non-ASCII-preserving JSON."""
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(data, f, indent=2, ensure_ascii=False)


In [14]:
import os

# The knowledge base is looked up two directory levels above the current
# working directory; adjust the relative hops if the layout differs.
current_dir = os.getcwd()
data_path = os.path.abspath(
    os.path.join(current_dir, "..", "..", "knowledge_base")
)

# Parse every markdown file under data_path into a list of dict records
rag = RAGDataIngestion(
    base_dir=data_path,
    output_path="./formatted_knowledge.json",
)
json_data = rag.ingest_knowledge_base()


In [17]:
# Sanity check: show the first two parsed records
print(json_data[:2])

[{'crop': 'Cashew', 'disease': 'anthracnose', 'symptoms': '- Sunken, reddish-brown lesions on main stalk, branches, and immature fruits.\n- Crinkling of flower buds and young flowers.\n- Black lesions on immature fruits.', 'cause': '- Fungal infection (Colletotrichum species).', 'management': '- Use healthy, disease-free seedlings.\n- Prune and remove dead or diseased twigs, branches, and plant debris.\n- Ensure proper spacing and aeration.\n- Apply mancozeb or folpet fungicides as recommended.\n- Use insecticides to control sucking insects if needed.'}, {'crop': 'Cashew', 'disease': 'gummosis', 'symptoms': '- Cankers on trunk and branches, oozing gum-like resin.\n- Dieback of branches, yellowing of leaves, slow growth.', 'cause': '- Fungal infection (Lasiodiplodia theobromae), often in stressed trees.', 'management': '- Prune and destroy infected branches.\n- Surgically remove cankers.\n- Apply copper-based fungicides.\n- Use resistant clones and proper management practices.'}]


In [None]:
import os
from dotenv import load_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import FAISS

# Read the .env file into the process environment. override=True is passed so
# that values from the file replace variables that are already set.
load_dotenv(override=True)
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Embedding client used to vectorise the text chunks
embedding_model = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)

# Render each record as one plain-text chunk (crop, disease, then the three sections)
texts = list(
    map(
        lambda entry: f"Crop: {entry['crop']}\nDisease: {entry['disease']}\n\nSymptoms:\n{entry['symptoms']}\n\nCause:\n{entry['cause']}\n\nManagement:\n{entry['management']}",
        json_data,
    )
)



In [29]:

# Embed every text chunk and build the FAISS index from the result
faiss_index = FAISS.from_texts(
    texts,
    embedding_model,
)

# Persist the index to the "faiss_index" directory
faiss_index.save_local("faiss_index")

In [33]:

# Reload the index from the "faiss_index" directory.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading pickled
# data; keep it only for index files produced by this notebook itself.
faiss_index = FAISS.load_local(
    "faiss_index",
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

# Fetch the single closest chunk for the question
query = "what is cassava green mite?"
results = faiss_index.similarity_search(query, k=1)

# Print each hit with a 1-based rank
for i, doc in enumerate(results):
    print(f"\nResult {i+1}:\n{doc.page_content}")




Result 1:
Crop: Cassava
Disease: green_mite

Symptoms:
- Green to yellowish mites, barely visible, appear as tiny spots.
- Feed on young leaves and sprouts.
- Leaves develop chlorotic spots, become mottled, die, and may fall off, leaving shoot tip like a "candle stick."
- Roots show stunted bulking and lack of fresh growth.
- Symptoms can resemble cassava mosaic virus.

Cause:
- Cassava green mite (Mononychellus tanajoa), spread by human activity, wind, and water.

Management:
- Use resistant varieties and clean planting material.
- Plant early in rainy season.
- Avoid planting downwind from infested fields.
- Limit movement in infested fields.
- Intercrop with pigeon pea.
- Encourage natural predators by reducing chemical use.
- Rotate with non-host crops.
- Monitor regularly and destroy infested stems.
- Release natural enemies (predatory mites/insects).
- Remove and destroy infested tips.
- Use garlic or onion-based sprays as described.


In [None]:
# Second spot check: the query spells "army worm" as two words
query = "what is fall army worm?"
results = faiss_index.similarity_search(query, k=1)

# Print each hit with a 1-based rank
for i, doc in enumerate(results):
    print(f"\nResult {i+1}:\n{doc.page_content}")



Result 1:
Crop: Maize
Disease: fall_armyworm

Symptoms:
- Feeds on more than 80 plant species, including maize.
- Damages leaves, growing points, and cobs.
- Caterpillars hide in foliage during the day, feed at night.
- Severe feeding can kill growing points and bore into cobs.

Cause:
- Fall armyworm (Spodoptera frugiperda) infestation.

Management:
- Monitor fields for early detection.
- Apply insecticides directly into plant funnel at early larval stages.
- Use Bacillus thuringiensis-based biological pesticides.
- Ground sprays over rows are more effective than general sprays.


In [41]:
# Wrap the index as a retriever that returns the single most similar chunk
retriever = faiss_index.as_retriever(search_type="similarity", search_kwargs={"k": 1})

# `invoke` is the Runnable entry point that replaces the deprecated
# `get_relevant_documents`; it returns the same list of Document objects.
rel_docs = retriever.invoke("what is fall army worm?")

# Keep only the text of each retrieved document
contents = [doc.page_content for doc in rel_docs]


In [42]:
# Bare last expression: the notebook displays the list of retrieved texts as the cell output
contents

['Crop: Maize\nDisease: fall_armyworm\n\nSymptoms:\n- Feeds on more than 80 plant species, including maize.\n- Damages leaves, growing points, and cobs.\n- Caterpillars hide in foliage during the day, feed at night.\n- Severe feeding can kill growing points and bore into cobs.\n\nCause:\n- Fall armyworm (Spodoptera frugiperda) infestation.\n\nManagement:\n- Monitor fields for early detection.\n- Apply insecticides directly into plant funnel at early larval stages.\n- Use Bacillus thuringiensis-based biological pesticides.\n- Ground sprays over rows are more effective than general sprays.']