<a href="https://colab.research.google.com/github/ykhier/Cloud_Course/blob/main/Lab7_RAG1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

📚 Complete Cell Structure
🔧 Setup Cells (1-3)

Cell 1: Package Installation with progress tracking
Cell 2: Import Libraries with fallback detection
Cell 3: Vector Store Classes (Simple fallback)

🧠 Core System Cells (4-6)

Cell 4: RAG System Core Class
Cell 5: Data Loading Methods
Cell 6: Search and Query Methods

📊 Data & Interface Cells (7-9)

Cell 7: Sample home-plant disease papers for testing
Cell 8: Initialize RAG System
Cell 9: Simple Query Interface

🔄 Optional Enhancement Cells (10-11)

Cell 10: Load Your Own Papers (optional)
Cell 11: Gradio Web Interface (optional)

📈 Analytics & Advanced Cells (12-14)

Cell 12: Analytics and Evaluation
Cell 13: Advanced Query Features
Cell 14: System Summary and Testing

In [None]:
# CELL 2: Import Libraries and Check Dependencies
# ==============================================
"""
üìö CELL 2: IMPORT LIBRARIES
Run this cell to import all required libraries and check what's available.
"""

import json
import pandas as pd
import numpy as np
from typing import List, Dict, Any, Optional
import re
import time

# Check what packages are available
print("üîç Checking available packages...")

# ChromaDB
try:
    import chromadb
    CHROMADB_AVAILABLE = True
    print("‚úÖ ChromaDB: Available")
except ImportError:
    CHROMADB_AVAILABLE = False
    print("‚ùå ChromaDB: Not available (will use fallback)")

# SentenceTransformers
try:
    from sentence_transformers import SentenceTransformer
    TRANSFORMERS_AVAILABLE = True
    print("‚úÖ SentenceTransformers: Available")
except ImportError:
    TRANSFORMERS_AVAILABLE = False
    print("‚ùå SentenceTransformers: Not available (will use TF-IDF)")

# OpenAI
try:
    import openai
    OPENAI_AVAILABLE = True
    print("‚úÖ OpenAI: Available")
except ImportError:
    OPENAI_AVAILABLE = False
    print("‚ùå OpenAI: Not available (will use template responses)")

# Gradio for interface
try:
    import gradio as gr
    GRADIO_AVAILABLE = True
    print("‚úÖ Gradio: Available")
except ImportError:
    GRADIO_AVAILABLE = False
    print("‚ùå Gradio: Not available (will use simple interface)")

# Fallback imports
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

print("\nüìã System Status:")
print(f"   Vector DB: {'ChromaDB' if CHROMADB_AVAILABLE else 'Simple Store'}")
print(f"   Embeddings: {'Transformer' if TRANSFORMERS_AVAILABLE else 'TF-IDF'}")
print(f"   Generation: {'OpenAI GPT' if OPENAI_AVAILABLE else 'Template'}")
print(f"   Interface: {'Gradio' if GRADIO_AVAILABLE else 'Simple'}")

print("\nüéØ Ready for Cell 3!")

In [None]:
# CELL 3: Vector Store Classes
# ============================
"""
üóÑÔ∏è CELL 3: VECTOR STORE CLASSES
This cell defines the vector storage classes with fallback options.
"""

class SimpleVectorStore:
    """In-memory vector store used as a fallback when ChromaDB is unavailable.

    Documents are kept in four parallel lists (embeddings, documents,
    metadatas, ids); index ``i`` in every list refers to the same document.
    """

    def __init__(self):
        self.ids = []
        self.metadatas = []
        self.embeddings = []
        self.documents = []
        print("üì¶ SimpleVectorStore initialized")

    def add(self, embeddings, documents, metadatas, ids):
        """Append a batch of documents (and their vectors) to the store."""
        batches = (
            (self.embeddings, embeddings),
            (self.documents, documents),
            (self.metadatas, metadatas),
            (self.ids, ids),
        )
        for stored, incoming in batches:
            stored.extend(incoming)
        print(f"‚úÖ Added {len(documents)} documents to simple vector store")

    def query(self, query_embeddings, n_results=5):
        """Return the ``n_results`` stored documents closest to the query.

        Only the first row of ``query_embeddings`` is ranked. Each result
        field is wrapped in an outer one-element list, and ``distances``
        holds ``1 - cosine_similarity`` (smaller means more similar).
        """
        if not self.embeddings:
            # Nothing stored yet: return the same layout with empty hits.
            return {'ids': [[]], 'documents': [[]], 'metadatas': [[]], 'distances': [[]]}

        scores = cosine_similarity(query_embeddings, self.embeddings)[0]

        # argsort is ascending, so reverse it to rank the best match first.
        best_first = np.argsort(scores)[::-1][:n_results]

        hit_ids, hit_documents, hit_metadatas, hit_distances = [], [], [], []
        for position in best_first:
            hit_ids.append(self.ids[position])
            hit_documents.append(self.documents[position])
            hit_metadatas.append(self.metadatas[position])
            hit_distances.append(1 - scores[position])

        return {
            'ids': [hit_ids],
            'documents': [hit_documents],
            'metadatas': [hit_metadatas],
            'distances': [hit_distances],
        }

    def count(self):
        """Return how many documents are stored."""
        return len(self.documents)

print("‚úÖ Vector store classes defined!")
print("üìã Next: Run Cell 4 for RAG system core")

In [None]:
# CELL 4: RAG System Core
# =======================
"""
üß† CELL 4: RAG SYSTEM CORE
This cell defines the main EcologicalRAG class.
"""

class EcologicalRAG:
    """Main RAG system for ecological research papers.

    On construction the class picks, in order of preference:

    * embeddings: SentenceTransformer ``all-MiniLM-L6-v2``, else TF-IDF
      (``max_features=1000``, English stop words);
    * vector store: a ChromaDB collection named ``ecological_papers``,
      else ``SimpleVectorStore``;
    * generation: OpenAI when an API key is supplied and the package is
      importable, else template responses.

    The availability flags (``TRANSFORMERS_AVAILABLE``, ``CHROMADB_AVAILABLE``,
    ``OPENAI_AVAILABLE``) are module-level names set by the import cell.
    """

    # Capitalised word followed by a lowercase word. This is only a rough
    # heuristic for binomial names: it also matches ordinary sentence starts.
    _SPECIES_RE = re.compile(r'\b[A-Z][a-z]+ [a-z]+\b')
    # Longer phrases must precede their prefixes: regex alternation takes the
    # first alternative that matches, so a leading "Plant" would make
    # "Plant disease" unreachable.
    _LOCATION_RE = re.compile(
        r'\b(Plant disease|Home plant|Image detection|Plant)\b', re.IGNORECASE
    )
    _METHOD_RE = re.compile(
        r'\b(Plant|Disease|sequencing|survey|analysis|modeling)\b', re.IGNORECASE
    )

    def __init__(self, openai_api_key=None):
        """Configure embeddings, vector store and response generation.

        Args:
            openai_api_key: Optional OpenAI key; when falsy (or the package
                is missing) template responses are used instead.
        """
        print("üåä Initializing Ecological RAG System...")

        # Setup embedding model
        self.use_transformers = False
        if TRANSFORMERS_AVAILABLE:
            try:
                self.embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
                self.use_transformers = True
                print("‚úÖ Loaded SentenceTransformer embeddings")
            except Exception:
                # Model could not be loaded; degrade to TF-IDF. ``Exception``
                # (not a bare except) keeps Ctrl-C / SystemExit working.
                self.tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
                print("‚ö†Ô∏è Using TF-IDF embeddings (fallback)")
        else:
            self.tfidf = TfidfVectorizer(max_features=1000, stop_words='english')
            print("‚ö†Ô∏è Using TF-IDF embeddings")

        # Setup vector store
        if CHROMADB_AVAILABLE:
            try:
                client = chromadb.Client()
                try:
                    self.collection = client.get_collection("ecological_papers")
                    print("‚úÖ Loaded existing ChromaDB collection")
                except Exception:
                    # Collection does not exist yet in this client.
                    self.collection = client.create_collection("ecological_papers")
                    print("‚úÖ Created new ChromaDB collection")
                self.use_chromadb = True
            except Exception:
                self.collection = SimpleVectorStore()
                self.use_chromadb = False
                print("‚ö†Ô∏è Using simple vector store (fallback)")
        else:
            self.collection = SimpleVectorStore()
            self.use_chromadb = False
            print("‚ö†Ô∏è Using simple vector store")

        # Setup OpenAI
        if openai_api_key and OPENAI_AVAILABLE:
            openai.api_key = openai_api_key
            self.use_openai = True
            print("‚úÖ OpenAI configured")
        else:
            self.use_openai = False
            print("‚ö†Ô∏è Using template responses")

        self.papers = []
        # Whether the TF-IDF vectorizer has been fitted (TF-IDF mode only).
        self.fitted = False
        print("üéâ RAG system ready!")

    @staticmethod
    def _first_unique(matches, limit=3, ignore_case=False):
        """Return up to ``limit`` distinct matches in first-seen order.

        Unlike ``list(set(matches))[:limit]`` the result is deterministic
        across runs, and with ``ignore_case`` case variants such as
        ``Plant``/``plant`` count as one entry.
        """
        seen = set()
        unique = []
        for match in matches:
            key = match.casefold() if ignore_case else match
            if key in seen:
                continue
            seen.add(key)
            unique.append(match)
            if len(unique) == limit:
                break
        return unique

    def preprocess_text(self, text: Optional[str]) -> str:
        """Clean text for better processing.

        Replaces every character that is not a word character, whitespace,
        ``-``, ``.``, ``(`` or ``)`` with a space, then collapses whitespace
        runs to single spaces and trims the ends. Falsy input returns ``""``.
        """
        if not text:
            return ""
        # Strip punctuation first: doing it after the whitespace collapse
        # would re-introduce runs of spaces where punctuation was removed.
        text = re.sub(r'[^\w\s\-\.\(\)]', ' ', text)
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def extract_entities(self, text: str) -> Dict[str, List[str]]:
        """Extract keyword "entities" from text with simple regexes.

        Returns:
            A dict with keys ``species``, ``locations`` and ``methods``,
            each holding at most three distinct matches in order of first
            appearance.
        """
        return {
            'species': self._first_unique(self._SPECIES_RE.findall(text)),
            'locations': self._first_unique(
                self._LOCATION_RE.findall(text), ignore_case=True
            ),
            'methods': self._first_unique(
                self._METHOD_RE.findall(text), ignore_case=True
            ),
        }

    def generate_embeddings(self, texts):
        """Generate embeddings using the available method.

        In TF-IDF mode the vectorizer is fitted on the first batch of texts
        it ever receives and only transforms afterwards, so the first call
        fixes the vocabulary for all later calls.
        """
        if self.use_transformers:
            return self.embedding_model.encode(texts, show_progress_bar=True)

        if not self.fitted:
            self.tfidf.fit(texts)
            self.fitted = True
        return self.tfidf.transform(texts).toarray()

print("‚úÖ RAG core class defined!")
print("üìã Next: Run Cell 5 for data loading methods")

In [None]:
# CELL 5: Data Loading Methods
# ============================
"""
üìö CELL 5: DATA LOADING METHODS
This cell adds data loading capabilities to the RAG system.
"""

def add_load_papers_method():
    """Attach a ``load_papers`` method to the ``EcologicalRAG`` class."""

    def _as_text(value):
        """Return ``value`` as a string, mapping ``None`` to ``''``."""
        return '' if value is None else str(value)

    def _meta_value(paper, key, default):
        """Return ``paper[key]`` in a form that is safe to store as metadata.

        A missing key or an explicit ``None`` (common in scraped JSON)
        yields ``default``; list/tuple values are joined into one string.
        NOTE(review): ChromaDB is understood to accept only scalar metadata
        values - confirm against the installed version.
        """
        value = paper.get(key)
        if value is None:
            return default
        if isinstance(value, (list, tuple)):
            return ', '.join(str(item) for item in value)
        return value

    def load_papers(self, papers_data):
        """Load papers into the RAG system.

        Args:
            papers_data: Iterable of dicts with at least an ``abstract``;
                ``title``, ``authors``, ``journal``, ``year`` and ``doi``
                are optional. ``None`` is treated as an empty collection.

        Papers without a non-blank abstract, or whose cleaned
        title+abstract text is shorter than 50 characters, are skipped.
        Returns ``None``; progress is reported via ``print``.
        """
        # ``load_collected_papers`` returns None on failure; tolerate that.
        papers_data = papers_data or []
        print(f"üìö Loading {len(papers_data)} papers...")

        # ``.get('abstract', '')`` would still return None when the key is
        # present with a null value, so normalise before stripping.
        valid_papers = [
            p for p in papers_data if _as_text(p.get('abstract')).strip()
        ]
        print(f"üìñ Found {len(valid_papers)} papers with abstracts")

        if not valid_papers:
            print("‚ùå No valid papers found!")
            return

        documents, metadatas, ids = [], [], []

        for i, paper in enumerate(valid_papers):
            # Combine title and abstract
            text = f"{_as_text(paper.get('title'))} {_as_text(paper.get('abstract'))}"
            text = self.preprocess_text(text)

            if len(text) < 50:
                continue

            entities = self.extract_entities(text)

            metadata = {
                'title': _meta_value(paper, 'title', 'Unknown'),
                'authors': _meta_value(paper, 'authors', 'Unknown'),
                'journal': _meta_value(paper, 'journal', 'Unknown'),
                'year': _meta_value(paper, 'year', 2022),
                'doi': _meta_value(paper, 'doi', ''),
                'species': ', '.join(entities['species']),
                'locations': ', '.join(entities['locations']),
                'methods': ', '.join(entities['methods'])
            }

            documents.append(text)
            metadatas.append(metadata)
            # The id keeps the paper's position in ``valid_papers``, so ids
            # are not renumbered when short texts are skipped.
            ids.append(f"paper_{i}")

        if not documents:
            print("‚ùå No processable documents found!")
            return

        # Generate embeddings
        print("üîÑ Generating embeddings...")
        embeddings = self.generate_embeddings(documents)

        # Add to vector store
        print("üíæ Adding to vector store...")
        if self.use_chromadb:
            # The ChromaDB path is given plain lists rather than an array.
            embeddings = embeddings.tolist()
        self.collection.add(
            embeddings=embeddings,
            documents=documents,
            metadatas=metadatas,
            ids=ids
        )

        self.papers = valid_papers
        print(f"‚úÖ Successfully loaded {len(documents)} papers!")

    # Add method to class
    EcologicalRAG.load_papers = load_papers

# Apply the method
add_load_papers_method()

print("‚úÖ Data loading methods added!")
print("üìã Next: Run Cell 6 for search and query methods")

In [None]:
# CELL 6: Search and Query Methods
# ================================
"""
üîç CELL 6: SEARCH AND QUERY METHODS
This cell adds search and response generation to the RAG system.
"""

def add_search_methods():
    """Attach search, response-generation and query methods to ``EcologicalRAG``."""

    def search(self, query, n_results=3):
        """Search for relevant papers.

        Returns a dict with ``ids``, ``documents``, ``metadatas`` and
        ``distances``, each a list holding one inner list of hits.
        """
        if self.collection.count() == 0:
            # Nothing indexed yet. Returning early also avoids fitting the
            # TF-IDF vectorizer on the query alone, which would freeze a
            # useless vocabulary for every paper loaded afterwards.
            return {'ids': [[]], 'documents': [[]], 'metadatas': [[]], 'distances': [[]]}

        query_processed = self.preprocess_text(query)
        query_embedding = self.generate_embeddings([query_processed])

        if self.use_chromadb:
            # The ChromaDB path is given plain lists rather than an array.
            query_embedding = query_embedding.tolist()

        return self.collection.query(
            query_embeddings=query_embedding,
            n_results=n_results
        )

    def _generate_openai_response(self, query, papers, search_results):
        """Generate response using OpenAI.

        At most the top three papers are put into the prompt, each with the
        first 400 characters of its document text. Any exception from the
        API call is reported in the returned text, followed by the template
        response.
        """
        context = "\n\n".join(
            f"Paper: {paper['title']}\n"
            f"Authors: {paper['authors']}\n"
            f"Content: {document[:400]}..."
            for paper, document in zip(papers[:3], search_results['documents'][0])
        )

        prompt = f"""You are an expert ecologist. Answer this question based on the research provided:

Question: {query}

Research Papers:
{context}

Provide a comprehensive answer citing the research. Focus on Plant disease and Home plants ecosystems."""

        request = dict(
            model="gpt-3.5-turbo",
            messages=[
                {"role": "system", "content": "You are an expert ecologist."},
                {"role": "user", "content": prompt}
            ],
            max_tokens=800,
            temperature=0.7
        )

        try:
            if hasattr(openai, 'OpenAI'):
                # openai>=1.0 removed ``openai.ChatCompletion``; requests go
                # through a client object instead.
                client = openai.OpenAI(api_key=openai.api_key)
                response = client.chat.completions.create(**request)
            else:
                # Legacy (<1.0) module-level API.
                response = openai.ChatCompletion.create(**request)
            return response.choices[0].message.content
        except Exception as e:
            return f"OpenAI error: {e}\n\nFalling back to template response:\n\n{self._generate_template_response(query, papers, search_results)}"

    def _generate_template_response(self, query, papers, search_results):
        """Generate template response without OpenAI.

        Lists up to three papers with their metadata, then a summary of the
        species and locations mentioned across all returned papers.
        ``search_results`` is accepted for signature parity and not used.
        """
        parts = [
            f"üîç **Search Results for:** {query}\n\n",
            f"üìä **Found {len(papers)} relevant papers:**\n\n",
        ]

        for i, paper in enumerate(papers[:3]):
            parts.append(f"**{i+1}. {paper['title']}**\n")
            parts.append(f"   üë• Authors: {paper['authors']}\n")
            parts.append(f"   üìñ Journal: {paper['journal']} ({paper['year']})\n")

            if paper.get('species'):
                parts.append(f"   üêü Species: {paper['species']}\n")
            if paper.get('locations'):
                parts.append(f"   üìç Locations: {paper['locations']}\n")
            if paper.get('methods'):
                parts.append(f"   üî¨ Methods: {paper['methods']}\n")

            parts.append(f"   üîó DOI: {paper['doi']}\n\n")

        # Add summary
        all_species = set()
        all_locations = set()
        for paper in papers:
            if paper.get('species'):
                all_species.update(s.strip() for s in paper['species'].split(',') if s.strip())
            if paper.get('locations'):
                all_locations.update(l.strip() for l in paper['locations'].split(',') if l.strip())

        parts.append("üìã **Summary:**\n")
        # Sort so the summary is identical from run to run (set iteration
        # order of strings varies between interpreter sessions).
        if all_species:
            parts.append(f"   üêü Species mentioned: {', '.join(sorted(all_species)[:5])}\n")
        if all_locations:
            parts.append(f"   üìç Study areas: {', '.join(sorted(all_locations))}\n")

        return "".join(parts)

    def generate_response(self, query, search_results):
        """Generate response based on search results.

        Uses OpenAI when configured, otherwise the template; returns a
        fixed "no papers" message when the search produced no documents.
        """
        if not search_results['documents'][0]:
            return "‚ùå No relevant papers found for your query."

        papers = search_results['metadatas'][0]

        if self.use_openai:
            return self._generate_openai_response(query, papers, search_results)
        return self._generate_template_response(query, papers, search_results)

    def query(self, question, n_results=3):
        """Main query function: search, generate a response, and bundle both.

        Returns a dict with ``question``, ``response``, ``papers_found``
        and the raw ``search_results``.
        """
        print(f"üîç Processing: {question}")

        search_results = self.search(question, n_results)
        response = self.generate_response(question, search_results)

        return {
            'question': question,
            'response': response,
            'papers_found': len(search_results['documents'][0]),
            'search_results': search_results
        }

    # Add methods to class
    EcologicalRAG.search = search
    EcologicalRAG._generate_openai_response = _generate_openai_response
    EcologicalRAG._generate_template_response = _generate_template_response
    EcologicalRAG.generate_response = generate_response
    EcologicalRAG.query = query

# Apply the methods
add_search_methods()

print("‚úÖ Search and query methods added!")
print("üìã Next: Run Cell 7 for sample data")

In [None]:
# CELL 7: Sample Data
# ===================
"""
üìä CELL 7: SAMPLE DATA
This cell provides sample papers for testing the system, now focused on home plants.
"""

def get_sample_home_plant_papers():
    """Build, announce and return the five built-in sample paper records.

    Each record is a dict with the keys ``title``, ``authors``, ``journal``,
    ``year``, ``doi`` and ``abstract`` (in that order). Three of the records
    carry the same abstract text, so it is written once and shared.
    """

    field_names = ('title', 'authors', 'journal', 'year', 'doi', 'abstract')

    # Abstract shared by the first, third and fourth records.
    multi_dataset_abstract = 'Agricultural productivity is increasingly threatened by plant diseases, which can spread rapidly and lead to significant crop losses if not identified early. Detecting plant diseases accurately in diverse and uncontrolled environments remains challenging, as most current detection methods rely heavily on lab-captured images that may not generalise well to real-world settings. This paper aims to develop models capable of accurately identifying plant diseases across diverse conditions, overcoming the limitations of existing methods. A combined dataset was utilised, incorporating the PlantDoc dataset with web-sourced images of plants from online platforms. State-of-the-art convolutional neural network (CNN) architectures, including EfficientNet-B0, EfficientNet-B3, ResNet50, and DenseNet201, were employed and fine-tuned for plant leaf disease classification. A key contribution of this work is the application of enhanced data augmentation techniques, such as adding Gaussian noise, to improve model generalisation. The results demonstrated varied performance across the datasets. When trained and tested on the PlantDoc dataset, EfficientNet-B3 achieved an accuracy of 73.31%. In cross-dataset evaluation, where the model was trained on PlantDoc and tested on a web-sourced dataset, EfficientNet-B3 reached 76.77% accuracy. The best performance was achieved with the combination of the PlanDoc and web-sourced datasets resulting in an accuracy of 80.19% indicating very good generalisation in diverse conditions. Class-wise F1-scores consistently exceeded 90% for diseases such as apple rust leaf and grape leaf across all models, demonstrating the effectiveness of this approach for plant disease detection.'

    sbc_abstract = 'This paper investigates the usage of machine learning (ML) algorithms on agricultural images with the aim of extracting information regarding the health of plants. More specifically, a custom convolutional neural network is trained on Google Colab using photos of healthy and unhealthy plants. The trained models are evaluated using various single-board computers (SBCs) that demonstrate different essential characteristics. Raspberry Pi 3 and Raspberry Pi 4 are the current mainstream SBCs that use their Central Processing Units (CPUs) for processing and are used for many applications for executing ML algorithms based on popular related libraries such as TensorFlow. NVIDIA Graphic Processing Units (GPUs) have a different rationale and base the execution of ML algorithms on a GPU that uses a different architecture than a CPU. GPUs can also implement high parallelization on the Compute Unified Device Architecture (CUDA) cores. Another current approach involves using a Tensor Processing Unit (TPU) processing unit carried by the Google Coral Dev TPU Board, which is an Application-Specific Integrated Circuit (ASIC) specialized for accelerating ML algorithms such as Convolutional Neural Networks (CNNs) via the usage of TensorFlow Lite. This study experiments with all of the above-mentioned devices and executes custom CNN models with the aim of identifying plant diseases. In this respect, several evaluation metrics are used, including knowledge extraction time, CPU utilization, Random Access Memory (RAM) usage, swap memory, temperature, current milli Amperes (mA), voltage (Volts), and power consumption milli Watts (mW).'

    agrifusion_abstract = 'Timely and accurate identification of plant diseases is critical to mitigating crop losses and enhancing yield in precision agriculture. This paper proposes AgriFusionNet, a lightweight and efficient deep learning model designed to diagnose plant diseases using multimodal data sources. The framework integrates RGB and multispectral drone imagery with IoT-based environmental sensor data (e.g., temperature, humidity, soil moisture), recorded over six months across multiple agricultural zones. Built on the EfficientNetV2-B4 backbone, AgriFusionNet incorporates Fused-MBConv blocks and Swish activation to improve gradient flow, capture fine-grained disease patterns, and reduce inference latency. The model was evaluated using a comprehensive dataset composed of real-world and benchmarked samples, showing superior performance with 94.3% classification accuracy, 28.5 ms inference time, and a 30% reduction in model parameters compared to state-of-the-art models such as Vision Transformers and InceptionV4. Extensive comparisons with both traditional machine learning and advanced deep learning methods underscore its robustness, generalization, and suitability for deployment on edge devices. Ablation studies and confusion matrix analyses further confirm its diagnostic precision, even in visually ambiguous cases. The proposed framework offers a scalable, practical solution for real-time crop health monitoring, contributing toward smart and sustainable agricultural ecosystems.'

    # One row per paper, in the same column order as ``field_names``.
    rows = [
        (
            'Plant Disease Detection and Classification: A Systematic Review',
            'Filipe Neves dos Santos',
            'Journal of Plant Informatics',
            2023,
            '10.3390/s23104769',
            multi_dataset_abstract,
        ),
        (
            'Smartphone-Based Image Recognition for On-Site Identification of Ornamental Plants',
            'Marios Michailidis',
            'Horticultural Science Research',
            2024,
            '10.3390/electronics13061010',
            sbc_abstract,
        ),
        (
            'Plant Leaf Disease Detection Using Deep Learning: A Multi-Dataset Approach',
            'Manjunatha Shettigere Krishna',
            'Botanical Genetics',
            2024,
            '10.3390/j8010004',
            multi_dataset_abstract,
        ),
        (
            'Plant Leaf Disease Detection Using Deep Learning A Multi-Dataset Approach',
            'Pedro Machado',
            'Agricultural AI Journal',
            2024,
            '10.3390/j8010204',
            multi_dataset_abstract,
        ),
        (
            'AgriFusionNet: A Lightweight Deep Learning Model for Multisource Plant Disease Diagnosis',
            'Saleh Albahli',
            'Agricultural AI Journal',
            2025,
            '10.3390/agriculture15141523',
            agrifusion_abstract,
        ),
    ]

    sample_papers = [dict(zip(field_names, row)) for row in rows]

    print(f"üìö Loaded {len(sample_papers)} sample home plant papers:")
    for number, record in enumerate(sample_papers, start=1):
        print(f"   {number}. {record['title'][:60]}...")

    return sample_papers

# Load sample data
SAMPLE_PAPERS = get_sample_home_plant_papers()

print("\n‚úÖ Sample data ready!")
print("üìã Next: Run Cell 8 to initialize RAG system")

In [None]:
# CELL 8: Initialize RAG System
# =============================
"""
üöÄ CELL 8: INITIALIZE RAG SYSTEM
This cell creates the RAG system and loads the sample papers.
Set your OpenAI API key here if you have one (optional).
"""

# Configuration
# The key is read from the OPENAI_API_KEY environment variable so it never
# has to be pasted into (and shared along with) the notebook. Leave the
# variable unset to use template responses.
import os

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY") or None

# Initialize the RAG system
print("üåä Initializing Ecological RAG System...")
rag_system = EcologicalRAG(openai_api_key=OPENAI_API_KEY)

# Load sample papers
print("\nüìö Loading sample papers into RAG system...")
rag_system.load_papers(SAMPLE_PAPERS)

# Smoke-test the system with one query
print("\nüß™ Testing system with sample query...")
test_result = rag_system.query("What image recognition algorithms can help in detecting plant diseases?")
if test_result['papers_found']:
    print(f"‚úÖ Test successful! Found {test_result['papers_found']} relevant papers")
else:
    # A query that runs but retrieves nothing means the papers did not load.
    print("‚ùå Test query found no papers - check that the sample papers loaded")

print("\nüéâ RAG system is ready!")
print("üìã Next: Run Cell 9 for simple interface or Cell 10 to load your own papers")

In [None]:
# CELL 9: Simple Query Interface
# =============================
"""
üí¨ CELL 9: SIMPLE QUERY INTERFACE
This cell provides a simple interface to query the RAG system.
Copy and run this cell to start asking questions about ecology!
"""

def query_interface():
    """Simple interactive loop for querying the RAG system.

    Reads questions with ``input()`` and sends them to the module-level
    ``rag_system``. ``quit`` ends the session, ``help`` prints example
    questions, and Ctrl-C or end-of-input also end the session.
    """

    print("üåä ECOLOGICAL RAG SYSTEM - QUERY INTERFACE")
    print("=" * 60)
    print("Ask questions about marine and freshwater ecology!")
    print("Type 'quit' to exit, 'help' for examples")
    print("=" * 60)

    while True:
        try:
            # Get user input
            query = input("\nüîç Your question: ").strip()

            if query.lower() == 'quit':
                print("üëã Goodbye!")
                break

            if query.lower() == 'help':
                print("\nüí° Example questions:")
                print("Here are some sample queries derived from the documents:")
                print(" ¬† ‚Ä¢ Compare ResNet50 and VGG16 model accuracy.")
                print(" ¬† ‚Ä¢ What is the final MVGG16 accuracy?")
                print(" ¬† ‚Ä¢ List plant disease monitoring sensors.")
                print(" ¬† ‚Ä¢ What prevents model overfitting generally?")
                print(" ¬† ‚Ä¢ What is the AI-IoT Pivot?")
                print(" ¬† ‚Ä¢ How many diseases did Mohanty classify?")
                print(" ¬† ‚Ä¢ Which crop has 100% detection?")
                print(" ¬† ‚Ä¢ List key mobile app functions.")
                # 'help' is a command, not a question: do not search for it.
                continue

            if not query:
                print("‚ö†Ô∏è Please enter a question")
                continue

            # Process query
            print("\nüîÑ Searching through research papers...")
            result = rag_system.query(query, n_results=3)

            # Display results
            print("\n" + "="*60)
            print(f"üìã RESULTS FOR: {query}")
            print("="*60)
            print(result['response'])
            print("="*60)
            print(f"üìä Found {result['papers_found']} relevant papers")
            print("üí° Type 'help' for more example questions")

        except (KeyboardInterrupt, EOFError):
            # EOFError means input is exhausted (e.g. a non-interactive run);
            # treating it as a generic error would retry input() forever.
            print("\nüëã Goodbye!")
            break
        except Exception as e:
            print(f"‚ùå Error: {e}")
            print("üí° Try a different question or check if the system is properly initialized")
            continue

def single_query(question):
    """Answer one question via the module-level ``rag_system`` and print it.

    Returns the result dict from ``rag_system.query`` on success. Any
    exception is reported with ``print`` and turned into a ``None`` return.
    """
    rule = "=" * 60
    try:
        print(f"üîç Searching for: {question}")
        outcome = rag_system.query(question, n_results=3)

        print(f"\n{rule}")
        print("üìã ANSWER:")
        print(rule)
        print(outcome['response'])
        print(rule)
        print(f"üìä Based on {outcome['papers_found']} research papers")
    except Exception as err:
        print(f"‚ùå Error: {err}")
        return None
    else:
        return outcome

# Quick test of the interface
def test_interface():
    """Run every sample question through ``single_query`` and report each outcome.

    A question counts as successful when ``single_query`` returns a result
    rather than ``None``. Returns ``None``; output goes to stdout.
    """

    # Stray "[cite: N]" markers were removed from these questions so they
    # are not fed to the retriever as part of the query text.
    test_queries = [
        "What is the maximum accuracy achieved by the ResNet50 model for plant disease detection?",
        "Which deep learning architectures were evaluated for identifying apple leaf diseases?",
        "What key advantages does the AI-IoT Smart Agriculture Pivot offer over drones and robotics?",
        "What are the main findings regarding the use of deep learning for image-based plant disease detection in the Mohanty et al. paper?",
        "How does the paper by Mahlein et al. describe the use of thermal sensors (IRT) for disease detection?",
        "List the components and specifications of the controller used in the proposed hardware pilot.",
        "What was the accuracy of the Custom CNN model for classifying potato diseases, and why was it the best model for that crop?"
    ]
    total = len(test_queries)

    print("üß™ Testing interface with sample questions...")

    # Iterate the list defined above and show the real total in the label.
    for i, question in enumerate(test_queries, 1):
        print(f"\n[Test {i}/{total}] {question}")
        result = single_query(question)
        if result:
            print("‚úÖ Success!")
        else:
            print("‚ùå Failed")

    print("\n‚úÖ Interface test completed!")

# Display available functions
print("‚úÖ Simple interface ready!")
print("\nüöÄ Available functions:")
print("   ‚Ä¢ query_interface() - Start interactive questioning")
print("   ‚Ä¢ single_query('your question') - Ask one question")
print("   ‚Ä¢ test_interface() - Test with sample questions")

print("\nüí° Example usage:")
print("   query_interface()  # Start interactive session")
# The printed example must be valid Python, so the question is fully quoted.
print("   single_query('Which crop has 100% detection?')")

print("\nüìã Next: Run Cell 10 to load your own papers (optional)")

In [None]:
# CELL 10: Load Your Own Papers (OPTIONAL)
# ========================================
"""
üìÅ CELL 10: LOAD YOUR OWN PAPERS (OPTIONAL)
Use this cell to load papers you collected with the scraper.
Skip this cell if you want to use the sample data.
"""

def _has_abstract(paper):
    """Return True when *paper* is a dict whose 'abstract' is a non-blank string."""
    if not isinstance(paper, dict):
        return False
    abstract = paper.get('abstract')
    return isinstance(abstract, str) and bool(abstract.strip())


def load_collected_papers(file_path):
    """Load papers from a collected JSON file, keeping those with an abstract.

    Args:
        file_path: Path to a UTF-8 encoded JSON file containing the papers
            (an iterable of dicts, each optionally carrying an 'abstract').

    Returns:
        A list of the paper dicts whose 'abstract' is a non-blank string, or
        None when the file does not exist or cannot be read/parsed (a message
        is printed in both failure cases).
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            papers = json.load(f)

        # Entries with a null, missing, blank or non-string abstract are
        # dropped individually instead of aborting the whole load.
        valid_papers = [p for p in papers if _has_abstract(p)]

        print(f"üìä Loaded {len(papers)} total papers")
        print(f"‚úÖ Found {len(valid_papers)} papers with abstracts")

        return valid_papers

    except FileNotFoundError:
        print(f"‚ùå File {file_path} not found")
        return None
    except Exception as e:
        # Best-effort loader: report the problem and let the caller fall back.
        print(f"‚ùå Error loading papers: {e}")
        return None

def analyze_paper_collection(papers):
    """Print summary statistics for a collection of paper dicts.

    Reports the total number of papers, how many have a non-blank abstract,
    the most frequent journals, and abstract length statistics.

    Args:
        papers: Sequence of dicts; the 'abstract' and 'journal' keys are read
            and both are optional.

    Returns:
        None. All results are printed.
    """
    if not papers:
        print("‚ùå No papers to analyze")
        return

    print("\nüìä PAPER COLLECTION ANALYSIS")
    print("="*50)

    # Only string abstracts are considered, so a null or non-string value
    # counts as "no abstract" instead of raising on .strip() / len().
    abstracts = [p.get('abstract') for p in papers]
    abstracts = [a for a in abstracts if isinstance(a, str)]

    # Basic stats
    total_papers = len(papers)
    with_abstracts = sum(1 for a in abstracts if a.strip())

    print(f"üìö Total papers: {total_papers}")
    print(f"üìù With abstracts: {with_abstracts}")
    print(f"üìà Success rate: {with_abstracts/total_papers*100:.1f}%")

    # Journal analysis (papers without a journal are left out of the ranking)
    journals = [p['journal'] for p in papers if p.get('journal')]
    if journals:
        journal_counts = pd.Series(journals).value_counts()
        print(f"\nüìñ Top journals:")
        for journal, count in journal_counts.head().items():
            print(f"   ‚Ä¢ {journal}: {count} papers")

    # Abstract length analysis over every non-empty abstract (raw length)
    abstract_lengths = [len(a) for a in abstracts if a]
    if abstract_lengths:
        print(f"\nüìè Abstract lengths:")
        print(f"   ‚Ä¢ Average: {np.mean(abstract_lengths):.0f} characters")
        print(f"   ‚Ä¢ Range: {min(abstract_lengths)} - {max(abstract_lengths)}")

    print("="*50)

# OPTIONAL: the triple-quoted string below is an inert string expression, so
# none of the code inside it runs. To load your own papers, delete the two
# `"""` delimiter lines and set the file path.
# NOTE(review): the snippet references EcologicalRAG and OPENAI_API_KEY, which
# are presumably defined in earlier cells — confirm before enabling it.
"""
print("üìÅ Loading your collected  papers...")

# Replace with your file path
your_papers = load_collected_papers('iolr_2022_abstracts_abstracts_only.json')

if your_papers:
    print(f"üîÑ Replacing sample data with {len(your_papers)} collected papers...")

    # Analyze the collection
    analyze_paper_collection(your_papers)

    # Create new RAG system with your papers
    rag_system = EcologicalRAG(openai_api_key=OPENAI_API_KEY)
    rag_system.load_papers(your_papers)

    print("‚úÖ Your papers loaded successfully!")
else:
    print("‚ö†Ô∏è Could not load your papers, continuing with sample data")
"""

# Status messages shown whether or not the optional snippet above is enabled.
print("üìã Ready to load your own papers!")
print("Uncomment the code above and set your file path")
print("üìã Next: Run Cell 11 for Gradio interface (optional) or Cell 12 for analytics")

In [None]:
# CELL 11: Gradio Web Interface (OPTIONAL)
# ========================================
"""
🎨 CELL 11: GRADIO WEB INTERFACE (OPTIONAL)
This cell creates a web-based interface using Gradio.
Only run this if Gradio was installed successfully.
"""

if GRADIO_AVAILABLE:

    def gradio_query(question, n_results=3):
        """Answer one question for the Gradio interface.

        Args:
            question: Text typed by the user.
            n_results: Number of papers to retrieve; converted with int()
                before it is passed to the RAG system.

        Returns:
            The generated response text, a prompt to enter a question when
            the input is empty, or an "Error: ..." string if the query raised.
        """
        # A missing (None) question is treated like a blank one instead of
        # raising AttributeError on .strip().
        if not question or not question.strip():
            return "Please enter a question about ecological research."

        try:
            result = rag_system.query(question, n_results=int(n_results))
            return result['response']
        except Exception as e:
            # Report the failure as text in the output box.
            return f"Error: {e}"

    def create_gradio_interface():
        """Build the Gradio interface: question box and result-count slider in,
        answer text box out."""

        # Example questions for the interface (one row per example, no repeats)
        examples = [
            ["Compare ResNet50 and VGG16 model accuracy?", 3],
            ["What prevents model overfitting generally?", 3],
            ["Which crop has 100% detection?", 3],
            ["List plant disease monitoring sensors?", 3],
        ]

        # Create interface
        interface = gr.Interface(
            fn=gradio_query,
            inputs=[
                gr.Textbox(
                    label="üîç Ask your ecological question",
                    placeholder="e.g., How do we detect plant disease using image detection?",
                    lines=2
                ),
                gr.Slider(
                    minimum=1,
                    maximum=5,
                    value=3,
                    step=1,
                    label="üìä Number of papers to search"
                )
            ],
            outputs=gr.Textbox(
                label="üìã Research-based Answer",
                lines=15
            ),
            title="üåä Ecological RAG System -  Research Assistant",
            description="""
            Ask questions about marine and freshwater ecology research!
            This system searches through research papers
            to provide evidence-based answers about home plants ecosystems.
            """,
            examples=examples,
            theme=gr.themes.Soft()
        )

        return interface

    print("üé® Creating Gradio web interface...")
    interface = create_gradio_interface()
    # share=True is passed to launch() exactly as in the original cell.
    interface.launch(share=True)

    print("üöÄ To launch web interface, run: interface.launch(share=True)")
    print("üì± This will open a new tab in your browser")

else:
    print("‚ö†Ô∏è Gradio not available. Use Cell 9 for simple interface instead.")

print("üìã Next: Run Cell 12 for analytics and evaluation")

In [None]:
# CELL 12: Analytics and Evaluation
# =================================
"""
📈 CELL 12: ANALYTICS AND EVALUATION
This cell provides tools to analyze and evaluate RAG system performance.
"""

class QueryAnalytics:
    """Record queries sent to a RAG system and summarise them.

    Each call to logged_query() appends one entry (timing, paper count,
    response length, raw result) to ``query_history``; the other methods
    aggregate that history.
    """

    # Topic label -> substrings counted in the logged questions. Terms are
    # lower-case because questions are lower-cased before counting.
    # Note: 'vgg16' is also a substring of 'mvgg16', so an MVGG16 mention
    # counts twice under DL_Models.
    _TOPIC_TERMS = {
        'DL_Models': ('resnet50', 'vgg16', 'mvgg16'),
        'Accuracy_Metrics': ('accuracy', '100%'),
        'Hardware_Deployment': ('pivot', 'sensors'),
        'Model_Training_Issues': ('overfitting',),
        'Study_Scope': ('mohanty', 'diseases', 'crop'),
        'Application_Interface': ('mobile app', 'functions'),
    }

    def __init__(self, rag_system):
        """Wrap *rag_system*, an object exposing ``query(question, n_results)``."""
        self.rag_system = rag_system
        self.query_history = []

    def logged_query(self, question, n_results=3):
        """Run a query through the RAG system and log its statistics.

        Args:
            question: Question text forwarded to ``rag_system.query``.
            n_results: Number of results requested from the RAG system.

        Returns:
            The result returned by ``rag_system.query``; its 'papers_found'
            and 'response' entries are read for the log.
        """
        start_time = time.time()
        result = self.rag_system.query(question, n_results)
        end_time = time.time()

        # Log the query
        log_entry = {
            'timestamp': time.time(),
            'question': question,
            'response_time': end_time - start_time,
            'papers_found': result['papers_found'],
            'response_length': len(result['response']),
            'result': result
        }

        self.query_history.append(log_entry)
        return result

    def get_analytics(self):
        """Summarise the logged queries.

        Returns:
            The string "No queries logged yet" when the history is empty,
            otherwise a dict with totals, averages, topic counts and the
            last five questions.
        """
        if not self.query_history:
            return "No queries logged yet"

        df = pd.DataFrame(self.query_history)

        analytics = {
            'total_queries': len(self.query_history),
            'avg_response_time': df['response_time'].mean(),
            'avg_papers_found': df['papers_found'].mean(),
            'avg_response_length': df['response_length'].mean(),
            'most_common_topics': self._extract_topics(),
            'recent_queries': df.tail(5)['question'].tolist()
        }

        return analytics

    def _extract_topics(self):
        """Count topic-term mentions across all logged questions.

        Returns:
            Dict mapping topic label to the number of term occurrences;
            topics with no occurrence are omitted.
        """
        # Join with a newline so a multi-word term cannot match across the
        # boundary between two different questions.
        corpus = '\n'.join(entry['question'].lower() for entry in self.query_history)

        topics = {}
        for topic, terms in self._TOPIC_TERMS.items():
            mentions = sum(corpus.count(term) for term in terms)
            if mentions > 0:
                topics[topic] = mentions

        return topics

    def print_analytics(self):
        """Print the summary from get_analytics() in a readable layout."""
        analytics = self.get_analytics()

        # get_analytics() returns a plain message when nothing was logged.
        if isinstance(analytics, str):
            print(analytics)
            return

        print("\nüìà RAG SYSTEM ANALYTICS")
        print("="*40)
        print(f"üîç Total queries: {analytics['total_queries']}")
        print(f"‚è±Ô∏è Avg response time: {analytics['avg_response_time']:.2f}s")
        print(f"üìö Avg papers found: {analytics['avg_papers_found']:.1f}")
        print(f"üìù Avg response length: {analytics['avg_response_length']:.0f} chars")

        if analytics['most_common_topics']:
            print(f"\nüè∑Ô∏è Common topics:")
            for topic, count in analytics['most_common_topics'].items():
                print(f"   ‚Ä¢ {topic}: {count} mentions")

        if analytics['recent_queries']:
            print(f"\nüïí Recent queries:")
            for i, query in enumerate(analytics['recent_queries'], 1):
                print(f"   {i}. {query[:60]}...")

# Initialize analytics: one shared tracker wrapping the notebook's `rag_system`
# (expected to exist from an earlier cell). test_system_performance() and
# demo_all_features() log their queries through this instance.
analytics = QueryAnalytics(rag_system)

def test_system_performance():
    """Send a fixed batch of benchmark questions through the shared analytics
    tracker, then print the aggregated statistics."""

    benchmark_questions = (
        "What is the final testing accuracy achieved by the ResNet50 model for 11-class plant disease?",
        "Which CNN models were identified as the best performers for detecting potato and tomato leaf diseases?",
        "How is the detected plant disease treated or managed by the actuators in the AI-IoT smart agriculture pivot system?",
        "Describe the key strategies implemented to solve the overfitting issue in the large CNN models used for multi-crop disease detection?",
    )
    total = len(benchmark_questions)

    print("üß™ Testing system performance with sample queries...")

    for position, question in enumerate(benchmark_questions, start=1):
        # Only the first 50 characters are echoed to keep the progress line short.
        preview = question[:50]
        print(f"[{position}/{total}] Testing: {preview}...")
        outcome = analytics.logged_query(question)
        papers_found = outcome['papers_found']
        print(f"   ‚úÖ Found {papers_found} papers")

    print("\nüìä Performance test completed!")
    analytics.print_analytics()

# Announce that the analytics helpers are defined and how to invoke them.
print(
    "‚úÖ Analytics system ready!",
    "üìä Run: test_system_performance() to test with sample queries",
    "üìà Run: analytics.print_analytics() to see current stats",
    "üìã Next: Run Cell 13 for advanced features",
    sep="\n",
)

In [None]:
# CELL 13: Advanced Query Features
# ================================
"""
🎯 CELL 13: ADVANCED QUERY FEATURES
This cell adds advanced features like query suggestions and filters.
"""

class AdvancedQuerySystem:
    """Query helper offering template-based suggestions and term detection.

    Suggestions are built by filling question templates with entries from a
    fixed term index; explain_query() reports which index terms occur in a
    question.
    """

    # Every {placeholder} must be a category key of the term index returned by
    # _build_term_index(); a placeholder with no matching key is never filled.
    _TEMPLATES = (
        "What accuracy does {DL_models} achieve for {crops} disease detection?",
        "How is {diseases} detected in {crops} leaves?",
        "Which {architectures} approaches are used to classify {diseases}?",
        "How does the {systems} support plant disease monitoring?",
        "How is {metrics} evaluated for {DL_models}?",
    )

    def __init__(self, rag_system):
        """Keep a reference to *rag_system* and build the term index."""
        self.rag_system = rag_system
        self.common_terms = self._build_term_index()

    def _build_term_index(self):
        """Return the fixed mapping of category name -> list of known terms."""

        terms = {
            'DL_models': ['ResNet50', 'VGG16', 'MVGG16', 'Xception', 'DenseNet121', 'InceptionV3', 'MobileNet'],
            'crops': ['potato', 'tomato', 'pepper bell', 'apple', 'corn', 'grape', 'peach', 'rice', 'habanero'],
            'diseases': ['Early Blight', 'Bacterial Spot', 'Late Blight', 'Yellow Leaf Curl Virus', 'Healthy'],
            'architectures': ['CNN', 'Deep Learning', 'Transfer Learning', 'Hybrid Models'],
            'systems': ['AI-IoT Pivot', 'Mobile Application', 'FAISS', 'TensorFlow Serving'],
            'metrics': ['accuracy', 'precision', 'recall', 'F1-score', 'loss', 'epoch']
        }

        return terms

    def suggest_queries(self, partial_query=""):
        """Suggest complete questions built from the templates.

        Args:
            partial_query: Optional text; when given, only suggestions that
                contain it (case-insensitively) are returned.

        Returns:
            Up to 10 unique suggestions, in a stable order (template order,
            then term-index order).
        """
        suggestions = []

        for template in self._TEMPLATES:
            for category, terms in self.common_terms.items():
                placeholder = '{' + category + '}'
                if placeholder not in template:
                    continue
                for term in terms[:2]:  # Limit to 2 terms per category
                    suggestion = template.replace(placeholder, term)
                    # Fill the remaining placeholders with each category's first term
                    for other, other_terms in self.common_terms.items():
                        suggestion = suggestion.replace('{' + other + '}', other_terms[0])
                    suggestions.append(suggestion)

        # Filter by partial query if provided
        if partial_query:
            needle = partial_query.lower()
            suggestions = [s for s in suggestions if needle in s.lower()]

        # dict.fromkeys removes duplicates while keeping first-seen order, so
        # the result does not vary between runs as a set would.
        return list(dict.fromkeys(suggestions))[:10]

    def _related_suggestions(self, found_terms):
        """Return suggestions mentioning any detected term, or the general
        suggestions when no term was detected or none mention one."""
        related = []
        for terms in found_terms.values():
            for term in terms:
                related.extend(self.suggest_queries(term))
        if not related:
            related = self.suggest_queries()
        return list(dict.fromkeys(related))

    def explain_query(self, question):
        """Print which index terms occur in *question* plus related suggestions.

        Args:
            question: Question text to analyse; matching is case-insensitive
                substring search.

        Returns:
            None. The analysis is printed.
        """
        print(f"üîç QUERY ANALYSIS: {question}")
        print("="*50)

        # Extract key terms; both sides are lower-cased so mixed-case index
        # terms such as 'ResNet50' can match.
        question_lower = question.lower()
        found_terms = {}

        for category, terms in self.common_terms.items():
            found = [term for term in terms if term.lower() in question_lower]
            if found:
                found_terms[category] = found

        if found_terms:
            print("üè∑Ô∏è Detected terms:")
            for category, terms in found_terms.items():
                print(f"   ‚Ä¢ {category.title()}: {', '.join(terms)}")

        # Suggest related queries based on the detected terms; filtering by
        # the whole question would almost never match a template.
        suggestions = self._related_suggestions(found_terms)
        if suggestions:
            print(f"\nüí° Related queries you might try:")
            for i, suggestion in enumerate(suggestions[:3], 1):
                print(f"   {i}. {suggestion}")

        print("="*50)

# Initialize advanced query system: one shared helper wrapping the notebook's
# `rag_system` (expected to exist from an earlier cell). It is used by
# interactive_query_builder() and demo_all_features().
advanced_query = AdvancedQuerySystem(rag_system)

def interactive_query_builder():
    """Run a small command loop for building and running queries.

    Commands (the command word is case-insensitive):
        help              - list the commands
        suggest           - print up to five query suggestions
        explain <query>   - show the term analysis for <query>
        query <question>  - send <question> to the RAG system
        quit              - leave the loop

    The loop also ends on Ctrl-C or when input reaches end-of-file. Any other
    error raised while handling a command is printed and the loop continues.

    Returns:
        None.
    """
    print("üéØ ADVANCED QUERY BUILDER")
    print("="*40)
    print("Type 'help' for commands, 'quit' to exit")

    while True:
        try:
            raw = input("\nüí¨ Command: ").strip()

            # Only the command word is lower-cased; the argument keeps the
            # user's original casing (e.g. 'ResNet50') for the RAG query.
            verb, _, argument = raw.partition(' ')
            verb = verb.lower()
            argument = argument.strip()

            if verb == 'quit' and not argument:
                break

            elif verb == 'help' and not argument:
                print("\nüìã Available commands:")
                print("   ‚Ä¢ suggest - Get query suggestions")
                print("   ‚Ä¢ explain <query> - Explain query processing")
                print("   ‚Ä¢ query <question> - Regular query")
                print("   ‚Ä¢ quit - Exit")

            elif verb == 'suggest' and not argument:
                suggestions = advanced_query.suggest_queries()
                print("\nüí° Query suggestions:")
                for i, suggestion in enumerate(suggestions[:5], 1):
                    print(f"   {i}. {suggestion}")

            elif verb == 'explain' and argument:
                advanced_query.explain_query(argument)

            elif verb == 'query' and argument:
                result = rag_system.query(argument)
                print("\n" + "="*50)
                print(result['response'])
                print("="*50)

            else:
                print("‚ùì Unknown command. Type 'help' for available commands.")

        except (KeyboardInterrupt, EOFError):
            # EOFError must end the loop: input() raises it on every call once
            # stdin is exhausted, so treating it as a normal error would spin
            # forever.
            break
        except Exception as e:
            print(f"‚ùå Error: {e}")

print("üéØ Advanced query features loaded!")
print("üí° Run: advanced_query.suggest_queries() for suggestions")
print("üîç Run: advanced_query.explain_query('your question') for analysis")
print("üé® Run: interactive_query_builder() for interactive interface")
print("üìã Next: Run Cell 14 for system summary")

In [None]:
# CELL 14: System Summary and Testing
# ===================================
"""
📋 CELL 14: SYSTEM SUMMARY AND TESTING
This cell provides a summary of the complete RAG system and quick tests.
"""

def print_system_summary():
    """Print an overview of the configured RAG system.

    Shows which backends are in use (based on the availability flags and
    ``rag_system.use_openai``), how many papers are loaded, the helper
    functions defined by the notebook, and a few example queries.

    Returns:
        None. Everything is printed.
    """
    print("üåä ECOLOGICAL RAG SYSTEM - COMPLETE SETUP")
    print("="*60)

    # System status
    print("üîß SYSTEM STATUS:")
    print(f"   ‚úÖ Vector Store: {'ChromaDB' if CHROMADB_AVAILABLE else 'Simple Store'}")
    print(f"   ‚úÖ Embeddings: {'Transformer' if TRANSFORMERS_AVAILABLE else 'TF-IDF'}")
    print(f"   ‚úÖ Generation: {'OpenAI GPT' if OPENAI_AVAILABLE and rag_system.use_openai else 'Template'}")
    print(f"   ‚úÖ Interface: {'Gradio' if GRADIO_AVAILABLE else 'Command Line'}")

    # Data status
    if hasattr(rag_system, 'collection'):
        try:
            paper_count = rag_system.collection.count()
        except Exception:
            # The collection could not report a count; fall back to the
            # in-memory paper list when there is one. `Exception` (not a bare
            # except) so Ctrl-C / SystemExit still propagate.
            paper_count = len(rag_system.papers) if hasattr(rag_system, 'papers') else 'Unknown'
        print(f"   ‚úÖ Papers Loaded: {paper_count}")

    # Available functions
    print("\nüõ†Ô∏è AVAILABLE FUNCTIONS:")
    print("   ‚Ä¢ rag_system.query(question) - Basic query")
    print("   ‚Ä¢ analytics.logged_query(question) - Query with analytics")
    print("   ‚Ä¢ advanced_query.suggest_queries() - Get suggestions")
    print("   ‚Ä¢ query_interface() - Simple text interface")
    print("   ‚Ä¢ interactive_query_builder() - Advanced interface")
    if GRADIO_AVAILABLE:
        print("   ‚Ä¢ interface.launch() - Web interface")

    # Example queries
    print("\nüí° EXAMPLE QUERIES:")
    examples = [
        "How do convolutional neural networks improve the accuracy of plant disease identification?",
        "What challenges arise when detecting crop diseases from images taken under real field conditions?",
        "How does transfer learning help classify soybean diseases using CNN models like AlexNet and GoogleNet?",
        "What molecular and serological techniques are used to identify plant pathogens before symptoms appear?",
        "How has deep learning transformed agricultural plant disease detection compared to traditional image-processing methods?"
    ]

    for i, example in enumerate(examples, 1):
        print(f"   {i}. {example}")

    print("\nüéØ QUICK START:")
    print("   1. query_interface() - Start asking questions")
    print("   2. test_system_performance() - Run performance tests")
    print("   3. analytics.print_analytics() - View analytics")

    print("="*60)
    print("üéâ Your Ecological RAG System is ready!")
    print("Happy researching! üåäüî¨üìä")

def quick_test():
    """Smoke-test the RAG system with five fixed questions.

    For every question the number of papers found, the response length and
    the retrieved paper titles are printed; any exception is reported and the
    run moves on to the next question.
    """
    print("\nüß™ QUICK SYSTEM TEST")
    print("-"*30)

    smoke_questions = (
        "How do variations in environmental conditions (lighting, backgrounds, plant architecture) impact the generalization ability of CNN models trained for plant disease identification?",
        "What are the main limitations of traditional color- and texture-based image-processing techniques when detecting plant diseases under uncontrolled field conditions?",
        "How does transfer learning with pretrained CNN architectures like AlexNet and GoogleNet improve the accuracy of soybean disease classification compared to conventional machine-learning methods?",
        "How do molecular diagnostic tools such as ELISA and FISH complement traditional visual inspection methods in accurately identifying plant pathogens?",
        "What key challenges must be overcome for deep learning models to fully replace traditional image-processing techniques in real-world agricultural disease detection?",
    )

    for number, question in enumerate(smoke_questions, start=1):
        print(f"\n[Test {number}] {question}")
        try:
            answer = rag_system.query(question, n_results=3)
            print(f"‚úÖ Success: Found {answer['papers_found']} papers")
            response_size = len(answer['response'])
            print(f"üìù Response length: {response_size} characters")
            has_hits = answer['papers_found'] > 0
            if not has_hits:
                print("üìù No paper titles available.")
            else:
                # Titles are read from the first metadata list of the search results.
                metadata_rows = answer['search_results']['metadatas'][0]
                joined_titles = ', '.join([row['title'] for row in metadata_rows])
                print(f"üìù Paper titles: {joined_titles}")
        except Exception as e:
            print(f"‚ùå Error: {e}")

    print("\n‚úÖ Quick test completed!")

def demo_all_features():
    """Exercise each feature once: basic query, logged query, query
    suggestions and query explanation.

    The demo questions use the plant-disease vocabulary of the term index so
    that the explanation step has terms to detect.

    Returns:
        None. Progress is printed.
    """
    print("üé¨ FULL SYSTEM DEMONSTRATION")
    print("="*50)

    # Test basic query
    print("\n1Ô∏è‚É£ BASIC QUERY TEST")
    result = rag_system.query("Which deep learning models are used for plant disease detection?")
    print(f"‚úÖ Found {result['papers_found']} papers")

    # Test analytics (the return value is not needed; the call logs the query)
    print("\n2Ô∏è‚É£ ANALYTICS TEST")
    analytics.logged_query("What accuracy does ResNet50 achieve for tomato leaf diseases?")
    print(f"‚úÖ Analytics logged: {len(analytics.query_history)} total queries")

    # Test suggestions
    print("\n3Ô∏è‚É£ QUERY SUGGESTIONS TEST")
    suggestions = advanced_query.suggest_queries()
    print(f"‚úÖ Generated {len(suggestions)} suggestions")
    for i, suggestion in enumerate(suggestions[:3], 1):
        print(f"   {i}. {suggestion}")

    # Test query explanation
    print("\n4Ô∏è‚É£ QUERY EXPLANATION TEST")
    advanced_query.explain_query("How is accuracy measured for potato and tomato disease detection?")

    print("\nüéâ All features working!")

# Show the system overview, then run the smoke test right away.
print_system_summary()

quick_test()

# List the test helpers the user can call next.
print(
    "\nüß™ AVAILABLE TESTS:",
    "   ‚Ä¢ quick_test() - Quick functionality test",
    "   ‚Ä¢ demo_all_features() - Full feature demonstration",
    "   ‚Ä¢ test_system_performance() - Comprehensive performance test",
    "\nüöÄ READY TO USE!",
    "Run any of the test functions or start with query_interface()",
    sep="\n",
)