<a href="https://colab.research.google.com/github/ykhier/Cloud-Computing-Cat/blob/main/INDEX.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import re
from collections import defaultdict
from nltk.stem import PorterStemmer

# Firebase Realtime Database (RTDB) REST endpoint where the inverted index is stored.
# upload_to_firebase() issues a PUT against this URL, which replaces the whole
# index node on every run rather than merging into it.
# NOTE(review): the constant is named "PLANET" while the node is
# "plant_disease_index" -- looks like a typo in the name; confirm before renaming,
# since upload_to_firebase() reads this constant.
FIREBASE_URL_PLANET = "https://purrform-a5da9-default-rtdb.firebaseio.com/plant_disease_index.json"

def fetch_page(url):
    """Fetch a page and parse it with BeautifulSoup.

    Args:
        url: Address of the page to download.

    Returns:
        A BeautifulSoup object built with the "html.parser" backend when the
        server answers with HTTP 200, otherwise None. Failures are reported on
        stdout but never raised, so one broken URL does not stop the pipeline.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    # Only the network call sits inside the try: request-level problems
    # (DNS, timeout, connection reset, malformed URL) are expected and tolerated.
    try:
        response = requests.get(url, headers=headers, timeout=15)
    except requests.RequestException as e:
        print(f"Fetch error for {url}: {e}")
        return None

    if response.status_code != 200:
        print(f"Fetch failed for {url} with status: {response.status_code}")
        return None

    return BeautifulSoup(response.text, "html.parser")

def extract_content(soup):
    """Collect Abstract and Introduction text from a parsed page (best-effort).

    Every selector in both groups is tried. For each selector, the text of the
    first matching element that is non-empty after stripping is kept; the
    remaining matches of that selector are ignored.

    Args:
        soup: Parsed page exposing ``select(css_selector)``.

    Returns:
        The collected fragments, each preceded by a single space, or an empty
        string when nothing matched.
    """
    selector_groups = (
        # Common patterns for the abstract section
        ('div[id*="Abs"]', 'div.abstract', 'section.Abstract'),
        # Common patterns for the introduction section
        ('div[id*="Sec1"]', 'div.introduction', 'section.Introduction'),
    )

    fragments = []
    for group in selector_groups:
        for css in group:
            # Lazy generator: stop calling get_text once a non-empty hit is found
            texts = (node.get_text(strip=True) for node in soup.select(css))
            first_hit = next((candidate for candidate in texts if candidate), None)
            if first_hit is not None:
                fragments.append(first_hit)

    return "".join(" " + fragment for fragment in fragments)

def get_stop_words():
    """Build the compact stop-word set used to reduce noise in keyword extraction.

    Returns:
        A new ``set`` of lowercase words on every call.
    """
    # Basic English (30 words)
    basic_english = (
        "the and for with that this are was were from "
        "have has had but not can may will would could "
        "they them their these those such very much many some"
    )

    # Academic/Research terms (15 words)
    academic_terms = (
        "study paper research method methods result results analysis "
        "show use used based data system approaches"
    )

    # Generic descriptors (10 words)
    generic_descriptors = "new different important high low large small good better modern"

    return set(" ".join((basic_english, academic_terms, generic_descriptors)).split())

def clean_text(text):
    """Turn raw text into a list of filtered, stemmed terms.

    The text is lowercased and split into purely alphabetic tokens of at least
    three letters. Stop-words are dropped, and the rest are stemmed with the
    Porter stemmer.

    Args:
        text: Raw text; a falsy value (``""`` or ``None``) yields ``[]``.

    Returns:
        The surviving terms in their original order, duplicates included.
    """
    if not text:
        return []

    stop_words = get_stop_words()
    stem = PorterStemmer().stem

    terms = []
    for token in re.findall(r'\b[a-zA-Z]{3,}\b', text.lower()):
        # First pass: drop stop-words before stemming
        if token in stop_words:
            continue

        root = stem(token)
        if len(root) < 3:
            # The stem is too short to be meaningful: keep the original word
            terms.append(token)
        elif root not in stop_words:
            # Second pass: a stem that collapses onto a stop-word is dropped
            terms.append(root)

    return terms

def build_word_graph(words, window_size=4):
    """Build a weighted co-occurrence graph over a token sequence.

    Each token is linked to the tokens at most ``window_size`` positions before
    or after it. Every co-occurrence adds ``1 / distance`` to the edge weight,
    so closer words contribute more than far ones.

    Args:
        words: Ordered list of tokens.
        window_size: Maximum distance, in positions, between linked tokens.

    Returns:
        Nested defaultdict mapping ``word -> neighbor -> accumulated weight``.
    """
    graph = defaultdict(lambda: defaultdict(float))
    total = len(words)

    for center, word in enumerate(words):
        first = max(0, center - window_size)
        stop = min(total, center + window_size + 1)

        # Left neighbors first, then right ones, skipping the word itself
        for position in [*range(first, center), *range(center + 1, stop)]:
            graph[word][words[position]] += 1.0 / abs(center - position)

    return graph

def calculate_textrank_scores(graph, damping=0.85, iterations=20):
    """Compute TextRank scores over a weighted word graph by power iteration.

    Args:
        graph: Mapping ``word -> {neighbor: edge weight}``. It is only read,
            never modified, so both plain dicts and defaultdicts are accepted.
        damping: Share of a node's score that comes from its neighbors; the
            remaining ``1 - damping`` is the random-jump base score.
        iterations: Fixed number of update rounds.

    Returns:
        Dict mapping every key of ``graph`` to its score, or ``{}`` when the
        graph is empty.
    """
    nodes = list(graph)
    if not nodes:
        return {}

    # Total outgoing weight of each node is loop-invariant, so compute it once
    # instead of re-summing it for every edge in every iteration.
    outgoing_weight = {node: sum(graph[node].values()) for node in nodes}

    # Start all nodes with the same score
    scores = dict.fromkeys(nodes, 1.0)
    base_score = 1.0 - damping

    for _ in range(iterations):
        updated = {}

        for node in nodes:
            rank = base_score

            for neighbor, strength in graph[node].items():
                # A neighbor that is not itself a node of the graph (or has no
                # outgoing weight) contributes nothing. Using .get avoids both
                # a KeyError on plain dicts and silently inserting keys into a
                # defaultdict input.
                neighbor_total = outgoing_weight.get(neighbor, 0.0)
                if neighbor_total > 0:
                    contribution = (strength / neighbor_total) * scores[neighbor]
                    rank += damping * contribution

            updated[node] = rank

        scores = updated

    return scores

def count_word_frequencies_per_document(documents):
    """Count how often each cleaned term occurs in each document.

    Args:
        documents: Iterable of raw document texts. Documents are numbered
            from 1 in iteration order.

    Returns:
        Nested defaultdict mapping ``term -> {doc_id: count}``.
    """
    term_counts = defaultdict(lambda: defaultdict(int))

    doc_number = 0
    for raw_text in documents:
        doc_number += 1
        for term in clean_text(raw_text):
            per_document = term_counts[term]
            per_document[doc_number] = per_document[doc_number] + 1

    return term_counts

def create_firebase_structure(top_keywords, word_doc_counts):
    """Shape the selected keywords into the RTDB inverted-index layout.

    Args:
        top_keywords: Keywords to include, in output order.
        word_doc_counts: Mapping ``term -> {doc_id: count}``; it is indexed
            with every keyword.

    Returns:
        Dict mapping ``keyword -> {"DocsIds": {str(doc_id): count}, "term": keyword}``.
    """
    return {
        term: {
            # Doc ids become strings because RTDB keys are strings
            "DocsIds": {
                str(doc): occurrences
                for doc, occurrences in word_doc_counts[term].items()
            },
            "term": term,
        }
        for term in top_keywords
    }

def upload_to_firebase(data, timeout=30):
    """Upload the entire inverted index to Firebase RTDB (overwrites the node).

    Args:
        data: JSON-serializable index, sent as the body of a PUT request to
            ``FIREBASE_URL_PLANET``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` would wait indefinitely on a stalled
            connection.

    Returns:
        True when the server answers with status 200 or 201, False on any
        other status or when the request raises.
    """
    try:
        response = requests.put(FIREBASE_URL_PLANET, json=data, timeout=timeout)
    except Exception as e:
        # Best-effort upload: report the problem instead of aborting the run
        print(f"Upload error: {e}")
        return False

    if response.status_code in [200, 201]:
        print("Successfully uploaded to Firebase")
        return True

    print(f"Upload failed with status: {response.status_code}")
    return False

def extract_textrank_keywords(documents, top_k=30):
    """Extract the top keywords from the whole corpus using TextRank.

    Args:
        documents: List of raw document texts; they are joined with spaces and
            cleaned as a single token stream.
        top_k: Maximum number of keywords to return.

    Returns:
        Up to ``top_k`` distinct terms. With at least 10 tokens they are
        ordered by descending TextRank score; with fewer tokens, the distinct
        tokens are returned in order of first appearance.
    """
    words = clean_text(" ".join(documents))

    # Too few tokens for a meaningful graph: return what we have, but without
    # repeats, so that the result is a keyword list like the TextRank path.
    if len(words) < 10:
        return list(dict.fromkeys(words))[:top_k]

    scores = calculate_textrank_scores(build_word_graph(words))

    # sorted() is stable, so equally scored terms keep their graph order
    ranked_terms = sorted(scores, key=scores.get, reverse=True)
    return ranked_terms[:top_k]

def run_textrank_with_firebase():
    """Run TextRank on multiple papers, print results, and upload the inverted index to Firebase.

    Returns:
        None when no document could be fetched and extracted. Otherwise a dict
        with the keys 'keywords', 'documents', 'firebase_data',
        'word_doc_counts' and 'upload_success' (the boolean returned by
        upload_to_firebase).
    """
    top_k = 30
    urls = [
        "https://link.springer.com/article/10.1007/s13593-014-0246-1",
        "https://link.springer.com/chapter/10.1007/978-981-15-6315-7_23",
        "https://link.springer.com/chapter/10.1007/978-981-15-5959-4_11",
        "https://link.springer.com/chapter/10.1007/978-981-15-2774-6_5",
        "https://link.springer.com/chapter/10.1007/978-981-15-6315-7_24",
    ]

    # Fetch each URL and extract only the parts we care about.
    # NOTE: doc ids are 1-based positions in `documents`, not in `urls`, so a
    # skipped URL shifts the ids of the papers after it -- hence the report.
    documents = []
    for url in urls:
        soup = fetch_page(url)
        content = extract_content(soup) if soup else ""
        if content:
            documents.append(content)
        else:
            print(f"Skipped (no content extracted): {url}")

    if not documents:
        print("No documents processed successfully")
        return None

    print(f"Total documents processed: {len(documents)}")

    keywords = extract_textrank_keywords(documents, top_k=top_k)
    word_doc_counts = count_word_frequencies_per_document(documents)
    firebase_data = create_firebase_structure(keywords, word_doc_counts)

    print(f"\nTOP {top_k} TEXTRANK KEYWORDS:")
    print("=" * 40)
    for rank, keyword in enumerate(keywords, 1):
        print(f"{rank:2d}. {keyword}")

        # Print how the keyword is distributed across docs
        doc_counts = word_doc_counts[keyword]
        if doc_counts:
            docs_info = [f"Doc{doc_id}:{count}" for doc_id, count in sorted(doc_counts.items())]
            print(f"     {', '.join(docs_info)}")

    # Keep the upload outcome so callers can tell a failed upload from a good one
    success = upload_to_firebase(firebase_data)

    return {
        'keywords': keywords,
        'documents': documents,
        'firebase_data': firebase_data,
        'word_doc_counts': word_doc_counts,
        'upload_success': success,
    }

if __name__ == "__main__":
    # Run the whole pipeline; a falsy result means no document could be processed.
    results = run_textrank_with_firebase()

    print(
        "\nTextRank extraction and Firebase upload completed"
        if results
        else "\nExtraction failed"
    )


Total documents processed: 5

TOP 30 TEXTRANK KEYWORDS:
 1. diseas
     Doc1:6, Doc2:6
 2. detect
     Doc1:4, Doc2:2, Doc4:1, Doc5:3
 3. monitor
     Doc1:1, Doc4:2, Doc5:6
 4. flood
     Doc5:9
 5. techniqu
     Doc1:3, Doc2:1, Doc4:2, Doc5:2
 6. imag
     Doc2:2, Doc5:6
 7. plant
     Doc1:4, Doc2:4
 8. technolog
     Doc1:1, Doc3:7
 9. commun
     Doc3:8
10. iot
     Doc3:8
11. sens
     Doc1:2, Doc4:3, Doc5:2
12. learn
     Doc2:2, Doc5:4
13. develop
     Doc1:1, Doc3:4
14. product
     Doc2:3, Doc3:1
15. infect
     Doc1:4
16. network
     Doc2:3, Doc5:1
17. help
     Doc1:2, Doc2:1, Doc5:1
18. disast
     Doc2:1, Doc5:3
19. serolog
     Doc1:4
20. deep
     Doc5:4
21. process
     Doc1:1, Doc4:2
22. agricultur
     Doc1:2, Doc2:1
23. day
     Doc1:1, Doc4:1, Doc5:1
24. innov
     Doc1:2, Doc3:1
25. optic
     Doc4:3
26. highli
     Doc2:2, Doc4:1
27. machin
     Doc2:3
28. leaf
     Doc2:3
29. india
     Doc2:1, Doc5:2
30. crop
     Doc1:1, Doc2:2
Successfully uploaded to Fireba