<a href="https://colab.research.google.com/github/wadeaT/Cloud-Computing-Course/blob/main/index.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install requests beautifulsoup4



In [3]:
from bs4 import BeautifulSoup
from nltk.stem import PorterStemmer
from collections import defaultdict
import requests
import json
import nltk
import re
nltk.download('punkt')

class IndexCreator:
    #Class Initialization
    def __init__(self, start_url, firebase_url, max_pages=50):
        self.start_url = start_url # starting URL to proccess
        self.firebase_url = firebase_url.rstrip('/') + '/'  # the firebase URL
        self.max_pages = max_pages  # maximum number of pages to process
        self.index = defaultdict(dict) #for the index to automatically handle new terms
        self.processed_pages = 0 # number of proccessed pages, start at 0
        self.visited_urls = set() #visited URLS

        # Patterns for URLs we want to follow
        self.relevant_patterns = [
            # Core Compute Products
            r'/products/compute',
            r'/compute',  # General compute pages
            r'/kubernetes-engine',  # Google Kubernetes Engine (GKE)
            r'/container-optimized-os',
            r'/run',  # Cloud Run
            r'/functions',  # Cloud Functions
            r'/app-engine',  # App Engine

            # Compute Documentation
            r'/compute/docs',  # All compute docs
            r'/compute/docs/concepts',
            r'/compute/docs/instances',
            r'/compute/docs/disks',
            r'/compute/docs/networking',
            r'/compute/docs/security',
            r'/compute/docs/regions-zones',
            r'/compute/docs/quickstarts',
            r'/compute/docs/tutorials',
            r'/compute/docs/containers',

            # Container and Kubernetes related
            r'/kubernetes-engine/docs',
            r'/container-registry/docs',
            r'/containers',

            # Serverless Computing
            r'/serverless',
            r'/run/docs',  # Cloud Run documentation
            r'/functions/docs',  # Cloud Functions documentation
            r'/app-engine/docs',  # App Engine documentation

            # Infrastructure and VMs
            r'/vpc/docs',  # Virtual Private Cloud
            r'/virtual-machines',
            r'/vmware-engine',

            # Pricing and Planning
            r'/compute/pricing',
            r'/compute/docs/machine-types',
            r'/compute/docs/cpu-platforms',

            # Solutions and Reference
            r'/solutions/compute',
            r'/architecture/compute',
            r'/docs/cloud-computing'
        ]

    #checks if that pattern appears anywhere in the URL string
    #Returns True/False for each check
    def is_relevant_url(self, url):
        """Check if URL matches our relevant patterns"""
        return any(pattern in url for pattern in self.relevant_patterns)

    def extract_content(self, url):
        """Extract meaningful content from a page"""
        try:
            #makes an HTTP request to get the webpage content
            response = requests.get(url)
            #Uses BeautifulSoup to parse the HTML into a soup object for easy navigation
            soup = BeautifulSoup(response.text, 'html.parser')

            # Remove navigation, footer, etc.
            for elem in soup.find_all(['nav', 'footer', 'script', 'style']):
                elem.decompose()

            #extracting main content
            content_sections = []
            # Get main content
            main_content = soup.find('main')
            if main_content:
                content_sections.append(main_content.get_text())

            # Get article content
            article = soup.find('article')
            if article:
                content_sections.append(article.get_text())

            # Get section content
            sections = soup.find_all('section')
            for section in sections:
                content_sections.append(section.get_text())

            return ' '.join(content_sections)
        except Exception as e:
            print(f"Error extracting content from {url}: {e}")
            return ""

    def get_links(self, url):
        """Get relevant links from a page"""
        try:
            response = requests.get(url)
            soup = BeautifulSoup(response.text, 'html.parser')
            links = set() #Using a set to avoid duplicate links

            #finds all <a> tags that have an href attribute
            for link in soup.find_all('a', href=True):
                href = link['href'] #Gets the href value (the URL)
                full_url = ''

                if href.startswith('http'):
                    if 'cloud.google.com' in href:
                        full_url = href
                elif href.startswith('/'):
                    full_url = f"https://cloud.google.com{href}"

                if full_url and self.is_relevant_url(full_url):
                    links.add(full_url)

            return list(links)
        except Exception as e:
            print(f"Error getting links from {url}: {e}")
            return []

    def process_text(self, text):
        """Process text into stemmed words"""
        stop_words = {'ha','thi','skip','-','&', 'me', 'my', 'myself', 'we', 'our', 'ours',
                     'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your',
                     'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself',
                     'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its'}

        words = []
        for word in text.split():
            word = word.lower()
            word = ''.join(c for c in word if c.isalnum())
            if word and word not in stop_words and len(word) > 2:
                words.append(word)
        #Uses Porter Stemming algorithm to reduce words to their root form
        stemmer = PorterStemmer()
        return [stemmer.stem(word) for word in words]

    def create_index(self):
        """Create inverted index from relevant pages"""
        queue = [self.start_url]
        doc_id = 1

        while queue and self.processed_pages < self.max_pages:
            url = queue.pop(0)

            if url in self.visited_urls:
                continue

            print(f"Processing page {self.processed_pages + 1}/{self.max_pages}: {url}")

            # Extract and process content
            content = self.extract_content(url)
            if content:
                words = self.process_text(content)

                # Count word occurrences
                word_counts = defaultdict(int)
                for word in words:
                    word_counts[word] += 1

                # Update index
                for word, count in word_counts.items():
                    self.index[word][str(doc_id)] = {
                        "url": url,
                        "counter": count
                    }

                # Mark as processed
                self.visited_urls.add(url)
                self.processed_pages += 1
                doc_id += 1

                # Get new links to process
                if self.processed_pages < self.max_pages:
                    new_links = self.get_links(url)
                    queue.extend([link for link in new_links if link not in self.visited_urls])

        print(f"\nIndexing complete!")
        print(f"Pages processed: {self.processed_pages}")
        print(f"Unique terms indexed: {len(self.index)}")

    def create_final_data(self):
        """Format index for database storage"""
        return [{
            'term': word,
            'DocId': docs
        } for word, docs in self.index.items()]

    def upload_to_firebase(self):
        """Upload index to Firebase using REST API"""
        print("\nUploading to Firebase...")

        data = self.create_final_data()
        index_url = f"{self.firebase_url}index.json"

        try:
            response = requests.put(index_url, json=data)
            response.raise_for_status()
            print("Upload complete! Data stored in Firebase.")

        except requests.exceptions.RequestException as e:
            print(f"Error uploading to Firebase: {e}")

# Usage
url = "https://cloud.google.com/products/compute"
firebase_url = "https://turtlenimbus-d033a-default-rtdb.firebaseio.com/"

# Create index with max 50 pages
indexer = IndexCreator(url, firebase_url, max_pages=50)
indexer.create_index()
indexer.upload_to_firebase()

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Processing page 1/50: https://cloud.google.com/products/compute
Processing page 2/50: https://cloud.google.com/compute/docs/regions-zones/viewing-regions-zones
Processing page 3/50: https://cloud.google.com/kubernetes-engine/docs/tutorials/hello-app
Processing page 4/50: https://cloud.google.com/compute/docs/projects
Processing page 5/50: https://cloud.google.com/compute/docs/general-purpose-machines#c3_series
Processing page 6/50: https://cloud.google.com/compute/docs/general-purpose-machines#n1_machines
Processing page 7/50: https://cloud.google.com/kubernetes-engine/
Processing page 8/50: https://cloud.google.com/compute/docs/storage-optimized-machines#z3_series
Processing page 9/50: https://cloud.google.com/kubernetes-engine/pricing/
Processing page 10/50: https://cloud.google.com/products/compute/
Processing page 11/50: https://cloud.google.com/compute/docs/images/create-delete-deprecate-private-images
Processing page 12/50: https://cloud.google.com/products/compute#common-uses
Pr

In [2]:
def get_index_from_db():
    """Retrieve index from Firebase database"""
    firebase_url = "https://turtlenimbus-d033a-default-rtdb.firebaseio.com/"
    index_url = f"{firebase_url}index.json"

    try:
        # Get data from Firebase
        response = requests.get(index_url)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Convert the list of dictionaries into the format your code expects
        index_data = response.json()
        index = defaultdict(dict)

        # Reconstruct the index structure
        for entry in index_data:
            term = entry['term']
            docs = entry['DocId']
            index[term] = docs

        print("Successfully retrieved index from database")
        return index

    except requests.exceptions.RequestException as e:
        print(f"Error retrieving index from Firebase: {e}")
        return None

In [None]:
# Add these imports at the top of index.ipynb
from firebase_admin import credentials, firestore
import firebase_admin

def initialize_firebase():
    """Initialize Firebase connection"""
    try:
        # Check if already initialized
        if not firebase_admin._apps:
            cred = credentials.Certificate({
                "type": "service_account",
                "project_id": "turtlenimbus-d033a",
                # Add other credential details from your Firebase service account JSON
            })
            firebase_admin.initialize_app(cred)
        return firestore.client()
    except Exception as e:
        print(f"Error initializing Firebase: {e}")
        return None

# Initialize Firebase when notebook runs
db = initialize_firebase()