In [1]:
import re
from collections import Counter, deque
from typing import Iterator
from urllib import parse

import requests
from bs4 import BeautifulSoup, SoupStrainer

import index_database as db

In [2]:
# Seed URL: the crawl below starts from this page and follows the links it finds.
base_address = 'https://hadoop.apache.org/docs/stable/index.html'

In [3]:
def get_links_in_document_from_response(response : requests.Response) -> Iterator[str]:
    """Lazily yield the link targets found in the HTML body of ``response``.

    Only ``<a>`` elements are parsed. Elements without an ``href`` attribute
    are skipped; every other ``href`` is resolved against ``response.url``
    with ``urllib.parse.urljoin`` before being yielded.
    """
    page_url = response.url
    # The strainer restricts parsing to <a> tags instead of building the whole tree.
    anchors = BeautifulSoup(response.text, 'html.parser', parse_only=SoupStrainer('a'))
    for anchor in anchors:
        if not anchor.has_attr('href'):
            continue
        yield parse.urljoin(page_url, anchor['href'])
    
def get_text_in_document_from_response(response : requests.Response) -> str:
    """Return the text of the response body as extracted by BeautifulSoup's ``get_text()``."""
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup.get_text()

In [None]:
def index_document_from_response(response : requests.Response):
    """Record a downloaded page's word counts and outgoing links through ``db``.

    Steps, in order:
      1. Extract the page text, lower-case every run of ASCII letters and
         count the occurrences of each resulting term.
      2. Get or create the ``db.Urls`` row for ``response.url`` and set its
         ``is_indexed`` flag.
      3. For each term, get or create its ``db.Words`` row and the matching
         ``db.WordsReferences`` row for this page, then store the count.
      4. For each link on the page, get or create its ``db.Urls`` row and a
         ``db.References`` row pointing from this page to it.
    """
    print("Indexing\t", response.url)
    page_text = get_text_in_document_from_response(response)
    term_counts = Counter(token.lower() for token in re.findall('([a-zA-Z]+)', page_text))
    outgoing_links = list(get_links_in_document_from_response(response))

    page_row, _ = db.Urls.get_or_create(url = response.url)
    page_row.is_indexed = True
    page_row.save()

    for term, occurrences in term_counts.items():
        term_row, _ = db.Words.get_or_create(term = term)
        posting, _ = db.WordsReferences.get_or_create(referenced_word = term_row, page_url = page_row)
        # Overwrite the stored count so re-indexing a page refreshes it.
        posting.count = occurrences
        posting.save()

    for target in outgoing_links:
        target_row, _ = db.Urls.get_or_create(url = target)
        db.References.get_or_create(reference_from = page_row, reference_to = target_row)
    


In [5]:
def is_html(url : str, timeout : float = 10) -> bool:
    """Report whether ``url`` serves an HTML document, using a HEAD request.

    Args:
        url: Address to probe.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        True when the ``Content-Type`` header of the (redirect-followed)
        response starts with ``text/html``. False when the header is missing,
        names another type, or the request itself fails (unsupported scheme
        such as ``mailto:``, connection error, timeout, ...).
    """
    try:
        # requests.head() does not follow redirects by default; follow them so
        # the type checked is that of the page a later GET would actually fetch.
        head = requests.head(url, allow_redirects=True, timeout=timeout)
    except requests.RequestException:
        # A URL that cannot be probed cannot be confirmed as HTML.
        return False
    # Media types are case-insensitive; a missing header counts as "not HTML".
    content_type = head.headers.get('Content-Type', '')
    return content_type.lower().startswith('text/html')

In [6]:
def enumerate_page_and_references(base_url : str, max_pages : int = 20, timeout : float = 10) -> Iterator[requests.Response]:
    """Crawl breadth-first from ``base_url`` and yield each downloaded HTML page.

    Links found on every downloaded page are appended to the queue. A URL is
    processed at most once, URLs that ``is_html`` rejects are skipped, and
    pages that fail to download or parse are reported and skipped instead of
    being yielded.

    Args:
        base_url: Address the crawl starts from.
        max_pages: Maximum number of HTML pages to attempt to download.
        timeout: Seconds to wait for each GET request.

    Yields:
        The ``requests.Response`` of every successfully downloaded page.
    """
    attempts = 0
    visited = set()
    queue = deque([base_url])  # deque gives O(1) pops from the front

    while queue:
        current_url = queue.popleft()
        if current_url in visited:
            continue
        # Mark before probing so a non-HTML or broken URL is only checked once.
        visited.add(current_url)

        try:
            if not is_html(current_url):
                continue
        except requests.RequestException:
            # e.g. mailto:/javascript: links or unreachable hosts must not abort the crawl.
            print("Failed to download", current_url)
            continue

        attempts += 1
        if attempts > max_pages:
            break

        try:
            response = requests.get(current_url, timeout=timeout)
            new_links = list(get_links_in_document_from_response(response))
        except Exception:
            # Best effort: report and move on. Nothing is yielded for this URL,
            # so a stale or undefined response never reaches the caller.
            print("Failed to download", current_url)
            continue

        # After redirects the final address may differ; avoid fetching it again.
        visited.add(response.url)
        queue.extend(new_links)
        yield response
 

   

In [7]:
# The generator is lazy: pages are downloaded one at a time as the loop asks for them.
responses = enumerate_page_and_references(base_address)
db.connect()

# Close the database even if crawling or indexing raises part-way through.
try:
    for response in responses:
        index_document_from_response(response)
finally:
    db.close()

Indexing	 https://hadoop.apache.org/docs/stable/index.html
Indexing	 https://cwiki.apache.org/confluence/display/hadoop
Indexing	 https://github.com/apache/hadoop
Indexing	 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/SingleCluster.html
Indexing	 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/ClusterSetup.html
Indexing	 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/CommandsManual.html
Indexing	 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/FileSystemShell.html
Indexing	 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/Compatibility.html
Indexing	 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/DownstreamDev.html
Indexing	 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/AdminCompatibilityGuide.html
Indexing	 https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-common/InterfaceClassification.html
Indexin