In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

%load_ext autoreload
%autoreload 2

In [None]:
from collections import deque
import requests
from urllib.parse import urljoin
import time
import random

from bs4 import BeautifulSoup
import networkx as nx

In [None]:
from crawler import cache_page

In [None]:
# Lookup from an outcome key (True, False or "fail") to the status label
# that gets stored on a Node.
status = dict([
    (True, "class_true"),
    (False, "class_false"),
    ("fail", "req_failed"),
])

class Node:
    """A crawled page, identified by its URL.

    Two nodes compare equal (and hash alike) when their URLs match; ``status``
    takes no part in equality or hashing.

    Attributes:
        url: URL of the page.
        status: One of "class_true", "class_false", "req_failed".
    """

    def __init__(self, url, status):
        self.url = url
        self.status = status

    def __str__(self):
        """Return the full URL as the node's label."""
        # A shortened label (dropping the "https://www." prefix) used to sit
        # below this return but was unreachable; it also produced "" for any
        # URL without "www.", so the full URL stays the label.
        return self.url

    def __eq__(self, other):
        """Compare by URL; objects that are not Nodes are never equal."""
        # Check the type of `other` (not of `self`): testing
        # isinstance(self, other.__class__) is true for any plain object and
        # then fails on the missing `url` attribute.
        if isinstance(other, Node):
            return self.url == other.url
        # Let Python fall back to its default comparison (-> not equal).
        return NotImplemented

    def __hash__(self):
        # Must stay consistent with __eq__: equal URLs give equal hashes.
        return hash(self.url)

In [None]:
first_link = "https://www.coursera.org/"

# Queue of (url, depth, parent_node) tuples; the start page has no parent.
queue = deque()
queue.append((first_link, 0, None))

# Directed link graph filled in by the crawl, and the set of visited URLs.
G = nx.DiGraph()
seen_urls = set()

# Params
depth_limit = 3    # links are only followed from pages at depth < depth_limit
breadth_limit = 3  # at most this many outgoing links are kept per page

print("Running BFS with depth_limit = {} and breadth_limit = {}.".format(depth_limit, breadth_limit))

# Assuming download+sleep=1s per page. At most breadth_limit**d pages are
# visited at depth d, so the worst case is the sum over d = 0..depth_limit.
max_pages = sum(breadth_limit ** d for d in range(depth_limit + 1))
print("Estimated time: {}s".format(max_pages))
start_time = time.time()

# Seconds to wait for a response before the request counts as failed.
request_timeout = 10

# Breadth-first crawl: every queue entry is (url, depth, parent_node).
while queue:
    url, depth, parent_node = queue.popleft()

    print(url, depth, parent_node)

    # A URL can be queued several times before its first visit (several pages
    # link to it). Later occurrences only contribute their edge; the page is
    # not downloaded or expanded a second time.
    if url in seen_urls:
        if parent_node is not None:
            G.add_edge(parent_node.url, url)
        continue

    # Add url to seen_urls here, to avoid infinite loop (if the page links to itself)
    seen_urls.add(url)

    # Download the page. `r` is reset for every URL so a failed request can
    # never fall through to the previous page's response (or to an undefined
    # name on the very first request).
    r = None
    time.sleep(0.5)
    try:
        r = requests.get(url, timeout=request_timeout)
    except Exception:
        print("Exception while requesting {}".format(url))

    if r is None or r.status_code != 200:
        node = Node(url, status["fail"])
    else:
        node = Node(url, status[False])

        if depth < depth_limit:
            soup = BeautifulSoup(r.text, "lxml")

            # Outgoing links in absolute form; the "#fragment" part is dropped
            # because it points into the same document.
            out_urls = (
                urljoin(url, a.get('href', '')).split('#', 1)[0]
                for a in soup.find_all("a")
            )

            # Keep http(s) links only (mailto:, javascript:, ... cannot be
            # requested), remove already seen urls, and drop repeated links
            # within this page while preserving their order.
            out_urls = list(dict.fromkeys(
                out_url for out_url in out_urls
                if out_url.startswith(("http://", "https://"))
                and out_url not in seen_urls
            ))

            # Keeping only <breadth_limit> out_urls
            if len(out_urls) > breadth_limit:
                out_urls = random.sample(out_urls, breadth_limit)

            queue.extend((out_url, depth + 1, node) for out_url in out_urls)

    # Graph nodes are keyed by URL string in every case (edges use the same
    # key); the outcome is kept as a node attribute.
    G.add_node(node.url, status=node.status)

    if parent_node is not None:
        G.add_edge(parent_node.url, node.url)

print("Done in {}s".format(time.time() - start_time))

In [None]:
import mpld3

# Turn on mpld3's notebook integration before anything is drawn.
mpld3.enable_notebook()

# Draw the crawl graph with a spring layout, labelling each node.
nx.draw(G, pos=nx.spring_layout(G), with_labels=True)