In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_context('notebook')

%load_ext autoreload
%autoreload 2

In [None]:
from collections import deque
import requests
from urllib.parse import urljoin
import time
import random

from bs4 import BeautifulSoup
import networkx as nx

In [None]:
from helpers import cache_page, is_cached, get_cached, url_to_filename
from features import extract_visible

In [None]:
# ``sklearn.externals.joblib`` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Prefer the standalone ``joblib`` package and only fall back to the
# vendored copy on old installations that do not ship it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib
import os

# Load the persisted classification pipeline used by the crawler below.
# NOTE: joblib.load is pickle-based and can execute arbitrary code while
# loading -- only point this at model files you trust.
pipeline_path = os.path.join('saved', 'models', 'log_reg_pipeline_general3.pkl')
pipeline = joblib.load(pipeline_path)

In [None]:
# Maps a classifier label (True / False) or the "fail" sentinel to the status
# string stored on a Node.
status = {
    True: "class_true",
    False: "class_false",
    "fail": "req_failed",
}


class Node:
    """A crawled page, used as a vertex of the crawl graph.

    Two nodes are equal (and hash alike) when they have the same ``url``;
    ``status`` and ``decision_func`` do not take part in the comparison.

    Attributes:
        url: URL of the page, as given to the constructor.
        status: one of "class_true", "class_false", "req_failed".
        decision_func: value stored alongside the status by the caller.
    """

    def __init__(self, url, status, decision_func):
        self.url = url
        self.status = status
        self.decision_func = decision_func

    def __str__(self):
        """Return the URL without its scheme (e.g. "http://"), for display."""
        # partition() only cuts at the first "://", so a "://" occurring later
        # in the URL (e.g. inside a query string) is preserved. A URL without
        # any scheme is returned unchanged instead of as an empty string.
        _, separator, remainder = self.url.partition('://')
        return remainder if separator else self.url

    def __eq__(self, other):
        """Compare by URL; defer to the other operand if it is not a Node."""
        if isinstance(other, Node):
            return self.url == other.url
        # Returning NotImplemented lets Python fall back to its default
        # comparison, so ``node == <non-Node>`` is False rather than raising
        # AttributeError on the missing ``url`` attribute.
        return NotImplemented

    def __hash__(self):
        # Must stay consistent with __eq__: equal URLs give equal hashes.
        return hash(self.url)

In [None]:
first_link = "http://online-learning.harvard.edu"

# Queue containing: (url, depth, parent_node); parent_node is None for the root
queue = deque()
queue.append((first_link, 0, None))

G = nx.DiGraph()
# URLs that have already been crawled ...
seen_urls = set()
# ... and the graph node that was created for each of them
nodes_by_url = {}

# Params
depth_limit = 6
breadth_limit = 3
# Seconds to wait for a server; without a timeout requests.get can block forever
request_timeout = 10

print("Running BFS with depth_limit = {} and breadth_limit = {}.".format(depth_limit, breadth_limit))

# Assuming download+sleep=1s per page. A full tree with branching factor b has
# 1 + b + b^2 + ... + b^depth_limit pages, which bounds the crawl from above.
max_pages = sum(breadth_limit ** level for level in range(depth_limit + 1))
print("Estimated time: {}s".format(max_pages))
start_time = time.time()

# NOTE(review): `cache_page` is imported by this notebook but never called
# here, so this loop never populates the cache -- confirm whether downloaded
# pages are meant to be stored.
while queue:
    url, depth, parent_node = queue.popleft()

    if url in nodes_by_url:
        # Several pages can enqueue the same link before it is crawled for
        # the first time. Only record the additional edge instead of
        # fetching, classifying and expanding the page a second time.
        if parent_node is not None:
            G.add_edge(parent_node, nodes_by_url[url])
        continue

    print(url, depth)

    # Add url to seen_urls here, to avoid infinite loop (if the page links to itself)
    seen_urls.add(url)

    # Fetch from cache or download the page, then parse it. The soup is built
    # for both paths; it must never be left over from a previous iteration.
    soup = None
    try:
        if is_cached(url):
            # presumably the cached value is the page's HTML text -- verify
            # against helpers.get_cached
            text = get_cached(url)
            status_code = 200
        else:
            # Sleep to avoid getting banned
            time.sleep(0.5)

            r = requests.get(url, timeout=request_timeout)
            status_code = r.status_code
            text = r.text

        if status_code == 200:
            soup = BeautifulSoup(text, "lxml")
    except Exception as e:
        # Best-effort crawl: a page that cannot be fetched or parsed becomes
        # a "failed" node rather than aborting the whole run.
        print("Exception while requesting / parsing {}".format(url))
        print(e)
        status_code = -1

    if status_code != 200:
        node = Node(url, status["fail"], 0)
    else:
        visible_text = extract_visible(soup)

        # Predict label & get strength of prediction
        label = pipeline.predict([visible_text])[0]
        decision_func = pipeline.decision_function([visible_text])[0]

        node = Node(url, status[label], decision_func)

    nodes_by_url[url] = node
    G.add_node(node)

    if soup is not None and depth < depth_limit:
        # Get outgoing urls in their absolute form
        out_urls = (urljoin(url, a.get('href', '')) for a in soup.find_all("a"))

        # Remove already seen urls; dict.fromkeys also drops links repeated on
        # the page (keeping first-seen order) so that a duplicate cannot take
        # up more than one of the breadth slots.
        out_urls = list(dict.fromkeys(
            out_url for out_url in out_urls if out_url not in seen_urls
        ))

        # Keeping only <breadth_limit> out_urls
        if len(out_urls) > breadth_limit:
            out_urls = random.sample(out_urls, breadth_limit)

        queue.extend((out_url, depth + 1, node) for out_url in out_urls)

    if parent_node is not None:
        G.add_edge(parent_node, node)

print("Done in {}s".format(time.time() - start_time))

In [None]:
import mpld3
mpld3.enable_notebook()

# Per-status display settings: (log line template, node colour).
# Any status not listed here (i.e. a download error) is drawn grey and not logged.
display_by_status = {
    status[True]: ("TRUE: {} {}", 'green'),
    status[False]: ("FALSE: {} {}", 'red'),
}

# One colour per node, in the graph's own iteration order (the order the
# drawing call below walks the nodes in).
colors = []
for node in G:
    template, color = display_by_status.get(node.status, (None, 'grey'))
    if template is not None:
        print(template.format(node.decision_func, node))
    colors.append(color)

fig, ax = plt.subplots(figsize=(15, 10))
nx.draw_kamada_kawai(G, ax=ax, node_color=colors, with_labels=False, node_size=50)

In [None]:
import pickle

# Persist the crawl graph. `nx.write_gpickle` was removed in networkx 3.0; it
# was a thin wrapper around pickle.dump, so using pickle directly writes the
# same kind of file on both old and new networkx versions.
graph_dir = os.path.join('saved', 'graphs')
# Create the target directory on first use instead of failing in open().
os.makedirs(graph_dir, exist_ok=True)

# NOTE(review): the ':' characters in the file name are not valid on Windows
# file systems; the name is kept unchanged so existing readers still find it.
graph_path = os.path.join(graph_dir, "{}-depth:{}-breadth:{}.pkl"
    .format(url_to_filename(first_link), depth_limit, breadth_limit))

with open(graph_path, "wb") as graph_file:
    pickle.dump(G, graph_file, pickle.HIGHEST_PROTOCOL)