#### 1. Install mpld3 for interactive graph results

In [9]:
!pip install mpld3



#### 2. Import relevant modules

In [10]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import networkx as nx
import pandas as pd
import numpy as np

import threading
import requests
import random
import urllib
import mpld3
import time
import re

from collections import defaultdict, deque
from urllib.parse import urlparse
from bs4 import BeautifulSoup
from threading import Lock
from pprint import pprint
from mpld3 import plugins



#### Set constant parameters

In [11]:
# File name the interactive figure is written to by mpld3.save_html().
saved_html_name = 'figure1.html'

# Pattern of the data items collected from every crawled page (e-mail addresses).
# Written as a raw string: the previous non-raw literal depended on the invalid
# escape sequence '\.', which triggers a DeprecationWarning/SyntaxWarning on
# current Python versions. The resulting pattern text is unchanged.
data_regex = r'[a-zA-Z0-9-_.]+@[a-zA-Z0-9-_.]+\.[a-z]{2,3}'
# Label used for the collected items in legends, titles and printed summaries.
data_name = 'emails'

# Upper bound on the number of worker threads crawl_urls() starts.
MAX_THREDS = 4

# Style sheet handed to the mpld3 HTML tooltips of the interactive plot.
css = """
    table
    {
    border-collapse: collapse;
    }
    th
    {
    color: #ffffff;
    background-color: #000000;
    }
    td
    {
    background-color: #cccccc;
    }
    table, th, td
    {
    font-family:Arial, Helvetica, sans-serif;
    border: 1px solid black;
    text-align: left;
    }
    """


class crw_Data:
    """Colour scheme shared by the URL graph, the domain graph and their legends."""

    # --- base palette -------------------------------------------------------
    main_color = "red"
    link_color = "#eb9234"  # orange
    empty_color = "blue"
    data_color = "limegreen"

    edge_urls_color = "black"
    edge_data_color = "limegreen"

    # --- node attributes of the URL-level (knowledge) graph -----------------
    main_url_attrs = dict(color=main_color)             # start URL
    urls_attrs = dict(color=link_color)                 # URL reached through a link
    empty_urls_attrs = dict(color=empty_color)          # URL whose page had no links
    data_attrs = dict(color=data_color)                 # collected data item (e-mail)

    # --- edge attributes (used by both graphs) -------------------------------
    edge_urls_attrs = dict(color=edge_urls_color)       # edge pointing to a URL / domain
    edge_data_attrs = dict(color=edge_data_color)       # edge pointing to a data item

    # --- node attributes of the domain-level graph ---------------------------
    main_url_attrs_domain = dict(color=main_color)      # domain of the start URL
    urls_attrs_domain = dict(color=link_color)          # domain reached through a link
    empty_urls_attrs_domain = dict(color=empty_color)   # domain without links
    data_attrs_domain = dict(color=data_color)          # collected data item (e-mail)

    # --- legend label -> colour, in display order ----------------------------
    legends = dict([('Main url', main_color),
                    ('Normal url', link_color),
                    ('Empty url', empty_color),
                    (data_name, data_color)])
    legends_domain = dict([('Main domain', main_color),
                           ('Normal domain', link_color),
                           (data_name, data_color)])

#### 3. Define Crawler class

In [12]:
class CrawlingObject:
    """Breadth-first crawler that collects regex matches starting from one URL.

    While crawling it builds two undirected graphs: ``knowlage_graph`` links
    pages to the pages and data items found on them, ``domain_graph`` does the
    same at domain granularity. Several crawlers can share one visited-URL
    registry (see ``set_visited_urls_dict``) so a page is processed only once.

    Parameters
    ----------
    crawl_name : str
        Label used in progress messages and as key of the result dictionaries.
    main_url : str
        URL the crawl starts from.
    max_length : int
        Maximum number of new links queued from a single page; the number of
        anchors read from a page is capped at ``10 * max_length``.
    max_urls : int
        Budget of URLs queued during one crawl (stored as ``MAX_URLS``).
    request_timeout : float
        Timeout in seconds passed to ``requests.get``; without it a stalled
        server would block the crawling thread indefinitely.
    """

    def __init__(self, crawl_name, main_url, max_length=4, max_urls=100, request_timeout=10):
        self.crawl_name = crawl_name
        self.main_url = main_url
        self.max_length = max_length
        self.MAX_URLS = max_urls
        self.request_timeout = request_timeout

        self.knowlage_graph = nx.Graph()
        self.domain_graph = nx.Graph()

        # Drawing attributes, taken from the shared colour scheme.
        self.main_url_attrs = crw_Data.main_url_attrs
        self.urls_attrs = crw_Data.urls_attrs
        self.empty_urls_attrs = crw_Data.empty_urls_attrs
        self.data_attrs = crw_Data.data_attrs

        self.edge_urls_attrs = crw_Data.edge_urls_attrs
        self.edge_data_attrs = crw_Data.edge_data_attrs

        self.main_url_attrs_domain = crw_Data.main_url_attrs_domain
        self.urls_attrs_domain = crw_Data.urls_attrs_domain
        self.empty_urls_attrs_domain = crw_Data.empty_urls_attrs_domain
        self.data_attrs_domain = crw_Data.data_attrs_domain

        self.legends = crw_Data.legends
        self.legends_domain = crw_Data.legends_domain

        self.DATA_REGEX = data_regex
        self.datas = set()                  # every data item found by this crawler
        self.visited_urls = set()           # URLs this crawler processed itself
        self.datas_dict = defaultdict(lambda: defaultdict(lambda: set()))  # domain -> url -> items
        self.All_viseted_URLS_2 = None      # shared registry: domain -> set of visited URLs
        self.locking = None                 # lock guarding the shared registry
        self.time = 0                       # duration of the last crawl() in seconds

    def set_visited_urls_dict(self, set_visited_urls_dict, locking):
        """Attach the visited-URL registry and the lock that guards it."""
        self.All_viseted_URLS_2 = set_visited_urls_dict
        self.locking = locking

    def __repr__(self):
        return f'crawler {self.crawl_name} (main_url={self.main_url}, ' \
               f'max_length={self.max_length}, max_urls={self.MAX_URLS})'

    def extract_data(self, response, shuffle=False):
        """Return ``(anchor tags, data items, status code)`` extracted from ``response``.

        At most ``10 * max_length`` anchors carrying an ``href`` are read; they
        are shuffled in place when ``shuffle`` is true. Data items are the
        unique case-insensitive matches of ``DATA_REGEX`` in the response text.
        """
        doc = BeautifulSoup(response.text, "html.parser")
        url_links = doc.find_all('a', href=True, limit=10 * self.max_length)

        if shuffle:
            random.shuffle(url_links)

        urls_datas = set(re.findall(self.DATA_REGEX, response.text, re.I))
        return list(url_links), list(urls_datas), response.status_code

    def add_datas_to_graphs(self, url, datas):
        """Record every item of ``datas`` as found on ``url`` in both graphs and in ``datas_dict``."""
        domain = domain_from(url)
        for dat in datas:
            self.datas.add(dat)
            self.knowlage_graph.add_node(dat, **self.data_attrs)
            self.knowlage_graph.add_edge(url, dat, **self.edge_data_attrs)

            self.domain_graph.add_node(dat, **self.data_attrs_domain)
            self.domain_graph.add_edge(domain, dat, **self.edge_data_attrs)

            self.datas_dict[domain][url].add(dat)

    def add_urls_to_graphs(self, curr_url, new_url):
        """Add the link ``curr_url -> new_url``; the domain graph only gets cross-domain links."""
        curr_domain = domain_from(curr_url)
        new_domain = domain_from(new_url)
        self.knowlage_graph.add_node(new_url, **self.urls_attrs)
        self.knowlage_graph.add_edge(curr_url, new_url, **self.edge_urls_attrs)
        if new_domain != curr_domain:
            self.domain_graph.add_node(new_domain, **self.urls_attrs_domain)
            self.domain_graph.add_edge(curr_domain, new_domain, **self.edge_urls_attrs)

    @staticmethod
    def _is_crawlable(url):
        """Return True for http(s) URLs that do not contain '.pdf'.

        The scheme is checked as a prefix: a plain substring test would also
        accept e.g. ``javascript:`` or ``mailto:`` links that merely contain
        the text "http".
        """
        return url.lower().startswith(('http://', 'https://')) and '.pdf' not in url

    def _claim_url(self, domain, url):
        """Atomically mark ``url`` as visited; return False if it already was.

        The membership test and the insertion happen under the same lock so two
        crawlers can never both claim the same URL.
        """
        with self.locking:
            seen = self.All_viseted_URLS_2.setdefault(domain, set())
            if url in seen:
                return False
            seen.add(url)
            return True

    def BFS(self):
        """Crawl breadth-first from ``main_url`` until the queue is empty or the budgets are used up."""
        q = deque([self.main_url])
        total_urls = 1
        while q:
            first_url = q.popleft()
            try:
                response = requests.get(first_url, timeout=self.request_timeout)
            except Exception:
                # Best effort: an unreadable page is reported and its slot in
                # the URL budget is given back (KeyboardInterrupt/SystemExit
                # are no longer swallowed, unlike with a bare ``except``).
                print(f'fail read {first_url}')
                total_urls -= 1
                continue

            domain = domain_from(first_url)
            hyper_links, new_datas, _status_code = self.extract_data(response)

            if not self._claim_url(domain, first_url):
                continue  # already handled, possibly by another crawler

            self.add_datas_to_graphs(first_url, new_datas)
            self.visited_urls.add(first_url)
            print(f'{self.crawl_name} go to : {first_url} ({len(self.visited_urls)}/{self.MAX_URLS})')

            add_urls = 0
            for link in hyper_links:
                # ``>=`` instead of ``==`` so a budget below the current count
                # still stops the loop.
                if total_urls >= self.MAX_URLS or add_urls >= self.max_length:
                    break
                try:
                    new_url = urllib.parse.urljoin(first_url, link['href'])
                except ValueError:
                    continue  # malformed href (e.g. broken IPv6 literal)
                if not self._is_crawlable(new_url):
                    continue
                if new_url not in self.knowlage_graph.nodes():
                    total_urls += 1
                    add_urls += 1
                    q.append(new_url)
                    self.add_urls_to_graphs(first_url, new_url)

            if not hyper_links:
                # Page without any anchors: recolour it as an "empty" URL.
                self.knowlage_graph.add_node(first_url, **self.empty_urls_attrs)

    def crawl(self):
        """Run the crawl from ``main_url`` and store its duration in ``self.time``."""
        # Stand-alone use: fall back to a private registry/lock when none was
        # attached through set_visited_urls_dict().
        if self.All_viseted_URLS_2 is None:
            self.All_viseted_URLS_2 = defaultdict(lambda: set())
        if self.locking is None:
            self.locking = Lock()

        start_time = time.time()
        self.knowlage_graph.add_node(self.main_url, **self.main_url_attrs)
        self.domain_graph.add_node(domain_from(self.main_url), **self.main_url_attrs_domain)
        self.BFS()
        end_time = time.time()
        self.time = end_time - start_time


#### 4. Define utils for Crawler class 

In [13]:
def set_crawlers(urls, max_links=4, max_urls=40):
    """Build one ``CrawlingObject`` per start URL.

    ``urls`` may be a list of URLs or a single value, which is treated as a
    one-element list. Crawlers are named ``'crawl 1'``, ``'crawl 2'``, ... in
    input order; ``max_links`` is forwarded as ``max_length``.
    """
    if not isinstance(urls, list):
        urls = [urls]

    built = []
    for position, start_url in enumerate(urls, start=1):
        built.append(CrawlingObject(crawl_name=f'crawl {position}', main_url=start_url,
                                    max_length=max_links, max_urls=max_urls))
    return built

def crawl_urls(crawlers, thred=1, timeout='100s'):
    """Run the given crawlers on worker threads and collect what they found.

    Parameters
    ----------
    crawlers : list
        Crawler objects as produced by ``set_crawlers``.
    thred : int
        Requested number of worker threads; clamped to the range
        ``1 .. min(len(crawlers), MAX_THREDS)``.
    timeout : str
        Overall waiting budget, in the format understood by ``to_seconds``
        (e.g. ``'100s'``, ``'2m'``).

    Returns
    -------
    tuple
        ``(crawlers, all_crawls_dict, all_datas)`` where the last two come
        from ``get_datas``.

    Notes
    -----
    Worker threads are daemons. When the timeout expires they are not
    stopped, so crawler state may still change while results are collected.
    """
    num_urls = len(crawlers)
    if num_urls == 0:
        # Nothing to do; previously this crashed in np.array_split / crawlers[0].
        print('\nNo urls to crawl\n')
        return crawlers, {}, []

    timeout_val = to_seconds(timeout)
    thred = max(1, min(thred, num_urls, MAX_THREDS))
    # Contiguous, near-equal slices of crawler indices, one slice per thread.
    subset_index_urls = [a.tolist() for a in np.array_split(np.arange(num_urls).astype(int), thred)]

    All_viseted_URLS = defaultdict(lambda: set())
    data_lock = Lock()

    # Attach the shared registry up front so every crawler exposes it, even
    # one whose turn never comes before the timeout.
    for crw in crawlers:
        crw.set_visited_urls_dict(All_viseted_URLS, data_lock)

    def subset_crawl(subset_idx_urls):
        # Crawlers assigned to one thread run one after the other.
        for idx in subset_idx_urls:
            crawlers[idx].crawl()

    crwlers_threds = [threading.Thread(target=subset_crawl, args=(idxs,), daemon=True)
                      for idxs in subset_index_urls]
    print(f'\nStart crawling url\'s\n\n'
          f'Number of urls : {num_urls}\n'
          f'Number of threds : {thred}\n'
          f'Maximum urls to crawl (per url) : {crawlers[0].MAX_URLS}\n'
          f'Maximum open links for each url : {crawlers[0].max_length}\n'
          f'Maximum time for all url\'s : {string_time(timeout_val)}\n')

    start_time = time.time()
    for thr in crwlers_threds:
        thr.start()

    # One shared deadline for all threads. Splitting the budget as
    # timeout / num_urls per join made e.g. a single thread handling four
    # URLs wait only a quarter of the advertised timeout.
    deadline = time.monotonic() + timeout_val
    for thr in crwlers_threds:
        thr.join(max(0.0, deadline - time.monotonic()))
    end_time = time.time()

    all_datas, all_crawls_dict = get_datas(crawlers)
    # data_name is already plural; the former '{data_name}s' printed "emailss".
    print(f'\ntotal time : {string_time(end_time - start_time)}'
          f'\ntotal {data_name} found : {len(all_datas)}\n')
    pprint(All_viseted_URLS)
    return crawlers, all_crawls_dict, all_datas

def get_datas(crawlers):
    """Merge the findings of several crawlers.

    Returns ``(items, per_crawler)``: ``items`` is a list of the distinct data
    items over all crawlers (order unspecified) and ``per_crawler`` maps each
    crawler's ``crawl_name`` to its ``datas_dict``.
    """
    merged_items = set()
    per_crawler = {}
    for crawler in crawlers:
        per_crawler[crawler.crawl_name] = crawler.datas_dict
        merged_items.update(crawler.datas)
    return list(merged_items), per_crawler


def get_the_best(crws, k=5):
    """Rank URLs by the number of data items found on them.

    An edge of a crawler's ``knowlage_graph`` counts when exactly one of its
    end points looks like a data item (matches ``data_regex`` at its start);
    the other end point is the URL that gets the point.

    Parameters
    ----------
    crws : iterable
        Crawlers whose ``knowlage_graph`` is inspected.
    k : int
        Number of top entries kept overall and per domain.

    Returns
    -------
    tuple
        ``(top_urls, top_urls_per_domain)``: a list of ``(url, count)`` pairs
        sorted by descending count, and a dict mapping each domain to such a
        list.
    """
    # re.I mirrors the flag used when the items were extracted; without it an
    # item such as 'Info@Example.COM' was not recognised here and its edges
    # were silently ignored.
    pattern = re.compile(data_regex, re.I)
    url_scores = defaultdict(int)
    domain_scores = defaultdict(lambda: defaultdict(int))

    for crw in crws:
        for v_1, v_2 in crw.knowlage_graph.edges():
            is_data_1 = bool(pattern.match(v_1))
            is_data_2 = bool(pattern.match(v_2))
            if is_data_1 == is_data_2:
                continue  # url-url (or item-item) edge: nothing to count
            url = v_2 if is_data_1 else v_1
            url_scores[url] += 1
            domain_scores[domain_from(url)][url] += 1

    def top_k(scores):
        # Stable sort: ties keep their insertion order.
        return sorted(scores.items(), key=lambda item: -item[1])[:k]

    most_imported_urls = top_k(url_scores)
    most_imported_urls_per_domain = {domain: top_k(scores) for domain, scores in domain_scores.items()}
    return most_imported_urls, most_imported_urls_per_domain

#### 5. Define more util functions

In [14]:
def read_file(path):
    """Read start URLs from a text file, one per line.

    Surrounding whitespace is stripped. Blank lines are skipped: previously
    they were returned as empty strings and ended up as unusable crawler
    start URLs.
    """
    URLS = []
    with open(file=path, mode='r') as f:
        for line in f:
            url = line.strip()
            if url:
                URLS.append(url)
    return URLS


def domain_from(url):
    """Return the network-location component of ``url`` as parsed by ``urlparse``.

    The value is returned unmodified, so sub-domains such as ``www.`` are kept.
    """
    return urlparse(url).netloc


def check_valid_url(url, timeout=10):
    """Return True when a GET request to ``url`` answers with a 2xx status.

    Parameters
    ----------
    url : str
        Address to probe.
    timeout : float
        Seconds passed to ``requests.get`` as its timeout, so an unresponsive
        server cannot block the caller forever.

    Returns
    -------
    bool
        False for non-2xx answers and for URLs that cannot be requested at
        all (malformed URL, connection error, timeout); such failures used to
        propagate as exceptions.
    """
    try:
        status_code = requests.get(url, timeout=timeout).status_code
    except requests.RequestException:
        return False
    return status_code // 100 == 2


def to_seconds(time_string):
    """Convert a duration such as ``'100s'``, ``'2m'`` or ``'1.5h'`` to seconds.

    The last character is the unit (``s``, ``m``, ``h``, ``d`` or ``w``), the
    rest is parsed with ``float``.

    Returns
    -------
    float
        The duration in seconds.

    Raises
    ------
    ValueError
        If the numeric part cannot be parsed.
    KeyError
        If the unit is not one of the supported letters.
    """
    seconds_per_unit = {'s': 1,
                        'm': 60,
                        'h': 60 * 60,
                        'd': 60 * 60 * 24,
                        'w': 60 * 60 * 24 * 7}  # was 60 * 60 * 27 * 7: a day has 24 hours
    T, units = float(time_string[:-1]), time_string[-1]
    return seconds_per_unit[units] * T


def string_time(T):
    """Format a number of seconds as e.g. ``'1h 2m 5s '``.

    ``T`` is truncated to an integer. Weeks, days, hours and minutes are shown
    only when non-zero, from largest to smallest unit; seconds are always
    shown. Every part is followed by a single space.

    Days and weeks previously carried an ``h`` suffix and were appended after
    the seconds.
    """
    total_seconds = int(T)
    minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(minutes, 60)
    days, hours = divmod(hours, 24)
    weeks, days = divmod(days, 7)

    parts = [f'{value}{unit} '
             for value, unit in ((weeks, 'w'), (days, 'd'), (hours, 'h'), (minutes, 'm'))
             if value]
    parts.append(f'{seconds}s ')
    return ''.join(parts)


def print_results(data, with_text=True):
    """Pretty-print every element of ``data``, each followed by a blank block.

    With ``with_text`` the elements are preceded by fixed headings, in this
    order: visited URLs, per-crawler findings, collected items, top URLs and
    top URLs per domain. Elements beyond the fifth are printed without a
    heading; they used to raise ``IndexError``.
    """
    text = []
    if with_text:
        text = ['\nAll visited urls :\n',
                '\nAll crawls urls :\n',
                f'\nAll {data_name} :\n',
                '\nMost importent urls :\n',
                '\nMost importent urls per domain :\n']
    for i, dat in enumerate(data):
        if i < len(text):
            print(text[i])
        pprint(dat)
        print('\n')

#### 6. Define interactive plot

In [15]:
def beutiful_plot(crawlers, node_size=100, edge_width=2, circle_size=40, figsize=(20, 40)):
    """Draw the URL graph and the domain graph of every crawler with hover tooltips.

    One row of two axes is created per crawler (left: URL graph, right: domain
    graph). Each node is overlaid with a transparent marker that carries an
    mpld3 HTML tooltip showing the node's name.

    Parameters
    ----------
    crawlers : list or single crawler
        Crawlers to plot; a single object is treated as a one-element list.
    node_size, edge_width : numbers
        Forwarded to the networkx drawing call (``node_size`` / ``width``).
    circle_size : number
        Marker size of the tooltip overlay.
    figsize : tuple
        Size of the whole matplotlib figure.

    Returns
    -------
    matplotlib.figure.Figure
        The figure with the tooltip plugins connected.
    """
    crawlers = crawlers if isinstance(crawlers, list) else [crawlers]
    fig, ax = plt.subplots(len(crawlers), 2, figsize=figsize)
    TYPE = ['URL', 'Domain']
    max_title_url = 60  # longer start URLs are shortened in the title
    for row, crw in enumerate(crawlers):
        # Truncate and test against the same length; the former mix of 60
        # and 50 appended '...' to URLs that were shown in full.
        if len(crw.main_url) > max_title_url:
            shown_url = f'{crw.main_url[:max_title_url]}...'
        else:
            shown_url = crw.main_url
        text = [shown_url, domain_from(crw.main_url)]
        Graphs = [crw.knowlage_graph, crw.domain_graph]

        # Materialise the colours as lists (one entry per node / edge).
        attrs = [{'node_color': list(nx.get_node_attributes(G, 'color').values()),
                  'edge_color': list(nx.get_edge_attributes(G, 'color').values())} for G in Graphs]

        handles = [[mpatches.Patch(label=name, color=colour) for name, colour in lgd.items()]
                   for lgd in [crw.legends, crw.legends_domain]]

        for col, Gra in enumerate(Graphs):
            Ax = ax[col] if len(crawlers) == 1 else ax[row, col]

            # Compute the Kamada-Kawai layout once and reuse it for both the
            # drawing and the tooltip markers (it used to be computed twice).
            positions = nx.kamada_kawai_layout(Gra)
            nx.draw(Gra, pos=positions, **attrs[col], node_size=node_size, width=edge_width,
                    ax=Ax, with_labels=False)
            Ax.set_title(f'crawl of {TYPE[col]} : {text[col]} \n crawl time : {crw.time:.2f} (seconds),    {data_name} found : {len(crw.datas)}')
            Ax.grid(alpha=0.25)
            Ax.legend(handles=handles[col], fontsize=14)
            Ax.axis('on')

            points_x = []
            points_y = []
            dict_data = {'data': []}
            for node, (x, y) in positions.items():
                dict_data['data'].append(node)
                points_x.append(x)
                points_y.append(y)

            # One single-cell HTML table per node, used as tooltip content.
            df = pd.DataFrame(dict_data, columns=['data'])
            labels = []
            for node_idx in range(len(df)):
                label = df.iloc[[node_idx], :].T
                label.columns = [f'link {node_idx + 1}']
                labels.append(str(label.to_html()))

            points = Ax.plot(points_x, points_y, 'o', color='b', mec='k', ms=circle_size, mew=1, alpha=0.1, mfc='none')
            tooltip = plugins.PointHTMLTooltip(points[0], labels, voffset=10, hoffset=10, css=css)
            plugins.connect(fig, tooltip)
    return fig

#### 7. Set URLs and run the code

In [16]:
# Start pages; set_crawlers() creates one crawler per entry.
URLS = ["https://www.wikipedia.org/",
        "https://www.walla.co.il/",
        "https://www.mako.co.il/tv",
        "https://www.kan.org.il/page.aspx?landingPageId=1039"]


# Queue at most 10 new links per visited page and at most 50 URLs per crawler.
crawlers = set_crawlers(URLS, max_links=10, max_urls=50)
# Crawl on up to 4 threads; '2m' is converted to seconds by to_seconds().
crawlers, all_crawls_dict, all_emails = crawl_urls(crawlers, thred=4, timeout='2m')
# Keep the 5 URLs (overall and per domain) connected to the most collected items.
imported_urls, imported_urls_per_domain = get_the_best(crawlers, k=5)

crawl 1
crawl 2
crawl 3
crawl 4
number of urls : 4
number of threds : 4
max urls to crawl (per url) : 50
max open links for each url : 10
max time for all url's : 2m 0s 

crawl 1 go to : https://www.wikipedia.org/ (1/50)
crawl 2 go to : https://www.walla.co.il/ (1/50)
crawl 1 go to : https://en.wikipedia.org/ (2/50)
crawl 4 go to : https://www.kan.org.il/page.aspx?landingPageId=1039 (1/50)
crawl 1 go to : https://ru.wikipedia.org/ (3/50)
crawl 2 go to : https://www.walla.co.il (2/50)
crawl 3 go to : https://www.mako.co.il/tv (1/50)
crawl 1 go to : https://ja.wikipedia.org/ (4/50)
crawl 3 go to : https://www.mako.co.il?partner=headerlogo (2/50)
crawl 1 go to : https://de.wikipedia.org/ (5/50)
crawl 4 go to : https://www.kan.org.il/page.aspx?landingPageId=1039#mainContent (2/50)
crawl 1 go to : https://es.wikipedia.org/ (6/50)
crawl 3 go to : https://www.facebook.com/keshet.mako/ (3/50)
crawl 1 go to : https://fr.wikipedia.org/ (7/50)
crawl 4 go to : https://www.kan.org.il/ (3/50)
crawl 

#### 8. Print results

In [20]:
# Print, each under its heading: the shared visited-URL registry, the per-crawler findings,
# all collected e-mails, and the two URL rankings.
print_results([crawlers[0].All_viseted_URLS_2, all_crawls_dict, all_emails, imported_urls, imported_urls_per_domain], with_text=True)


All visited urls :

defaultdict(<function crawl_urls.<locals>.<lambda> at 0x7ff773f35f80>,
            {'ar-ar.facebook.com': {'https://ar-ar.facebook.com/login/?next=https%3A%2F%2Fwww.facebook.com%2Fkeshet.mako%2F'},
             'b.walla.co.il': {'https://b.walla.co.il/'},
             'beauty.walla.co.il': {'https://beauty.walla.co.il/'},
             'calendar.walla.co.il': {'https://calendar.walla.co.il/'},
             'cars.walla.co.il': {'https://cars.walla.co.il/'},
             'celebs.walla.co.il': {'https://celebs.walla.co.il/'},
             'corona.mako.co.il': {'https://corona.mako.co.il/?partner=NewsNavBar'},
             'de-de.facebook.com': {'https://de-de.facebook.com/login/?next=https%3A%2F%2Fwww.facebook.com%2Fkeshet.mako%2F'},
             'de.wikipedia.org': {'https://de.wikipedia.org/',
                                  'https://de.wikipedia.org/#artikel',
                                  'https://de.wikipedia.org/#ereignisse',
                               

#### 9. Plot and display results

In [18]:
# Build the figure (one row of URL/domain graphs per crawler) and show it through mpld3.
fig = beutiful_plot(crawlers, node_size=50, edge_width=2, circle_size=30, figsize=(20, 40))
mpld3.display()

#### 10. Save figure in HTML format

In [19]:
# Write the interactive figure to the file named by saved_html_name.
mpld3.save_html(fig, saved_html_name)