In [1]:
import os
import re
import numpy as np
import json
from urllib.parse import unquote, urljoin, urlparse, urlsplit, urlunsplit

from bs4 import BeautifulSoup

# Root directory that is walked for crawled pages; the sample record shown further
# down has the layout html/<host>/<path>/index.html.
HTML_DIR = "html"
# Output: one page URL per line; line i is the URL of graph node i.
OUT_URL_MAP = "urlmap.txt"
# Output: one line per node holding the comma-separated indices of the nodes it links to.
OUT_WEB_GRAPH = "webgraph.txt"
# Output: one PageRank score per line, in node order.
OUT_PAGERANK_SCORE = 'page_scores.txt'
# Output: JSON object mapping page URL -> PageRank score.
OUT_PAGERANK_MAPPER = 'page_rank_mapper.json'

# Page records (dicts with "url", "base_path", "abs_path") filled in by the directory walk.
pages = []
# URLs of `pages` in node order; rebuilt after de-duplication.
all_urls = []
# File extensions that filters_urls() accepts as pages.
whitelist_file_types = ['html', 'htm']
# Domain that filters_urls() requires a link's hostname to match.
whitelist_domain = 'ku.ac.th'

## Web graph

In [2]:
# Discover crawled pages: every directory below HTML_DIR that contains an
# index.html becomes one page record.
for dirpath, dirnames, filenames in os.walk(HTML_DIR):
    if "index.html" not in filenames:
        continue

    # Derive "<host>/<path>" from the location relative to HTML_DIR.  A plain
    # str.replace("html/", "") would also strip inner "html/" segments
    # (e.g. ".../html/about" or ".../xhtml/") and corrupt the URL.
    rel_dir = os.path.relpath(dirpath, HTML_DIR)
    if rel_dir == os.curdir:
        # An index.html directly in HTML_DIR has no host component, so no
        # valid URL can be built for it.
        continue
    site_path = rel_dir.replace(os.sep, "/")

    pages.append(
        {
            "url": "https://" + site_path + "/index.html",
            "base_path": "https://" + site_path,
            "abs_path": os.path.join(dirpath, "index.html"),
        }
    )

In [3]:
# Sanity check: display the first discovered page record.
pages[0]

{'url': 'https://caras.arch.ku.ac.th/carasn/index.html',
 'base_path': 'https://caras.arch.ku.ac.th/carasn',
 'abs_path': 'html/caras.arch.ku.ac.th/carasn/index.html'}

In [4]:
# Drop duplicate page records.  Records are keyed by their URL: the first
# occurrence fixes the position, the last occurrence supplies the record.
unique_key = "url"
pages = list(dict((record[unique_key], record) for record in pages).values())

# Node order of the graph: position in this list is the node index.
all_urls = [record["url"] for record in pages]

In [5]:
def remove_query_from_url(url):
    """Rebuild ``url`` from its scheme, network location and path only.

    Everything urlparse() places after the path (parameters, query string,
    fragment) is discarded.  The "://" separator is always emitted, even when
    the scheme or network location is empty.
    """
    parts = urlparse(url)
    return parts.scheme + "://" + parts.netloc + parts.path


def normalization_url(url, base_url):
    """Turn an href into the canonical page URL used as a graph key.

    Steps, in order: resolve against ``base_url``, drop the fragment,
    percent-decode, trim leading/trailing slashes, drop the query string and
    finally pass the result through format_url().

    Returns the canonical URL, or None when any step raises; in that case the
    original href, the base URL and the error are printed.
    """
    try:
        # Keep the untouched href so the error report shows what was passed in.
        x = "%s" % url

        absolute = urljoin(base_url, url)
        # Everything from the first "#" on is a fragment (same-page reference).
        without_fragment = re.sub(r"#.*", "", absolute)
        decoded = unquote(without_fragment)
        # Trim forward slashes at both ends.
        trimmed = decoded.strip("/")
        without_query = remove_query_from_url(trimmed)

        return format_url(without_query)
    except Exception as e:
        print(x, base_url)
        print(e)
        return None


def format_url(url):
    """Map a URL onto the canonical ``https://<host>/<folder>/<file>`` form.

    * Runs of slashes are collapsed to one, except the "//" right after the
      scheme colon.
    * The scheme is always rewritten to https and only the hostname is kept
      from the network location (urlparse lower-cases it and drops the port).
    * When the path contains a dot, its last segment is treated as the file
      name; otherwise the whole path is a folder and "index.html" is appended.

    NOTE(review): a dot anywhere in the path, even in a directory name, makes
    the last segment count as a file.  This rule is kept unchanged because the
    crawled directory layout was presumably produced with it - confirm before
    tightening.

    Args:
        url: absolute URL string.

    Returns:
        The canonical URL string.

    Raises:
        ValueError: if ``url`` has no hostname.
    """
    # Collapse every run of 2+ slashes that is preceded by a non-colon character.
    url = re.sub(r"(?<=[^:])/{2,}", "/", url)
    parsed = urlparse(url)
    hostname = parsed.hostname
    if not hostname:
        raise ValueError("URL has no hostname: %r" % (url,))
    url_path = parsed.path

    filetype = re.match(r".*\.(.*)$", url_path)
    if filetype is not None:
        # Split the parsed path instead of splitting the raw URL on
        # urljoin(url, "."): urljoin normalizes the URL (e.g. lower-cases the
        # scheme), so that prefix may not occur in ``url`` at all, and the raw
        # URL tail would also carry any query string into the file name.
        segments = url_path.split("/")
        save_folder_path = hostname + "/".join(segments[:-1])
        save_filename = segments[-1]
    else:
        save_folder_path = hostname + url_path
        save_filename = "index.html"

    save_folder_path = save_folder_path.strip("/")
    save_abs_path = "https://" + save_folder_path + "/" + save_filename

    return save_abs_path

In [6]:
def filters_urls(urls, base_url):
    """Keep only URLs on the whitelisted domain that look like HTML pages.

    A URL is kept when
    * its hostname is ``whitelist_domain`` itself or a sub-domain of it, and
    * its path either contains no dot at all or ends in an extension listed
      in ``whitelist_file_types``.

    Args:
        urls: iterable of URL strings.  URLs without a hostname (relative
            links, ``mailto:``, ``javascript:`` ...) are dropped, so relative
            hrefs must be resolved before they are passed in.
        base_url: not used by the filter; kept so existing calls keep working.

    Returns:
        List of the accepted URLs, unchanged and in input order.
    """
    domain_suffix = "." + whitelist_domain
    filtered_urls = []
    for url in urls:
        parsed = urlparse(url)
        hostname = parsed.hostname
        if not hostname:
            continue

        # Compare on a label boundary.  A plain substring test would also
        # accept foreign hosts such as "kku.ac.th" or "ku.ac.th.example.com".
        hostname = hostname.rstrip(".")
        if hostname != whitelist_domain and not hostname.endswith(domain_suffix):
            continue

        # Text after the last dot of the path is taken as the file extension.
        filetype = re.match(r".*\.(.*)$", parsed.path)
        if filetype is None or filetype[1] in whitelist_file_types:
            filtered_urls.append(url)

    return filtered_urls

In [7]:
# Build the web graph.  Line i of OUT_WEB_GRAPH lists the node indices page i
# links to (comma separated, no self loops); line i of OUT_URL_MAP is page i's URL.

# URL -> node index lookup, so each link costs one dict access instead of a
# linear scan of all_urls.
url_to_index = {page_url: index for index, page_url in enumerate(all_urls)}

# Open both outputs once in write mode: re-running this cell then replaces the
# files instead of appending duplicate lines, which would inflate the node
# count read back when the graph is loaded.
with open(OUT_WEB_GRAPH, "w", encoding="utf-8") as graph_file, \
        open(OUT_URL_MAP, "w", encoding="utf-8") as url_map_file:
    for i, page in enumerate(pages):
        url = page['url']
        abs_path = page['abs_path']

        # Read raw bytes and let BeautifulSoup detect the document encoding;
        # text mode would decode with the platform default and can fail on
        # pages saved in another charset.
        with open(abs_path, 'rb') as file:
            raw_html = file.read()

        soup = BeautifulSoup(raw_html, 'html.parser')

        # Resolve every href against the page's own URL *before* filtering.
        # filters_urls() rejects URLs without a hostname, so unresolved
        # relative links would all be lost; and the page URL (not its
        # directory without a trailing slash) is the correct base for
        # relative resolution.
        href_list = []
        for anchor in soup.find_all('a', href=True):
            try:
                href_list.append(urljoin(url, anchor.get('href')))
            except ValueError:
                # Malformed href that urllib refuses to parse: skip it.
                continue

        href_list = filters_urls(href_list, url)
        href_list = [normalization_url(href, url) for href in href_list]

        # Keep links to known pages only (failed normalizations are None and
        # never match), de-duplicate, and eliminate the self loop.
        adj_indices = {url_to_index[href] for href in href_list if href in url_to_index}
        adj_indices.discard(i)
        adj_list = [str(index) for index in sorted(adj_indices)]

        graph_file.write(','.join(adj_list) + '\n')
        url_map_file.write(url + '\n')


## PageRank

In [8]:
# Load the adjacency file into a weighted edge list.
# Each edge is [source index, target index, weight]; line number = source index.
edges = []
num_node = 0

with open(OUT_WEB_GRAPH, "r") as file:
    lines = file.readlines()
    num_node = len(lines)
    for source, raw_line in enumerate(lines):
        targets = raw_line.strip().split(',')
        # A blank line splits into [''] and contributes no edge; otherwise
        # every out-link gets an equal share 1/len(targets) of the weight.
        edges.extend(
            [source, int(target), 1/len(targets)] for target in targets if target
        )

In [9]:
# Dense transition matrix: p[source, target] is the weight of that edge,
# zero where there is no edge.
p = np.zeros((num_node, num_node))

for source, target, weight in edges:
    p[source, target] = weight

In [10]:
def pagerank(p, alpha=0.85, max_iter=100, tol=1.0e-6):
    """Compute PageRank scores by power iteration.

    Args:
        p: square (n, n) matrix; ``p[i, j]`` is the weight of the link from
            node i to node j.  Rows that sum to zero (pages without
            out-links) are treated as linking uniformly to every node.
            The matrix is not modified.
        alpha: damping factor, the share of rank that follows links.
        max_iter: maximum number of iterations.
        tol: stop once the L1 change between two iterations drops below this.

    Returns:
        ``(n, 1)`` array of scores (``(0, 1)`` for an empty graph).

    Raises:
        ValueError: if ``p`` is not a square 2-D matrix.
    """
    p = np.asarray(p, dtype=float)
    if p.ndim != 2 or p.shape[0] != p.shape[1]:
        raise ValueError("p must be a square 2-D matrix, got shape %r" % (p.shape,))

    # Size comes from the matrix itself, not from a notebook-level global.
    n = p.shape[0]
    if n == 0:
        return np.zeros((0, 1))

    initial_value = 1 / n

    # Rank-leak fix: a dangling row behaves like a uniform row of 1/n.
    # Instead of overwriting those rows in the caller's matrix, add their
    # combined rank, spread evenly, to every node on each iteration; the
    # result is the same as with the filled-in rows.
    dangling = p.sum(axis=1) == 0

    p_t = p.T
    teleport = (1 - alpha) * initial_value
    r = np.full((n, 1), initial_value)
    for _ in range(max_iter):
        prev_r = r
        dangling_share = prev_r[dangling].sum() * initial_value
        r = alpha * (p_t.dot(prev_r) + dangling_share) + teleport

        err = np.abs(r - prev_r).sum()
        if err < tol:
            break

    return r

In [11]:
# Score every node using pagerank()'s default alpha, max_iter and tol.
pagerank_scores = pagerank(p)

In [12]:
# Reshape the scores to a 1-D array of length num_node, so that
# pagerank_scores[i] is the score of node i.
pagerank_scores = pagerank_scores.reshape(num_node)

In [13]:
# Persist the scores, one per line in node order (no trailing newline).
with open(OUT_PAGERANK_SCORE, "w") as file:
    score_text = '\n'.join(map(str, pagerank_scores))
    file.write(score_text)

In [14]:
# Look-up table from page URL to its PageRank score (node i <-> pages[i]).
pagerank_mapper = {
    record['url']: pagerank_scores[index] for index, record in enumerate(pages)
}

In [15]:
# Save the URL -> score table as pretty-printed JSON.
with open(OUT_PAGERANK_MAPPER, 'w') as file:
    mapper_json = json.dumps(pagerank_mapper, indent=4)
    file.write(mapper_json)