In [4]:
from os import environ

from pyspark.sql import SparkSession

environ[
    'PYSPARK_PYTHON'] = "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"
session = SparkSession.builder \
    .master("yarn") \
    .appName("web-archive-query-log-most-referenced-domains") \
    .config("spark.executor.instances", 3) \
    .getOrCreate()

In [5]:
sc = session.sparkContext
sc

In [6]:
from pathlib import Path

global_data_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/")
global_data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log')

In [7]:
data_dir = global_data_dir / "focused"
data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused')

In [8]:
from pathlib import Path

figures_dir = Path("figures")
figures_dir.mkdir(parents=True, exist_ok=True)

In [9]:
def paths_jsonl(service: str, base_type: str):
    base_path = data_dir / base_type / service
    if not base_path.exists():
        return []
    yield from base_path.glob("*/*.jsonl.gz")

In [17]:
from json import loads, JSONDecodeError
from typing import Iterator
from gzip import GzipFile

def read_jsonl(path: Path) -> Iterator:
    print(f"Read JSONL records in {path}.")
    try:
        with GzipFile(path, "r") as gzip_file:
            for line in gzip_file:
                try:
                    url = loads(line)
                except JSONDecodeError:
                    continue
                yield url
    except:
        return 

In [18]:
with Path("services.txt").open("rt") as file:
    alexa_services = [
        line.strip()
        for line in file
        if line
    ]
len(alexa_services)

1087

In [34]:
!pip install publicsuffixlist



In [71]:
from urllib.parse import urlsplit
from publicsuffixlist import PublicSuffixList

public_suffix_list = PublicSuffixList()

def domain(url: str) -> str:
    hostname = urlsplit(url).hostname
    if hostname is None:
        return None
    public_suffix = public_suffix_list.publicsuffix(hostname)
    second_level_domain = public_suffix_list.subdomain(hostname, 0)
    if second_level_domain is None:
        second_level_domain = public_suffix
    return second_level_domain

In [72]:
from json import loads
from pandas import DataFrame
from datetime import datetime

def most_referenced_results(k: int, external: bool) -> dict:
    rdd = sc.parallelize(alexa_services, 1000)\
        .flatMap(lambda service: paths_jsonl(service, "archived-parsed-serps"))\
        .repartition(10_000)\
        .flatMap(read_jsonl)\
        .filter(lambda serp: serp["results"] is not None)\
        .keyBy(lambda serp: domain(serp["url"]))\
        .flatMapValues(lambda serp: serp["results"][:k])\
        .mapValues(lambda result: domain(result["url"]))
    if external:
        rdd = rdd.filter(lambda domains: domains[0] != domains[1])
    return rdd.values()\
        .keyBy(lambda domain: domain)\
        .countByKey()

In [73]:
most_referenced_results_5 = most_referenced_results(5, external=True)
len(most_referenced_results_5)

791656

In [74]:
most_referenced_results_10 = most_referenced_results(10, external=True)
len(most_referenced_results_10)

1243208

In [75]:
def self_references(k: int) -> dict:
    return sc.parallelize(alexa_services, 1000)\
        .flatMap(lambda service: paths_jsonl(service, "archived-parsed-serps"))\
        .repartition(10_000)\
        .flatMap(read_jsonl)\
        .filter(lambda serp: serp["results"] is not None)\
        .keyBy(lambda serp: domain(serp["url"]))\
        .flatMapValues(lambda serp: serp["results"][:k])\
        .mapValues(lambda result: domain(result["url"]))\
        .filter(lambda domains: domains[0] == domains[1])\
        .count()

In [76]:
self_5 = self_references(5)
self_5

13265146

In [77]:
self_10 = self_references(10)
self_10

24330691

In [82]:
top = sorted(
    most_referenced_results_5.items(),
    key=lambda domain_count: domain_count[1],
    reverse=True,
)[:20]
top

[('wikipedia.org', 544202),
 ('youtube.com', 145871),
 ('facebook.com', 111251),
 ('linkedin.com', 74866),
 ('imdb.com', 64279),
 ('instagram.com', 62059),
 ('amazon.com', 56661),
 ('pinterest.com', 46803),
 ('devroye.org', 46408),
 ('twitter.com', 41317),
 ('fandom.com', 37848),
 (None, 32370),
 ('yahoo.com', 27807),
 ('google.com', 27737),
 ('behance.net', 26976),
 ('merriam-webster.com', 25482),
 ('tistory.com', 25359),
 ('xvideos.com', 23251),
 ('thefreedictionary.com', 20417),
 ('discogs.com', 20121)]

In [83]:
for d, num_5 in top:
    num_10 = most_referenced_results_10.get(d, 0)
    print(fr"\domain{{{d}}} & {num_5:,.0f} & {num_10:,.0f} \\")
print(r"\midrule")
print(fr"self-references & {self_5:,.0f} & {self_10:,.0f} \\")

\domain{wikipedia.org} & 544,202 & 758,860 \\
\domain{youtube.com} & 145,871 & 234,522 \\
\domain{facebook.com} & 111,251 & 176,381 \\
\domain{linkedin.com} & 74,866 & 107,182 \\
\domain{imdb.com} & 64,279 & 93,207 \\
\domain{instagram.com} & 62,059 & 84,291 \\
\domain{amazon.com} & 56,661 & 95,388 \\
\domain{pinterest.com} & 46,803 & 97,408 \\
\domain{devroye.org} & 46,408 & 62,012 \\
\domain{twitter.com} & 41,317 & 68,598 \\
\domain{fandom.com} & 37,848 & 61,127 \\
\domain{None} & 32,370 & 64,278 \\
\domain{yahoo.com} & 27,807 & 53,559 \\
\domain{google.com} & 27,737 & 49,431 \\
\domain{behance.net} & 26,976 & 34,435 \\
\domain{merriam-webster.com} & 25,482 & 32,805 \\
\domain{tistory.com} & 25,359 & 31,643 \\
\domain{xvideos.com} & 23,251 & 46,598 \\
\domain{thefreedictionary.com} & 20,417 & 29,318 \\
\domain{discogs.com} & 20,121 & 30,090 \\
\midrule
self-references & 13,265,146 & 24,330,691 \\
