In [1]:
from os import environ

from pyspark.sql import SparkSession

# Tell PySpark which Python interpreter to use: the project's virtualenv.
environ['PYSPARK_PYTHON'] = "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"

# Create (or reuse) a session on YARN that requests three executors.
session = (
    SparkSession.builder
    .master("yarn")
    .appName("web-archive-query-log-most-referenced-domains")
    .config("spark.executor.instances", 3)
    .getOrCreate()
)

In [2]:
# SparkContext of the session; the RDD pipelines in this notebook start from it.
sc = session.sparkContext
sc

In [3]:
from pathlib import Path

# Root directory of the query log data; `data_dir` is derived from it.
global_data_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/")
global_data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log')

In [4]:
# The "focused" subdirectory is the base directory that paths_jsonl() reads from.
data_dir = global_data_dir / "focused"
data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused')

In [5]:
from pathlib import Path

# Output directory for figures, relative to the working directory.
figures_dir = Path("figures")
# Create it (including parents) if needed; an existing directory is not an error.
figures_dir.mkdir(parents=True, exist_ok=True)

In [6]:
def paths_jsonl(service: str, base_type: str):
    """Yield the gzipped JSONL files stored for one service.

    The files are searched in the direct subdirectories of
    ``data_dir / base_type / service``, i.e. they match the pattern
    ``*/*.jsonl.gz`` relative to that directory.

    :param service: Name of the service (directory name below ``base_type``).
    :param base_type: Name of the directory below ``data_dir`` to search in,
        e.g. ``"archived-parsed-serps"``.
    :return: Generator over the matching paths. It yields nothing if the
        service has no directory.
    """
    base_path = data_dir / base_type / service
    if not base_path.exists():
        # This is a generator function, so a returned list would never reach
        # the caller; a bare return simply ends the (empty) iteration.
        return
    yield from base_path.glob("*/*.jsonl.gz")

In [7]:
from gzip import GzipFile
from json import loads, JSONDecodeError
from typing import Iterator, Optional
from zlib import error as ZlibError

def read_jsonl(path: Path) -> Iterator:
    """Lazily parse the records of a gzip-compressed JSON Lines file.

    Reading is best-effort: lines that cannot be decoded as JSON are skipped,
    and a file that cannot be opened or decompressed ends the iteration
    (records parsed before the failure have already been yielded).

    :param path: Path of the ``.jsonl.gz`` file to read.
    :return: Iterator over the parsed JSON value of every valid line.
    """
    print(f"Read JSONL records in {path}.")
    try:
        with GzipFile(path, "r") as gzip_file:
            for line in gzip_file:
                try:
                    record = loads(line)
                except (JSONDecodeError, UnicodeDecodeError):
                    # Malformed or undecodable line: skip it, keep reading.
                    continue
                yield record
    except (OSError, EOFError, ZlibError) as error:
        # Missing, truncated or corrupt file: give up on this file only.
        # Unlike a bare ``except``, this lets KeyboardInterrupt, SystemExit
        # and programming errors propagate instead of hiding them.
        print(f"Stop reading {path}: {error!r}")

In [8]:
# Load the service names, one per line.
with Path("services.txt").open("rt") as file:
    alexa_services = [
        name
        # Strip first and filter on the stripped value: a raw line still
        # carries its newline and is therefore always truthy, so blank lines
        # would otherwise end up as empty service names.
        for name in map(str.strip, file)
        if name
    ]
len(alexa_services)

1087

In [9]:
!pip install publicsuffixlist



In [10]:
from urllib.parse import urlsplit
from publicsuffixlist import PublicSuffixList

# Single PublicSuffixList instance, created once and used by domain().
public_suffix_list = PublicSuffixList()

def domain(url: str) -> Optional[str]:
    """Reduce a URL to the domain of its host name.

    :param url: URL to extract the domain from.
    :return: ``public_suffix_list.subdomain(hostname, 0)`` (presumably the
        public suffix plus one label, e.g. ``example.co.uk``); the public
        suffix itself if that lookup yields ``None``; ``None`` if the URL has
        no host name or cannot be parsed.
    """
    try:
        hostname = urlsplit(url).hostname
    except ValueError:
        # urlsplit() raises ValueError for malformed network locations such
        # as unbalanced IPv6 brackets. Treat these like URLs without a host
        # name instead of aborting the whole job on one bad URL.
        return None
    if hostname is None:
        return None
    registrable_domain = public_suffix_list.subdomain(hostname, 0)
    if registrable_domain is None:
        # Fall back to the public suffix; it is only looked up when needed.
        return public_suffix_list.publicsuffix(hostname)
    return registrable_domain

In [11]:
from json import loads
from pandas import DataFrame
from datetime import datetime

def referenced_results(k: int) -> int:
    """Count the results within the first ``k`` entries of the selected SERPs.

    The pipeline reads the ``archived-parsed-serps`` files of all services in
    ``alexa_services`` and keeps SERPs that have results, whose offset is
    missing or 0, and whose page is missing or at most 1. It applies the same
    steps as ``most_referenced_results`` and ``self_references`` (including
    the domain extraction), so the three numbers refer to the same results.

    :param k: Number of leading results to take from each SERP.
    :return: Total number of results counted (an ``int``).
    """
    return (
        sc.parallelize(alexa_services, 1000)
        .flatMap(lambda service: paths_jsonl(service, "archived-parsed-serps"))
        .repartition(10_000)
        .flatMap(read_jsonl)
        # Keep only SERPs with a results list ...
        .filter(lambda serp: serp["results"] is not None)
        # ... whose offset is missing or 0 ...
        .filter(lambda serp: serp["offset"] is None or serp["offset"] == 0)
        # ... and whose page is missing or at most 1.
        .filter(lambda serp: serp["page"] is None or serp["page"] <= 1)
        # Pair the domain of the SERP URL with each of its first k results.
        .keyBy(lambda serp: domain(serp["url"]))
        .flatMapValues(lambda serp: serp["results"][:k])
        .mapValues(lambda result: domain(result["url"]))
        .values()
        .count()
    )

In [12]:
# Number of results counted when taking the first 5 results per SERP.
total_5 = referenced_results(5)
total_5

19062218

In [13]:
# Number of results counted when taking the first 10 results per SERP.
total_10 = referenced_results(10)
total_10

34545570

In [14]:
from json import loads
from pandas import DataFrame
from datetime import datetime

def most_referenced_results(k: int, external: bool) -> dict:
    """Count, per result domain, the results within the first ``k`` entries.

    SERPs are selected as in ``referenced_results``: they must have results,
    a missing or zero offset and a missing page or a page of at most 1.

    :param k: Number of leading results to take from each SERP.
    :param external: If true, drop results whose domain equals the domain of
        the SERP URL they appear on.
    :return: Mapping from result domain to the number of counted results.
    """
    serp_and_result_domains = (
        sc.parallelize(alexa_services, 1000)
        .flatMap(lambda service: paths_jsonl(service, "archived-parsed-serps"))
        .repartition(10_000)
        .flatMap(read_jsonl)
        .filter(lambda serp: serp["results"] is not None)
        .filter(lambda serp: serp["offset"] is None or serp["offset"] == 0)
        .filter(lambda serp: serp["page"] is None or serp["page"] <= 1)
        .keyBy(lambda serp: domain(serp["url"]))
        .flatMapValues(lambda serp: serp["results"][:k])
        .mapValues(lambda result: domain(result["url"]))
    )
    if external:
        # Each element is a (SERP domain, result domain) pair.
        serp_and_result_domains = serp_and_result_domains.filter(
            lambda pair: pair[0] != pair[1]
        )
    # Counting the values directly is equivalent to keying every value by
    # itself and counting by key.
    return serp_and_result_domains.values().countByValue()

In [15]:
# Per-domain counts of external results among the first 5 results per SERP;
# the displayed value is the number of distinct domains.
most_referenced_results_5 = most_referenced_results(5, external=True)
len(most_referenced_results_5)

791656

In [16]:
# Per-domain counts of external results among the first 10 results per SERP;
# the displayed value is the number of distinct domains.
most_referenced_results_10 = most_referenced_results(10, external=True)
len(most_referenced_results_10)

1243208

In [17]:
def self_references(k: int) -> int:
    """Count results whose domain equals the domain of their SERP's URL.

    SERPs are selected as in ``referenced_results``: they must have results,
    a missing or zero offset and a missing page or a page of at most 1. Only
    the first ``k`` results of each SERP are considered.

    NOTE(review): ``domain`` returns ``None`` for URLs without a host name, so
    a pair of two such URLs also compares equal and is counted here.

    :param k: Number of leading results to take from each SERP.
    :return: Number of self-referencing results (an ``int``).
    """
    return (
        sc.parallelize(alexa_services, 1000)
        .flatMap(lambda service: paths_jsonl(service, "archived-parsed-serps"))
        .repartition(10_000)
        .flatMap(read_jsonl)
        .filter(lambda serp: serp["results"] is not None)
        .filter(lambda serp: serp["offset"] is None or serp["offset"] == 0)
        .filter(lambda serp: serp["page"] is None or serp["page"] <= 1)
        .keyBy(lambda serp: domain(serp["url"]))
        .flatMapValues(lambda serp: serp["results"][:k])
        .mapValues(lambda result: domain(result["url"]))
        # Each element is a (SERP domain, result domain) pair.
        .filter(lambda domains: domains[0] == domains[1])
        .count()
    )

In [18]:
# Number of self-references among the first 5 results per SERP.
self_5 = self_references(5)
self_5

13265146

In [19]:
# Number of self-references among the first 10 results per SERP.
self_10 = self_references(10)
self_10

24330691

In [20]:
# Number of domains to list individually in the table.
table_k = 10

In [21]:
# The `table_k` domains with the highest counts at k=5, as (domain, count)
# pairs in descending order of count.
top = sorted(
    most_referenced_results_5.items(),
    key=lambda domain_count: domain_count[1],
    reverse=True,
)[:table_k]
top

[('wikipedia.org', 544202),
 ('youtube.com', 145871),
 ('facebook.com', 111251),
 ('linkedin.com', 74866),
 ('imdb.com', 64279),
 ('instagram.com', 62059),
 ('amazon.com', 56661),
 ('pinterest.com', 46803),
 ('devroye.org', 46408),
 ('twitter.com', 41317)]

In [22]:
# Number of domains (at k=5) that are not listed individually in `top`.
num_remaining = len(most_referenced_results_5) - len(top)
num_remaining

791646

In [23]:
# Results at k=5 that are neither self-references nor references to one of
# the domains in `top`.
remaining_5 = total_5 - self_5 - sum(most_referenced_results_5[domain] for domain, _ in top)
remaining_5

4603355

In [24]:
# Same remainder at k=10. Note that `top` is still the ranking by k=5 counts;
# only the counts are taken from the k=10 results.
remaining_10 = total_10 - self_10 - sum(most_referenced_results_10[domain] for domain, _ in top)
remaining_10

8437030

In [25]:
# Print the rows of a LaTeX table: one row per domain in `top` with its share
# of all counted results at k=5 and k=10, in percent.
for d, num_5 in top:
    # A domain without a k=10 count is reported as 0.
    num_10 = most_referenced_results_10.get(d, 0)
    print(fr"\domain{{{d}}} & {(num_5/total_5*100):.1f}\,\% & {(num_10/total_10*100):.1f}\,\% \\")
print(r"\midrule")
# Summary row for all other domains; `num_remaining` is the k=5 domain count.
print(fr"{num_remaining} others & {(remaining_5/total_5*100):.1f}\,\% & {(remaining_10/total_10*100):.1f}\,\% \\")
print(r"\midrule")
# Summary row for results that point to the domain of their own SERP.
print(fr"self-references & {(self_5/total_5*100):.1f}\,\% & {(self_10/total_10*100):.1f}\,\% \\")

\domain{wikipedia.org} & 2.9\,\% & 2.2\,\% \\
\domain{youtube.com} & 0.8\,\% & 0.7\,\% \\
\domain{facebook.com} & 0.6\,\% & 0.5\,\% \\
\domain{linkedin.com} & 0.4\,\% & 0.3\,\% \\
\domain{imdb.com} & 0.3\,\% & 0.3\,\% \\
\domain{instagram.com} & 0.3\,\% & 0.2\,\% \\
\domain{amazon.com} & 0.3\,\% & 0.3\,\% \\
\domain{pinterest.com} & 0.2\,\% & 0.3\,\% \\
\domain{devroye.org} & 0.2\,\% & 0.2\,\% \\
\domain{twitter.com} & 0.2\,\% & 0.2\,\% \\
\midrule
791646 others & 24.1\,\% & 24.4\,\% \\
\midrule
self-references & 69.6\,\% & 70.4\,\% \\


In [32]:
# Spot check: fetch ten ((service, query), result domain) entries whose result
# domain is devroye.org. Unlike the counting functions, this looks at all
# results of a SERP, not only the first k.
(
    sc.parallelize(alexa_services, 1000)
    .keyBy(lambda name: name)
    .flatMapValues(lambda name: paths_jsonl(name, "archived-parsed-serps"))
    .repartition(10_000)
    .flatMapValues(read_jsonl)
    # Elements are (service, serp) pairs from here on.
    .filter(lambda pair: pair[1]["results"] is not None)
    .filter(lambda pair: pair[1]["offset"] is None or pair[1]["offset"] == 0)
    .filter(lambda pair: pair[1]["page"] is None or pair[1]["page"] <= 1)
    # Re-key by (service, query) and fan out to one element per result.
    .map(lambda pair: ((pair[0], pair[1]["query"]), pair[1]["results"]))
    .flatMapValues(lambda serp_results: serp_results)
    .mapValues(lambda serp_result: domain(serp_result["url"]))
    .filter(lambda keyed_domain: keyed_domain[1] == "devroye.org")
    .take(10)
)

[(('google', '"LS Graphic Design Tipografia"'), 'devroye.org'),
 (('google', '"LS Graphic Design Tipografia"'), 'devroye.org'),
 (('google', '"LSU Webliography  TrueType Fonts"'), 'devroye.org'),
 (('google', '"LSU Webliography  TrueType Fonts"'), 'devroye.org'),
 (('google', '"LSTTF"'), 'devroye.org'),
 (('google', '"LTypI  Lack of Typographic Imagination"'), 'devroye.org'),
 (('google', '"Louise Rigaux"'), 'devroye.org'),
 (('google', '"Louise Ross" "The Autumn Rabbit"'), 'devroye.org'),
 (('google', '"Louise Vieusseux"'), 'devroye.org'),
 (('google', '"Lourdes Garcia Traverso"'), 'devroye.org')]