In [None]:
from os import environ

from pyspark.sql import SparkSession

# Tell PySpark which Python interpreter to use (the project's virtual environment).
environ["PYSPARK_PYTHON"] = "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"

# Create (or reuse) a Spark session that runs on YARN with three executors.
session = (
    SparkSession.builder
    .master("yarn")
    .appName("web-archive-query-log-num-results-per-serp")
    .config("spark.executor.instances", 3)
    .getOrCreate()
)

In [None]:
# Keep a short handle on the session's SparkContext; later cells use it to build RDDs.
sc = session.sparkContext
# Bare expression so the notebook displays the context as the cell output.
sc

In [None]:
from pathlib import Path

# Root directory of the query log data; other data paths are derived from it.
global_data_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/")
# Bare expression so the notebook displays the resolved path.
global_data_dir

In [None]:
# The "focused" subdirectory is the base that `paths_jsonl` / `paths_warc` search in.
data_dir = global_data_dir / "focused"
# Bare expression so the notebook displays the resolved path.
data_dir

In [None]:
from pathlib import Path

# Corpus locations are derived from `data_dir` (the "focused" directory) so
# that they cannot drift apart from the other data paths in this notebook.
#
# TODO: For final evaluation, use the full corpus.
# corpus_dir = data_dir / "corpus"
# queries_dir = corpus_dir / "queries-2023-02-XX"
# documents_dir = corpus_dir / "documents-2023-02-XX"
corpus_dir = data_dir / "sample-corpus"
queries_dir = corpus_dir / "queries-2023-02-18"
documents_dir = corpus_dir / "documents-2023-02-18"

In [None]:
from pathlib import Path

# Output directory for figures, relative to the notebook's working directory.
figures_dir = Path("figures")
# Create it up front; `exist_ok=True` keeps the cell safe to re-run.
figures_dir.mkdir(parents=True, exist_ok=True)

In [None]:
def paths_jsonl(service: str, base_type: str):
    """Lazily yield the gzipped JSONL files found for one service.

    Searches ``data_dir / base_type / service`` for paths matching
    ``*/*.jsonl.gz``, i.e. files located exactly one directory level below
    the service directory.

    :param service: Name of the service sub-directory.
    :param base_type: Name of the directory (below ``data_dir``) that holds
        the per-service directories.
    :return: Generator of matching paths; it yields nothing when the service
        directory does not exist.
    """
    base_path = data_dir / base_type / service
    if not base_path.exists():
        # This function is a generator, so a returned value would never reach
        # the caller's loop; a bare ``return`` simply ends the iteration.
        return
    yield from base_path.glob("*/*.jsonl.gz")

In [None]:
def paths_warc(service: str, base_type: str):
    """Lazily yield the directories found two levels below a service directory.

    Searches ``data_dir / base_type / service`` for entries matching ``*/*``
    and keeps only those that are directories.

    :param service: Name of the service sub-directory.
    :param base_type: Name of the directory (below ``data_dir``) that holds
        the per-service directories.
    :return: Generator of matching directory paths; it yields nothing when
        the service directory does not exist.
    """
    base_path = data_dir / base_type / service
    if not base_path.exists():
        # This function is a generator, so a returned value would never reach
        # the caller's loop; a bare ``return`` simply ends the iteration.
        return
    for path in base_path.glob("*/*"):
        if path.is_dir():
            yield path

In [None]:
from gzip import GzipFile


def count_jsonl(path: Path) -> int:
    """Count the lines of a gzip-compressed JSONL file.

    Counting is best-effort: if the file cannot be opened or decompressed,
    the problem is reported on stdout and ``0`` is returned instead of
    raising.

    :param path: Path of the ``.jsonl.gz`` file.
    :return: Number of lines in the decompressed file, or ``0`` on failure.
    """
    print(f"Count JSONL records in {path}.")
    try:
        with GzipFile(path, "r") as file:
            return sum(1 for _ in file)
    except Exception as error:
        # Only catch ``Exception``: a bare ``except`` would also swallow
        # ``KeyboardInterrupt`` and ``SystemExit``, making the job hard to stop.
        print(f"Failed to count JSONL records in {path}: {error!r}")
        return 0

In [None]:
from json import loads, JSONDecodeError
from typing import Iterator

def read_jsonl(path: Path) -> Iterator:
    """Lazily yield the parsed records of a gzip-compressed JSONL file.

    Reading is best-effort: lines that cannot be decoded or parsed as JSON
    are skipped individually, and a file that cannot be opened or
    decompressed ends the iteration early (reported on stdout) instead of
    raising.

    :param path: Path of the ``.jsonl.gz`` file.
    :return: Generator of the values parsed from each valid line.
    """
    print(f"Read JSONL records in {path}.")
    try:
        with GzipFile(path, "r") as gzip_file:
            for line in gzip_file:
                try:
                    record = loads(line)
                except (JSONDecodeError, UnicodeDecodeError):
                    # Lines are bytes, so ``loads`` may fail while decoding
                    # as well as while parsing. Skip just this line rather
                    # than abandoning the rest of the file.
                    continue
                yield record
    except Exception as error:
        # Only catch ``Exception``: a bare ``except`` around a ``yield`` would
        # also intercept ``GeneratorExit`` and ``KeyboardInterrupt``.
        print(f"Failed to read JSONL records in {path}: {error!r}")
        return

In [None]:
# Load the service names, one per line, from `services.txt`.
with Path("services.txt").open("rt") as file:
    alexa_services = [
        service
        for service in (line.strip() for line in file)
        # Lines read from a file keep their newline and are therefore always
        # truthy; test the stripped value so blank lines are really dropped.
        if service
    ]
len(alexa_services)

In [None]:
from operator import add

# Mean number of results per parsed SERP, computed over all services that are
# not keys of `top_services`.
num_search_results_per_serp = (
    sc.parallelize(alexa_services, 1000)
    # NOTE(review): `top_services` is not defined in any cell visible here;
    # confirm it exists on a fresh kernel before relying on this cell.
    .filter(lambda name: name not in top_services.keys())
    # Expand each remaining service into the paths of its parsed SERP files.
    .flatMap(lambda name: paths_jsonl(name, "archived-parsed-serps"))
    .repartition(10_000)
    # Expand each file into its parsed records.
    .flatMap(read_jsonl)
    # Keep only records whose "results" entry is present, then count them.
    .filter(lambda record: record["results"] is not None)
    .map(lambda record: len(record["results"]))
    .mean()
)
num_search_results_per_serp