In [1]:
from os import environ

from pyspark.sql import SparkSession

# Python binary that PySpark should use (the project's virtual environment).
environ['PYSPARK_PYTHON'] = "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"

# Build the Spark session on YARN with three executor instances
# (``getOrCreate`` returns an already running session if there is one).
session = (
    SparkSession.builder
    .master("yarn")
    .appName("web-archive-query-log-trec-overlap")
    .config("spark.executor.instances", 3)
    .getOrCreate()
)

In [2]:
# Keep a handle on the SparkContext behind the session; it is used further
# down to parallelize the list of services. The bare name on the last line
# displays the context as the cell output.
sc = session.sparkContext
sc

In [3]:
from pathlib import Path

# Base directory of the project data; `data_dir` is derived from it.
global_data_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/")
global_data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log')

In [4]:
# The "focused" subdirectory of the base directory. `paths_jsonl` and
# `paths_warc` look up their files below this directory.
data_dir = global_data_dir / "focused"
data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused')

In [5]:
from pathlib import Path

# The active assignments below point at the sample corpus and its
# `queries-2023-02-18` / `documents-2023-02-18` subdirectories.
# TODO: For the final evaluation, use the full corpus instead, i.e., replace
# the active assignments with the commented-out ones:
# corpus_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/corpus")
# queries_dir = corpus_dir / "queries-2023-02-XX"
# documents_dir = corpus_dir / "documents-2023-02-XX"
corpus_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/sample-corpus")
queries_dir = corpus_dir / "queries-2023-02-18"
documents_dir = corpus_dir / "documents-2023-02-18"

In [6]:
from pathlib import Path

# Directory for figures, relative to the current working directory.
# It is created (including missing parents) if it does not exist yet.
figures_dir = Path("figures")
figures_dir.mkdir(parents=True, exist_ok=True)

In [7]:
def paths_jsonl(service: str, base_type: str):
    """
    Yield the gzipped JSONL files stored for one service.

    Files are looked up as ``<data_dir>/<base_type>/<service>/*/*.jsonl.gz``.

    :param service: Name of the service; used as a directory name.
    :param base_type: Name of the directory directly below ``data_dir``.
    :return: Generator over the matching paths, in the order in which
        ``Path.glob`` produces them. The generator is empty if the
        service directory does not exist.
    """
    base_path = data_dir / base_type / service
    if not base_path.exists():
        # This function is a generator, so a returned list would never reach
        # a caller that iterates over it. A bare ``return`` ends the
        # (empty) iteration without suggesting otherwise.
        return
    yield from base_path.glob("*/*.jsonl.gz")

In [8]:
def paths_warc(service: str, base_type: str):
    """
    Yield the directories stored two levels below one service's directory.

    Candidates are looked up as ``<data_dir>/<base_type>/<service>/*/*``;
    only those that are directories are yielded.

    :param service: Name of the service; used as a directory name.
    :param base_type: Name of the directory directly below ``data_dir``.
    :return: Generator over the matching directories, in the order in which
        ``Path.glob`` produces them. The generator is empty if the
        service directory does not exist.
    """
    base_path = data_dir / base_type / service
    if not base_path.exists():
        # This function is a generator, so a returned list would never reach
        # a caller that iterates over it. A bare ``return`` ends the
        # (empty) iteration without suggesting otherwise.
        return
    for path in base_path.glob("*/*"):
        if path.is_dir():
            yield path

In [9]:
from gzip import GzipFile


def count_jsonl(path: Path) -> int:
    """
    Count the lines of a gzip-compressed JSONL file.

    Counting is best effort: if the file cannot be opened or read
    completely, the problem is reported on standard output and 0 is returned.

    :param path: Path of the gzip-compressed file.
    :return: Number of lines in the file, or 0 if reading failed.
    """
    print(f"Count JSONL records in {path}.")
    try:
        with GzipFile(path, "r") as file:
            return sum(1 for _ in file)
    except Exception as error:
        # Deliberately broad so that one unreadable file does not abort the
        # whole run, but not a bare ``except``: KeyboardInterrupt and
        # SystemExit must still propagate.
        print(f"Failed to count JSONL records in {path}: {error!r}")
        return 0

In [10]:
from json import loads, JSONDecodeError
from typing import Iterator

def read_jsonl(path: Path) -> Iterator:
    """
    Yield the parsed JSON value of every line of a gzip-compressed JSONL file.

    Lines that are not valid JSON are skipped. Reading is best effort: if the
    file cannot be opened or read further, the problem is reported on standard
    output and the iteration simply ends (records yielded so far are kept).

    :param path: Path of the gzip-compressed file.
    :return: Generator over the parsed records.
    """
    print(f"Read JSONL records in {path}.")
    try:
        with GzipFile(path, "r") as gzip_file:
            for line in gzip_file:
                try:
                    record = loads(line)
                except JSONDecodeError:
                    # Skip malformed lines but keep reading the file.
                    continue
                yield record
    except Exception as error:
        # Deliberately broad so that one unreadable file does not abort the
        # whole run, but not a bare ``except``: KeyboardInterrupt, SystemExit,
        # and GeneratorExit must still propagate.
        print(f"Failed to read JSONL records in {path}: {error!r}")
        return

In [11]:
# Read one service name per line, with surrounding whitespace removed.
with Path("services.txt").open("rt") as file:
    alexa_services = [
        name
        for name in map(str.strip, file)
        # Test the stripped text: a raw line still contains its newline and
        # is therefore never empty, so blank lines would otherwise end up as
        # empty service names.
        if name
    ]
len(alexa_services)

1087

In [12]:
from operator import add

# num_search_results_per_serp = sc.parallelize(alexa_services, 1000)\
#     .flatMap(lambda service: paths_jsonl(service, "archived-parsed-serps"))\
#     .flatMap(read_jsonl)\
#     .filter(lambda serp: serp["results"] is not None)\
#     .map(lambda serp: len(serp["results"]))\
#     .mean()
# num_search_results_per_serp

In [13]:
from operator import add

# num_interpreted_query = sc.parallelize(alexa_services, 1000)\
#     .flatMap(lambda service: paths_jsonl(service, "archived-parsed-serps"))\
#     .flatMap(read_jsonl)\
#     .filter(lambda serp: serp["interpreted_query"] is not None)\
#     .count()
# num_interpreted_query

In [14]:
!pip install ir-datasets



In [15]:
# TREC tracks considered in the overlap analysis.
# Maps each track name to a mapping from year to the ir_datasets ID that is
# passed to `ir_datasets.load` in order to read that edition's queries.
trec_tasks = {
    "Robust": {
        2004: "disks45/nocr/trec-robust-2004",
        2005: "aquaint/trec-robust-2005",
    },
    "Terabyte": {
        2004: "gov2/trec-tb-2004",
        2005: "gov2/trec-tb-2005",
        2006: "gov2/trec-tb-2006",
    },
    "MQ": {
        2007: "gov2/trec-mq-2007",
        2008: "gov2/trec-mq-2008",
        2009: "clueweb09/trec-mq-2009",
    },
    "Web": {
        2002: "gov/trec-web-2002",
        2003: "gov/trec-web-2003",
        2004: "gov/trec-web-2004",
        2009: "clueweb09/catb/trec-web-2009",
        2010: "clueweb09/catb/trec-web-2010",
        2011: "clueweb09/catb/trec-web-2011",
        2012: "clueweb09/catb/trec-web-2012",
        2013: "clueweb12/trec-web-2013",
        2014: "clueweb12/trec-web-2014",
    },
    "Deep Learning": {
        2019: "msmarco-document-v2/trec-dl-2019",
        2020: "msmarco-document-v2/trec-dl-2020",
        2021: "msmarco-document-v2/trec-dl-2021",
        2022: "msmarco-passage-v2/trec-dl-2022",
    },
}

In [16]:
from ir_datasets import load

def query_titles(irdsid: str) -> list:
    """
    Load the lowercased query strings of an ir_datasets dataset.

    The text of a query is taken from the first attribute that the query
    object has, checked in this order: ``query``, ``title``, ``text``.

    :param irdsid: Dataset ID that is passed to ``load``.
    :return: One lowercased string per query, in iteration order.
    """
    titles = []
    for query in load(irdsid).queries_iter():
        if hasattr(query, "query"):
            raw_text = query.query
        elif hasattr(query, "title"):
            raw_text = query.title
        else:
            raw_text = query.text
        titles.append(raw_text.lower())
    return titles

In [17]:
# query_titles(trec_tasks["Web"][2002])

In [18]:
# Flat list of all dataset IDs, track by track and year by year.
irdsids = [
    dataset_id
    for datasets_by_year in trec_tasks.values()
    for dataset_id in datasets_by_year.values()
]

In [19]:
# Lowercased queries of every dataset (duplicates within a dataset are kept).
irds_queries = dict(
    (dataset_id, query_titles(dataset_id))
    for dataset_id in irdsids
)

In [20]:
# The same queries per dataset, with duplicates removed.
irds_unique_queries = {
    dataset_id: set(dataset_queries)
    for dataset_id, dataset_queries in irds_queries.items()
}

In [21]:
# Union of the unique queries of all datasets.
irds_all_unique_queries = set().union(*irds_unique_queries.values())

In [23]:
# Collect the distinct lowercased queries from the "archived-query-urls"
# records of all services that also occur among the TREC queries.
irds_all_matched_queries = set(
    sc.parallelize(alexa_services, 1000)
    .flatMap(lambda service_name: paths_jsonl(service_name, "archived-query-urls"))
    .repartition(1000)
    .flatMap(read_jsonl)
    .map(lambda record: record["query"].lower())
    .filter(lambda query_text: query_text in irds_all_unique_queries)
    .distinct()
    .collect()
)

In [24]:
# Per dataset: the unique queries that are not among the matched queries.
irds_non_mathed_queries = {
    dataset_id: unique_queries.difference(irds_all_matched_queries)
    for dataset_id, unique_queries in irds_unique_queries.items()
}

In [25]:
# Per dataset: fraction of its unique queries that were matched.
# The non-matched queries are counted over the set of unique queries, so the
# denominator must be the number of unique queries as well. Dividing by the
# length of the raw query list would count every duplicate query as matched.
# A dataset without queries gets an overlap of 0.0 rather than a division error.
irds_overlap = {
    irdsid: (
        1 - (len(nmqs) / len(irds_unique_queries[irdsid]))
        if irds_unique_queries[irdsid]
        else 0.0
    )
    for irdsid, nmqs in irds_non_mathed_queries.items()
}

In [26]:
# All years from the earliest to the latest edition listed in `trec_tasks`,
# both ends included (years without any edition are part of the range too).
first_year = min(min(years) for years in trec_tasks.values())
last_year = max(max(years) for years in trec_tasks.values())
year_range = list(range(first_year, last_year + 1))
year_range

[2002,
 2003,
 2004,
 2005,
 2006,
 2007,
 2008,
 2009,
 2010,
 2011,
 2012,
 2013,
 2014,
 2015,
 2016,
 2017,
 2018,
 2019,
 2020,
 2021,
 2022]

In [27]:
# Re-key the per-dataset overlaps by track name and year.
trec_overlaps = {
    track_name: {
        edition_year: irds_overlap[dataset_id]
        for edition_year, dataset_id in datasets_by_year.items()
    }
    for track_name, datasets_by_year in trec_tasks.items()
}
trec_overlaps

{'Robust': {2004: 0.32799999999999996, 2005: 0.54},
 'Terabyte': {2004: 0.19999999999999996,
  2005: 0.26,
  2006: 0.33999999999999997},
 'MQ': {2007: 0.07420000000000004, 2008: 0.0988, 2009: 0.00017500000000003624},
 'Web': {2002: 0.16000000000000003,
  2003: 0.72,
  2004: 0.31999999999999995,
  2009: 0.72,
  2010: 0.74,
  2011: 0.30000000000000004,
  2012: 0.6799999999999999,
  2013: 0.5800000000000001,
  2014: 0.56},
 'Deep Learning': {2019: 0.0050000000000000044,
  2020: 0.0050000000000000044,
  2021: 0.002096436058700246,
  2022: 0.02400000000000002}}

In [33]:
# Print the overlaps as the rows of a LaTeX table.
# Header row: one bold column title per track.
header_cells = [fr"\textbf{{{track}}}" for track in trec_overlaps]
print(fr"\textbf{{Track}} & {' & '.join(header_cells)} \\")
print(r"\midrule")

# One row per year; a track without an edition in that year gets a dash.
for year in year_range:
    row_cells = []
    for year_overlaps in trec_overlaps.values():
        if year in year_overlaps:
            row_cells.append(fr"{(year_overlaps[year] * 100):.0f}\,\%")
        else:
            row_cells.append("--")
    print(fr"{year} & {' & '.join(row_cells)} \\")

\textbf{Track} & \textbf{Robust} & \textbf{Terabyte} & \textbf{MQ} & \textbf{Web} & \textbf{Deep Learning} \\
\midrule
2002 & -- & -- & -- & 16\,\% & -- \\
2003 & -- & -- & -- & 72\,\% & -- \\
2004 & 33\,\% & 20\,\% & -- & 32\,\% & -- \\
2005 & 54\,\% & 26\,\% & -- & -- & -- \\
2006 & -- & 34\,\% & -- & -- & -- \\
2007 & -- & -- & 7\,\% & -- & -- \\
2008 & -- & -- & 10\,\% & -- & -- \\
2009 & -- & -- & 0\,\% & 72\,\% & -- \\
2010 & -- & -- & -- & 74\,\% & -- \\
2011 & -- & -- & -- & 30\,\% & -- \\
2012 & -- & -- & -- & 68\,\% & -- \\
2013 & -- & -- & -- & 58\,\% & -- \\
2014 & -- & -- & -- & 56\,\% & -- \\
2015 & -- & -- & -- & -- & -- \\
2016 & -- & -- & -- & -- & -- \\
2017 & -- & -- & -- & -- & -- \\
2018 & -- & -- & -- & -- & -- \\
2019 & -- & -- & -- & -- & 1\,\% \\
2020 & -- & -- & -- & -- & 1\,\% \\
2021 & -- & -- & -- & -- & 0\,\% \\
2022 & -- & -- & -- & -- & 2\,\% \\
