In [62]:
from csv import reader
from datetime import datetime
from datetime import timezone
from gzip import GzipFile
from io import TextIOWrapper
from json import dumps
from json import loads
from pathlib import Path
from typing import Optional
from uuid import UUID

from pyspark import SparkConf, SparkContext

In [1]:
# Run on YARN with three executor instances; no other Spark setting is overridden here.
conf = SparkConf().setAll([("spark.executor.instances", 3)])
sc = SparkContext(master="yarn", appName="web-archive-ql-join", conf=conf)
sc

In [63]:
# Root directory of the query log data. Paths stored in the index files are
# resolved against it, and the locations written to the corpus are reported
# relative to it.
global_data_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/")
global_data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log')

In [64]:
# The "focused" subdirectory holds the per-stage folders (archived-urls, archived-parsed-serps, ...).
data_dir = global_data_dir.joinpath("focused")
data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused')

In [56]:
# One entry per archived-URLs file, expressed relative to the "archived-urls"
# folder and with the ".jsonl.gz" extension stripped, so the same relative path
# can be re-used to address the matching files of the other stages.
relative_paths = [
    archive_file.relative_to(data_dir / "archived-urls").with_name(
        archive_file.name[: -len(".jsonl.gz")]
    )
    for archive_file in data_dir.glob("archived-urls/*/*/*.jsonl.gz")
]
len(relative_paths)

59403

In [117]:
def process_snippet(
    archived_search_result_snippet_index: dict,
    archived_raw_search_result_index: dict,
    archived_parsed_search_result_index: dict,
    archived_search_result_snippet: dict,
    archived_search_result_snippet_id: UUID,
) -> Optional[dict]:
    """Build the partial corpus document for one archived search result snippet.

    The snippet record is read from the location that
    ``archived_search_result_snippet_index`` stores for the given ID.

    Args:
        archived_search_result_snippet_index: Maps a snippet ID to
            ``(gzip file path, byte offset, index)``.
        archived_raw_search_result_index: Maps a snippet ID to
            ``(path, byte offset)`` of the raw search result, if any.
        archived_parsed_search_result_index: Maps a snippet ID to
            ``(path, byte offset)`` of the parsed search result, if any.
        archived_search_result_snippet: Not read by this function; the record
            is always re-loaded from the snippet index. Kept so that the
            signature stays unchanged.
        archived_search_result_snippet_id: ID used for all three index lookups.

    Returns:
        The document dict, or ``None`` when the ID is missing from the snippet
        index.
    """
    snippet_id = archived_search_result_snippet_id
    print(f"Process archived search result snippet ID {snippet_id}.")

    snippet_location = archived_search_result_snippet_index.get(snippet_id)
    if snippet_location is None:
        print(f"Could not find archived search result snippet ID {snippet_id}.")
        return None
    raw_result_location = archived_raw_search_result_index.get(snippet_id)
    parsed_result_location = archived_parsed_search_result_index.get(snippet_id)

    # NOTE(review): only the path and byte offset are used to read the record;
    # the third index field is merely copied into the output. Confirm that the
    # line at this offset is the snippet record itself.
    with GzipFile(snippet_location[0], "rb") as gzip_file:
        # GzipFile.seek positions within the decompressed stream.
        gzip_file.seek(snippet_location[1])
        # Decode explicitly as UTF-8 rather than the locale's default encoding,
        # which can differ between the driver and the executors.
        with TextIOWrapper(gzip_file, encoding="utf-8") as text_file:
            snippet = loads(text_file.readline())

    # Wayback Machine timestamps are UTC; without an explicit tz the POSIX
    # timestamp would be formatted in the machine's local time zone.
    wayback_timestamp = datetime.fromtimestamp(
        snippet["timestamp"], tz=timezone.utc
    ).strftime("%Y%m%d%H%M%S")
    wayback_url = f"https://web.archive.org/web/{wayback_timestamp}/{snippet['url']}"
    wayback_raw_url = f"https://web.archive.org/web/{wayback_timestamp}id_/{snippet['url']}"

    def relative_location(location):
        # Optional (path, byte offset) locations become None when absent.
        if location is None:
            return None
        return {
            "relative_path": str(location[0].relative_to(global_data_dir)),
            "byte_offset": location[1],
        }

    document = {
        "id": str(snippet_id),
        "url": snippet["url"],
        "timestamp": snippet["timestamp"],
        "wayback_url": wayback_url,
        "wayback_raw_url": wayback_raw_url,
        "snippet_rank": snippet["rank"],
        "snippet_title": snippet["title"],
        "snippet_text": snippet["snippet"],
        # The snippet location is known to exist here (early return above).
        "archived_snippet_location": {
            "relative_path": str(snippet_location[0].relative_to(global_data_dir)),
            "byte_offset": snippet_location[1],
            "index": snippet_location[2],
        },
        "archived_raw_search_result_location": relative_location(raw_result_location),
        "archived_parsed_search_result_location": relative_location(parsed_result_location),
    }
    print(f"Finished processing archived search result snippet ID {snippet_id}.")
    return document

In [118]:
def _read_json_line(location: tuple) -> dict:
    """Parse the JSON line starting at offset ``location[1]`` of the gzip file ``location[0]``."""
    with GzipFile(location[0], "rb") as gzip_file:
        # GzipFile.seek positions within the decompressed stream.
        gzip_file.seek(location[1])
        # Decode explicitly as UTF-8 rather than the locale's default encoding.
        with TextIOWrapper(gzip_file, encoding="utf-8") as text_file:
            return loads(text_file.readline())


def _relative_location(location: Optional[tuple]) -> Optional[dict]:
    """Describe an index location relative to ``global_data_dir``; ``None`` stays ``None``."""
    if location is None:
        return None
    return {
        "relative_path": str(location[0].relative_to(global_data_dir)),
        "byte_offset": location[1],
    }


def process_url(
    service: str,
    archived_urls_index: dict,
    archived_query_urls_index: dict,
    archived_raw_serps_index: dict,
    archived_parsed_serps_index: dict,
    archived_search_result_snippet_index: dict,
    archived_raw_search_result_index: dict,
    archived_parsed_search_result_index: dict,
    archived_url_id: UUID,
) -> Optional[tuple]:
    """Join all records of one archived URL into a corpus query and its documents.

    Args:
        service: Service name stored on the query and on every document.
        archived_urls_index: Maps an archived URL ID to ``(path, byte offset)``.
        archived_query_urls_index: Same shape, for the parsed query URL.
        archived_raw_serps_index: Same shape, for the raw SERP (location only).
        archived_parsed_serps_index: Same shape, for the parsed SERP.
        archived_search_result_snippet_index: Passed through to ``process_snippet``.
        archived_raw_search_result_index: Passed through to ``process_snippet``.
        archived_parsed_search_result_index: Passed through to ``process_snippet``.
        archived_url_id: ID looked up in the four URL/SERP indexes.

    Returns:
        ``(query, documents)``, or ``None`` when the ID is missing from
        ``archived_urls_index``. ``documents`` (and ``query["results"]``) is
        ``None`` when no parsed SERP exists for the ID; otherwise it is a list
        with one entry per snippet that ``process_snippet`` could resolve.
    """
    print(f"Process archived URL ID {archived_url_id}.")

    archived_url_location = archived_urls_index.get(archived_url_id)
    archived_query_url_location = archived_query_urls_index.get(archived_url_id)
    archived_raw_serp_location = archived_raw_serps_index.get(archived_url_id)
    archived_parsed_serp_location = archived_parsed_serps_index.get(archived_url_id)

    if archived_url_location is None:
        print(f"Could not find archived URL ID {archived_url_id}.")
        return None

    archived_url = _read_json_line(archived_url_location)
    archived_query_url = (
        _read_json_line(archived_query_url_location)
        if archived_query_url_location is not None
        else None
    )
    archived_parsed_serp = (
        _read_json_line(archived_parsed_serp_location)
        if archived_parsed_serp_location is not None
        else None
    )

    # Wayback Machine timestamps are UTC; without an explicit tz the POSIX
    # timestamp would be formatted in the machine's local time zone.
    wayback_timestamp = datetime.fromtimestamp(
        archived_url["timestamp"], tz=timezone.utc
    ).strftime("%Y%m%d%H%M%S")
    wayback_url = f"https://web.archive.org/web/{wayback_timestamp}/{archived_url['url']}"
    wayback_raw_url = f"https://web.archive.org/web/{wayback_timestamp}id_/{archived_url['url']}"

    partial_documents = None
    if archived_parsed_serp is not None:
        partial_documents = []
        for snippet in archived_parsed_serp["results"]:
            partial_document = process_snippet(
                archived_search_result_snippet_index,
                archived_raw_search_result_index,
                archived_parsed_search_result_index,
                snippet,
                # process_snippet requires the snippet ID as its fifth argument;
                # omitting it raised a TypeError.
                # NOTE(review): assumes every entry of "results" stores its
                # UUID under the "id" key — confirm against the parsed-SERP schema.
                UUID(str(snippet["id"])),
            )
            # process_snippet returns None for snippets missing from its index;
            # such entries cannot be expanded into documents below.
            if partial_document is not None:
                partial_documents.append(partial_document)

    partial_query = {
        "id": str(archived_url_id),
        "url": archived_url["url"],
        "timestamp": archived_url["timestamp"],
        "wayback_url": wayback_url,
        "wayback_raw_url": wayback_raw_url,
        "url_query": archived_query_url["query"] if archived_query_url is not None else None,
        "url_page": archived_query_url["page"] if archived_query_url is not None else None,
        "url_offset": archived_query_url["offset"] if archived_query_url is not None else None,
        "serp_query": archived_parsed_serp["interpreted_query"] if archived_parsed_serp is not None else None,
        "archived_url_location": _relative_location(archived_url_location),
        "archived_query_url_location": _relative_location(archived_query_url_location),
        "archived_raw_serp_location": _relative_location(archived_raw_serp_location),
        "archived_parsed_serp_location": _relative_location(archived_parsed_serp_location),
    }

    # The query embeds its results; each document embeds the query without
    # results, so the two structures do not nest recursively.
    query = {
        **partial_query,
        "service": service,
        "results": partial_documents,
    }
    documents = None
    if partial_documents is not None:
        documents = [
            {
                **partial_document,
                "service": service,
                "query": partial_query,
            }
            for partial_document in partial_documents
        ]
    print(f"Finished processing archived URL ID {archived_url_id}.")
    return query, documents

In [119]:
def _load_index(index_path: Path, with_result_index: bool = False) -> dict:
    """Load a CSV index file into ``{UUID: (absolute path, byte offset[, index])}``.

    A missing index file yields an empty dict. When ``with_result_index`` is
    true, the fourth CSV column is parsed as an additional integer.
    """
    if not index_path.exists():
        return {}
    index = {}
    # newline="" is what the csv module documents for files passed to reader().
    with index_path.open("rt", newline="") as index_file:
        for row in reader(index_file):
            location = (global_data_dir / row[1], int(row[2]))
            if with_result_index:
                location = (*location, int(row[3]))
            index[UUID(row[0])] = location
    return index


def process_relative_path(path: Path):
    """Build the corpus queries and documents for one relative archive path.

    Loads the seven index files that belong to ``path`` and calls
    ``process_url`` for every ID of the parsed-SERPs index.

    Args:
        path: Path relative to the per-stage folders below ``data_dir``; its
            first component is used as the service name.

    Returns:
        ``(corpus_queries, corpus_documents)``: the list of query dicts and the
        flattened list of document dicts of all queries.
    """
    print(f"Process relative path {path}.")

    service = path.parts[0]

    # The raw stages keep their index as a ".index" entry below `path`; all
    # other stages use a sibling file named after `path`.
    archived_urls_index = _load_index(
        data_dir / "archived-urls" / path.with_suffix(".index"))
    archived_query_urls_index = _load_index(
        data_dir / "archived-query-urls" / path.with_suffix(".index"))
    archived_raw_serps_index = _load_index(
        data_dir / "archived-raw-serps" / path / ".index")
    archived_parsed_serps_index = _load_index(
        data_dir / "archived-parsed-serps" / path.with_suffix(".index"))
    archived_search_result_snippets_index = _load_index(
        data_dir / "archived-parsed-serps" / path.with_suffix(".snippets.index"),
        with_result_index=True,
    )
    archived_raw_search_results_index = _load_index(
        data_dir / "archived-raw-search-results" / path / ".index")
    archived_parsed_search_results_index = _load_index(
        data_dir / "archived-parsed-search-results" / path.with_suffix(".index"))

    corpus = []
    for archived_id in archived_parsed_serps_index.keys():
        processed = process_url(
            service,
            archived_urls_index,
            archived_query_urls_index,
            archived_raw_serps_index,
            archived_parsed_serps_index,
            archived_search_result_snippets_index,
            archived_raw_search_results_index,
            archived_parsed_search_results_index,
            archived_id,
        )
        # process_url returns None when the ID is absent from the archived-URLs
        # index; unpacking such a result below would raise a TypeError.
        if processed is not None:
            corpus.append(processed)

    corpus_queries = [corpus_query for corpus_query, _ in corpus]
    corpus_documents = [
        corpus_document
        for _, corpus_query_documents in corpus
        # Queries without a parsed SERP carry no document list.
        for corpus_document in (
            corpus_query_documents if corpus_query_documents is not None else []
        )
    ]
    print(f"Finished processing relative path {path}.")
    return corpus_queries, corpus_documents

In [120]:
def process_relative_path_queries(path: Path):
    """Generate the corpus queries of ``path``, one at a time.

    Runs ``process_relative_path`` and discards its documents; the closing log
    line is printed only once the generator has been consumed completely.
    """
    print(f"Process corpus queries for path {path}.")
    queries, _unused_documents = process_relative_path(path)
    for query in queries:
        yield query
    print(f"Finished processing corpus queries for path {path}.")

In [121]:
def process_relative_path_documents(path: Path):
    """Generate the corpus documents of ``path``, one at a time.

    Runs ``process_relative_path`` and discards its queries; the closing log
    line is printed only once the generator has been consumed completely.
    """
    print(f"Process corpus documents for path {path}.")
    _unused_queries, documents = process_relative_path(path)
    for document in documents:
        yield document
    print(f"Finished processing corpus documents for path {path}.")

In [122]:
# Inspect the default HDFS directory (no path argument is given).
!hdfs dfs -ls

Found 3 items
drwxr-xr-x   - ajjxp ajjxp          0 2023-02-06 15:29 .sparkStaging
drwxr-xr-x   - ajjxp ajjxp          0 2022-08-22 20:34 ccqa
drwxr-xr-x   - ajjxp ajjxp          0 2023-02-06 20:30 web-archive-query-log


In [131]:
# Destructive: recursively deletes the HDFS directory that the two save cells
# below write into (web-archive-query-log/queries/ and .../documents/).
# NOTE(review): presumably needed because saveAsTextFile refuses to write into
# an existing output directory — confirm before re-running.
!hdfs dfs -rm -r web-archive-query-log

Deleted web-archive-query-log


In [None]:
# Build all corpus queries in parallel, one task input per relative path.
# saveAsTextFile writes str(element), which for a dict is a Python repr and
# not valid JSON, so every query is serialized with json.dumps first to get
# one JSON object per output line.
rdd = (
    sc.parallelize(relative_paths)
    .flatMap(process_relative_path_queries)
    .map(dumps)
)
rdd.repartition(100).saveAsTextFile("web-archive-query-log/queries/", compressionCodecClass='org.apache.hadoop.io.compress.GzipCodec')

In [None]:
# Build all corpus documents in parallel, one task input per relative path.
# saveAsTextFile writes str(element), which for a dict is a Python repr and
# not valid JSON, so every document is serialized with json.dumps first to get
# one JSON object per output line.
rdd = (
    sc.parallelize(relative_paths)
    .flatMap(process_relative_path_documents)
    .map(dumps)
)
rdd.repartition(100).saveAsTextFile("web-archive-query-log/documents/", compressionCodecClass='org.apache.hadoop.io.compress.GzipCodec')