In [22]:
from os import environ

from pyspark.sql import SparkSession

# Python interpreter that PySpark should use; it must be set before the session is created.
environ['PYSPARK_PYTHON'] = "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"

# Create (or reuse) the Spark session on YARN with three executors.
session = (
    SparkSession.builder
    .master("yarn")
    .appName("web-archive-query-log-stats")
    .config("spark.executor.instances", 3)
    .getOrCreate()
)

In [23]:
# Keep a handle on the session's SparkContext; the RDD jobs in the later cells start from it.
sc = session.sparkContext
# Display the context as the cell output.
sc

In [24]:
from pathlib import Path

# Absolute base directory of the project data; `data_dir` is derived from it in the next cell.
global_data_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/")
# Display the resolved path as the cell output.
global_data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log')

In [25]:
# The path helpers below look up `<data_dir>/<base_type>/<service>`, i.e. everything is read from
# the "focused" subdirectory of the base directory.
data_dir = global_data_dir / "focused"
# Display the resolved path as the cell output.
data_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused')

In [26]:
# Load the names of the services to analyse, one name per line of `services.txt`.
with Path("services.txt").open("rt") as file:
    alexa_services = [
        name
        for name in (line.strip() for line in file)
        # Lines read from a file still carry their newline and are therefore always truthy,
        # so the emptiness check has to be done on the stripped name. Otherwise blank lines
        # would end up as '' entries in the list.
        if name
    ]
# Uncomment to restrict the run to the first service (e.g. while debugging):
# alexa_services = alexa_services[:1]
alexa_services

['google',
 'youtube',
 'baidu',
 'qq',
 'facebook',
 'tmall',
 'taobao',
 'yahoo',
 'amazon',
 'wikipedia',
 'jd',
 '360',
 'weibo',
 'reddit',
 'netflix',
 'instagram',
 'vk',
 'microsoft',
 'csdn',
 'bing',
 'twitter',
 'twitch',
 'zoom',
 'ebay',
 'naver',
 'aliexpress',
 'yandex',
 'linkedin',
 'bongacams',
 'apple',
 'pornhub',
 'mail',
 'stackoverflow',
 'msn',
 'tribunnews',
 'imdb',
 'livejasmin',
 'chaturbate',
 'dropbox',
 'ok',
 'xvideos',
 'github',
 'cnn',
 'etsy',
 'xhamster',
 'sogou',
 'canva',
 'tumblr',
 'espn',
 'instructure',
 'indeed',
 'roblox',
 'imgur',
 'flipkart',
 'fandom',
 'bbc',
 'detik',
 'booking',
 'cnblogs',
 'walmart',
 'alibaba',
 'freepik',
 'zhihu',
 'nih',
 'force',
 'salesforce',
 'stackexchange',
 'daum',
 'udemy',
 'onlinesbi',
 'savefrom',
 'indiatimes',
 'theguardian',
 'craigslist',
 'avito',
 'grid',
 'duckduckgo',
 'aliyun',
 'tiktok',
 'primevideo',
 'shutterstock',
 'speedtest',
 'mediafire',
 'xnxx',
 'gome',
 'w3schools',
 'researchga

In [27]:
def paths_jsonl(service: str, base_type: str):
    """
    Lazily yield the gzipped JSONL files found for one service.

    Files are looked up as ``<data_dir>/<base_type>/<service>/*/*.jsonl.gz``.
    This is a generator function: when the service directory does not exist,
    the generator is simply empty.

    :param service: Name of the service directory below ``base_type``.
    :param base_type: Name of the data subdirectory, e.g. ``"archived-urls"``.
    :return: Generator over the matching paths.
    """
    base_path = data_dir / base_type / service
    # Guard instead of `return []`: a value returned from a generator function
    # never reaches the code iterating over it, so the list was misleading.
    if base_path.exists():
        yield from base_path.glob("*/*.jsonl.gz")

In [28]:
def paths_warc(service: str, base_type: str):
    """
    Lazily yield the directories found two levels below one service directory.

    Candidates are looked up as ``<data_dir>/<base_type>/<service>/*/*`` and
    only entries that are directories are yielded. This is a generator
    function: when the service directory does not exist, the generator is
    simply empty.

    :param service: Name of the service directory below ``base_type``.
    :param base_type: Name of the data subdirectory, e.g. ``"archived-raw-serps"``.
    :return: Generator over the matching directory paths.
    """
    base_path = data_dir / base_type / service
    # Guard instead of `return []`: a value returned from a generator function
    # never reaches the code iterating over it, so the list was misleading.
    if base_path.exists():
        for path in base_path.glob("*/*"):
            if path.is_dir():
                yield path

In [29]:
from gzip import GzipFile


def count_jsonl(path: Path) -> int:
    """
    Count the lines of a gzip-compressed JSONL file.

    Counting is best effort: a file that is missing, unreadable, not valid
    gzip, or truncated is reported on stdout and counted as 0, so that a
    single broken file does not abort the whole job. Other exceptions
    (including ``KeyboardInterrupt``) are no longer swallowed.

    :param path: Path of the ``.jsonl.gz`` file.
    :return: Number of lines in the decompressed file, or 0 on read errors.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from zlib import error as ZlibError

    print(f"Count JSONL records in {path}.")
    try:
        with GzipFile(path, "r") as file:
            return sum(1 for _ in file)
    # OSError: missing/unreadable file or bad gzip header/checksum;
    # EOFError: truncated stream; zlib.error: corrupt compressed data.
    except (OSError, EOFError, ZlibError) as error:
        print(f"Could not count JSONL records in {path}: {error!r}")
        return 0

In [30]:
from json import loads, JSONDecodeError
from typing import Iterator

def read_jsonl(path: Path) -> Iterator:
    """
    Lazily yield the parsed records of a gzip-compressed JSONL file.

    Reading is best effort:

    - Lines that are not valid JSON, or not decodable as text, are skipped.
    - If the file is missing, unreadable, not valid gzip, or truncated, the
      problem is reported on stdout and the generator ends. Records yielded
      before the error are kept.

    Other exceptions (including ``KeyboardInterrupt``) are no longer swallowed.

    :param path: Path of the ``.jsonl.gz`` file.
    :return: Generator over the decoded JSON values, one per valid line.
    """
    # Imported locally so the function does not depend on another cell's imports.
    from zlib import error as ZlibError

    print(f"Read JSONL records in {path}.")
    try:
        with GzipFile(path, "r") as gzip_file:
            for line in gzip_file:
                try:
                    url = loads(line)
                # `loads` decodes the raw bytes itself, so invalid UTF-8 raises
                # UnicodeDecodeError rather than JSONDecodeError. Treat both as
                # a malformed line instead of aborting the rest of the file.
                except (JSONDecodeError, UnicodeDecodeError):
                    continue
                yield url
    # OSError: missing/unreadable file or bad gzip header/checksum;
    # EOFError: truncated stream; zlib.error: corrupt compressed data.
    except (OSError, EOFError, ZlibError) as error:
        print(f"Could not read JSONL records in {path}: {error!r}")
        return

In [31]:
def count_warc(path: Path) -> int:
    """
    Count the lines of the ``.lock`` file inside the given directory.

    NOTE(review): presumably the ``.lock`` file holds one line per WARC
    record written to this directory -- confirm against the code that
    writes it.

    :param path: Directory that contains the ``.lock`` file.
    :return: Number of lines in ``<path>/.lock``.
    """
    print(f"Count WARC records in {path}.")
    num_lines = 0
    with (path / ".lock").open("rt") as lock_file:
        for _ in lock_file:
            num_lines += 1
    return num_lines

In [32]:
def identity(x):
    """Return the given argument unchanged."""
    unchanged = x
    return unchanged

In [33]:
from operator import add

# Number of lines over all "archived-urls" JSONL files, per service.
url_counts = (
    sc.parallelize(alexa_services, 1000)
    # Key every service by its own name so the key survives the value transformations.
    .map(lambda name: (name, name))
    # Replace the value by each of the service's JSONL file paths.
    .flatMapValues(lambda name: paths_jsonl(name, "archived-urls"))
    # Redistribute the (service, path) pairs over 10,000 partitions.
    .repartition(10_000)
    .mapValues(count_jsonl)
    # Sum the per-file counts for each service.
    .reduceByKey(add)
    .collectAsMap()
)
url_counts

{'watanserb': 753336,
 'manga9': 3138,
 'zimbio': 29797,
 'xvideos': 29710376,
 'worldbank': 1446,
 'tiktok': 11652,
 'club-k': 174348,
 'irrawaddy': 14400183,
 'shopzilla': 7890006,
 'tokopedia': 21286,
 'alibaba': 63870,
 'cp': 26,
 'nifty': 0,
 '9anime': 5610,
 'dygang': 21868,
 'sarvgyan': 29881,
 '100ppi': 15596,
 'jra': 266,
 'voachinese': 26431,
 '51pla': 233,
 '1001jogos': 5918,
 'buedemusica': 75595,
 'sogou': 3374145,
 'base-search': 17437,
 'smashingmagazine': 4391,
 'zaful': 601,
 'dailypost': 3642357,
 'aliyun': 0,
 'sergey-mavrodi': 3,
 'perezhilton': 21958275,
 'znzmo': 142607,
 'justindianporn': 9784,
 'belgium': 440,
 'elfagr': 558,
 'zk': 0,
 '9to5mac': 2769211,
 'in': 24509,
 'podbean': 441,
 'suite101': 8042,
 'ecrater': 45825,
 'vietxx': 531,
 'globalspec': 453,
 'fatosdesconhecidos': 258048,
 'warcraftlogs': 143,
 'actblue': 119254,
 'gib': 125,
 'milenio': 451,
 'usmagazine': 0,
 'auchan': 80633,
 'wa': 45,
 'iconfinder': 975242,
 'eztv': 901,
 'balenciaga': 4386

In [34]:
# Number of lines over all "archived-query-urls" JSONL files, per service.
query_url_counts = (
    sc.parallelize(alexa_services, 1000)
    # Key every service by its own name so the key survives the value transformations.
    .map(lambda name: (name, name))
    # Replace the value by each of the service's JSONL file paths.
    .flatMapValues(lambda name: paths_jsonl(name, "archived-query-urls"))
    # Redistribute the (service, path) pairs over 10,000 partitions.
    .repartition(10_000)
    .mapValues(count_jsonl)
    # Sum the per-file counts for each service.
    .reduceByKey(add)
    .collectAsMap()
)
query_url_counts

{'watanserb': 445,
 'manga9': 70,
 'zimbio': 18812,
 'xvideos': 1756637,
 'worldbank': 391,
 'tiktok': 10611,
 'club-k': 6,
 'shopzilla': 11405,
 'irrawaddy': 100012,
 'tokopedia': 18226,
 'alibaba': 34079,
 'cp': 2,
 'nifty': 0,
 '9anime': 712,
 'dygang': 2314,
 'sarvgyan': 5,
 'jra': 11,
 '100ppi': 0,
 'voachinese': 0,
 '51pla': 231,
 '1001jogos': 7,
 'buedemusica': 5,
 'sogou': 2241105,
 'base-search': 13912,
 'zaful': 582,
 'smashingmagazine': 2790,
 'dailypost': 90,
 'aliyun': 0,
 'sergey-mavrodi': 0,
 'perezhilton': 2496,
 'znzmo': 135,
 'justindianporn': 3,
 'belgium': 20,
 'elfagr': 308,
 'zk': 0,
 '9to5mac': 1798,
 'in': 0,
 'suite101': 7241,
 'podbean': 246,
 'ecrater': 102,
 'vietxx': 0,
 'globalspec': 396,
 'fatosdesconhecidos': 60,
 'warcraftlogs': 142,
 'gib': 125,
 'actblue': 53,
 'milenio': 0,
 'auchan': 5013,
 'usmagazine': 0,
 'wa': 10,
 'iconfinder': 4715,
 'balenciaga': 43,
 'eztv': 7,
 'colorado': 14,
 'mql5': 0,
 'miner': 0,
 '4shared': 0,
 'dontorrent': 72,
 'gam

In [35]:
# Number of distinct "query" values in the "archived-query-urls" records, per service.
unique_query_url_counts = (
    sc.parallelize(alexa_services, 1000)
    # Key every service by its own name so the key survives the value transformations.
    .map(lambda name: (name, name))
    # Replace the value by each of the service's JSONL file paths.
    .flatMapValues(lambda name: paths_jsonl(name, "archived-query-urls"))
    # Redistribute the (service, path) pairs over 10,000 partitions.
    .repartition(10_000)
    # Expand every file into its parsed records and keep only the query of each record.
    .flatMapValues(read_jsonl)
    .mapValues(lambda record: record["query"])
    # Deduplicate the (service, query) pairs, then count the remaining pairs per service.
    .distinct()
    .countByKey()
)
unique_query_url_counts

defaultdict(int,
            {'badjojo': 87420,
             'e621': 695970,
             'google': 19953592,
             'sogou': 972831,
             'vectorstock': 173710,
             'youtube': 11250179,
             'twitter': 3869382,
             'duckduckgo': 3279282,
             'baidu': 2900878,
             'pornhub': 187374,
             'xvideos': 433355,
             'tokopedia': 2765,
             'tigerdirect': 37132,
             'youm7': 1812,
             'glassdoor': 407245,
             'yahoo': 1232589,
             'livejasmin': 15047,
             'bing': 2253965,
             'stackoverflow': 1817671,
             'shutterstock': 271820,
             'investopedia': 2046,
             '9gag': 18173,
             'pornmd': 847331,
             'wikimedia': 621971,
             'kat': 377,
             'indeed': 134812,
             'berkeley': 135,
             'sourceforge': 91203,
             'stackexchange': 223216,
             'weibo': 1886458,
        

In [36]:
# Sum of `count_warc` over all "archived-raw-serps" directories, per service.
raw_serp_counts = (
    sc.parallelize(alexa_services, 1000)
    # Key every service by its own name so the key survives the value transformations.
    .map(lambda name: (name, name))
    # Replace the value by each of the service's WARC directories.
    .flatMapValues(lambda name: paths_warc(name, "archived-raw-serps"))
    # Redistribute the (service, path) pairs over 10,000 partitions.
    .repartition(10_000)
    .mapValues(count_warc)
    # Sum the per-directory counts for each service.
    .reduceByKey(add)
    .collectAsMap()
)
raw_serp_counts

{'watanserb': 338,
 'manga9': 70,
 'zimbio': 10660,
 'xvideos': 24209,
 'worldbank': 318,
 'tiktok': 6612,
 'club-k': 4,
 'shopzilla': 1994,
 'irrawaddy': 244,
 'tokopedia': 2743,
 'alibaba': 14224,
 'cp': 2,
 '9anime': 530,
 'dygang': 288,
 'sarvgyan': 2,
 'jra': 7,
 '51pla': 226,
 '1001jogos': 1,
 'buedemusica': 4,
 'sogou': 24585,
 'base-search': 6548,
 'smashingmagazine': 306,
 'zaful': 165,
 'dailypost': 3,
 'znzmo': 70,
 'perezhilton': 286,
 'justindianporn': 1,
 'belgium': 2,
 'elfagr': 154,
 '9to5mac': 593,
 'suite101': 2409,
 'podbean': 158,
 'ecrater': 39,
 'globalspec': 110,
 'fatosdesconhecidos': 9,
 'warcraftlogs': 117,
 'gib': 1,
 'actblue': 36,
 'auchan': 1101,
 'wa': 1,
 'iconfinder': 4158,
 'eztv': 7,
 'balenciaga': 21,
 'colorado': 12,
 'gamingwonderland': 457,
 'dontorrent': 33,
 'internshala': 211,
 'leit': 77,
 'turkiye': 374,
 'avito': 647,
 'razer': 42,
 'ew': 204,
 'xnxx': 23329,
 'excite': 147,
 'twitch': 11443,
 'anysex': 34130,
 'citeab': 4,
 'myfonts': 62,
 

In [37]:
# Number of lines over all "archived-parsed-serps" JSONL files, per service.
parsed_serp_counts = (
    sc.parallelize(alexa_services, 1000)
    # Key every service by its own name so the key survives the value transformations.
    .map(lambda name: (name, name))
    # Replace the value by each of the service's JSONL file paths.
    .flatMapValues(lambda name: paths_jsonl(name, "archived-parsed-serps"))
    # Redistribute the (service, path) pairs over 1,000 partitions.
    .repartition(1000)
    .mapValues(count_jsonl)
    # Sum the per-file counts for each service.
    .reduceByKey(add)
    .collectAsMap()
)
parsed_serp_counts

{'xvideos': 942,
 'alibaba': 0,
 'sogou': 23550,
 'turkiye': 0,
 'xnxx': 0,
 'yandex': 13161,
 'etsy': 15071,
 'imdb': 17454,
 'chinaz': 0,
 'globo': 0,
 'bing': 1115786,
 'ask': 0,
 'pornhub': 24202,
 'github': 23987,
 'google': 785814,
 '360': 30503,
 'slideshare': 0,
 'twitter': 9077,
 'reddit': 0,
 'weibo': 27828,
 'wikimedia': 0,
 'facebook': 553,
 'daum': 0,
 'youtube': 1184860,
 'nih': 0,
 'naver': 334490,
 'baidu': 492028,
 'aliexpress': 20692,
 'amazon': 102031,
 'chaturbate': 0,
 'freepik': 0,
 'ebay': 168937,
 'stackoverflow': 21237,
 '123rf': 0,
 'tribunnews': 0,
 'investopedia': 0,
 'stackexchange': 0,
 'fivethirtyeight': 0,
 'espn': 0,
 'qq': 29991,
 'duckduckgo': 0,
 'yahoo': 1021819,
 'tdameritrade': 0,
 'cnblogs': 0,
 'apple': 0,
 'booking': 0,
 'gome': 0,
 'detik': 0}

In [38]:
# Total number of entries in the "results" list of all parsed SERPs, per service.
search_result_snippet_counts = (
    sc.parallelize(alexa_services, 1000)
    # Key every service by its own name so the key survives the value transformations.
    .map(lambda service: (service, service))
    # Replace the value by each of the service's JSONL file paths.
    .flatMapValues(lambda service: paths_jsonl(service, "archived-parsed-serps"))
    # Redistribute the (service, path) pairs over 10,000 partitions.
    # (Stray LaTeX table markup that had been pasted onto this line, making the
    # cell a SyntaxError, was removed.)
    .repartition(10_000)
    # Expand every file into its parsed SERP records.
    .flatMapValues(read_jsonl)
    # A SERP without a "results" key contributes no snippets.
    .mapValues(lambda serp: len(serp["results"]) if "results" in serp else 0)
    # Sum the per-SERP result counts for each service.
    .reduceByKey(add)
    .collectAsMap()
)
search_result_snippet_counts

{'xvideos': 12980,
 'sogou': 225479,
 'yandex': 0,
 'etsy': 284660,
 'imdb': 361087,
 'bing': 5947107,
 'pornhub': 450734,
 'github': 93199,
 'google': 6265179,
 '360': 253457,
 'twitter': 118576,
 'weibo': 0,
 'facebook': 0,
 'youtube': 25233781,
 'naver': 3376741,
 'baidu': 1975625,
 'aliexpress': 165108,
 'amazon': 2659675,
 'ebay': 2002188,
 'stackoverflow': 179939,
 'qq': 0,
 'yahoo': 8677259}

In [39]:
# Sum of `count_warc` over all "archived-raw-search-results" directories, per service.
raw_search_result_counts = (
    sc.parallelize(alexa_services, 1000)
    # Key every service by its own name so the key survives the value transformations.
    .map(lambda name: (name, name))
    # Replace the value by each of the service's WARC directories.
    .flatMapValues(lambda name: paths_warc(name, "archived-raw-search-results"))
    # Redistribute the (service, path) pairs over 10,000 partitions.
    .repartition(10_000)
    .mapValues(count_warc)
    # Sum the per-directory counts for each service.
    .reduceByKey(add)
    .collectAsMap()
)
raw_search_result_counts

{}

In [40]:
# Number of lines over all "archived-parsed-search-results" JSONL files, per service.
parsed_search_result_counts = (
    sc.parallelize(alexa_services, 1000)
    # Key every service by its own name so the key survives the value transformations.
    .map(lambda name: (name, name))
    # Replace the value by each of the service's JSONL file paths.
    .flatMapValues(lambda name: paths_jsonl(name, "archived-parsed-search-results"))
    # Redistribute the (service, path) pairs over 10,000 partitions.
    .repartition(10_000)
    .mapValues(count_jsonl)
    # Sum the per-file counts for each service.
    .reduceByKey(add)
    .collectAsMap()
)
parsed_search_result_counts

{}

In [41]:
from concurrent.futures import ThreadPoolExecutor

from tqdm import tqdm

# Write one CSV row per service with all counts computed in the cells above.
with Path("numbers.csv").open("wt") as file:
    # Column name -> per-service counts; the order here defines the column order.
    count_columns = [
        ("urls", url_counts),
        ("query_urls", query_url_counts),
        ("unique_query_urls", unique_query_url_counts),
        ("raw_serps", raw_serp_counts),
        ("parsed_serps", parsed_serp_counts),
        ("search_result_snippets", search_result_snippet_counts),
        ("raw_search_results", raw_search_result_counts),
        ("parsed_search_results", parsed_search_result_counts),
    ]
    header = ["service"] + [column for column, _ in count_columns]
    file.write(",".join(header) + "\n")
    for service in alexa_services:
        # Services missing from a result are reported with a count of 0.
        row = [service] + [
            str(counts.get(service, 0))
            for _, counts in count_columns
        ]
        file.write(",".join(row) + "\n")

In [42]:
# Number of distinct "query" values in the "archived-query-urls" records across all services.
global_unique_query_urls = (
    sc.parallelize(alexa_services, 1000)
    # Expand every service into its JSONL file paths.
    .flatMap(lambda name: paths_jsonl(name, "archived-query-urls"))
    # Redistribute the paths over 10,000 partitions.
    .repartition(10_000)
    # Expand every file into its parsed records and keep only the query of each record.
    .flatMap(read_jsonl)
    .map(lambda record: record["query"])
    .distinct()
    .count()
)
print(global_unique_query_urls)

64544345
