In [None]:
from pyspark import SparkConf, SparkContext

# Spark settings for this notebook: the number of executor instances is set
# to 3 before the context is created.
conf = SparkConf().setAll(
    [("spark.executor.instances", 3)]
)

# TODO: For final evaluation, run on YARN cluster (pass `master="yarn"`).
sc = SparkContext(appName="web-archive-query-log-query-length", conf=conf)
sc

In [None]:
# Print the address of the web UI that belongs to the Spark context.
spark_ui_url = sc.uiWebUrl
print(spark_ui_url)

In [None]:
from pathlib import Path

# Corpus location. Currently the sample corpus is used.
# TODO: For final evaluation, use the full corpus.
# corpus_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/corpus/")
corpus_dir = Path(
    "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/sample-corpus/"
)
# The query records are read from the "queries" sub-directory of the corpus.
queries_dir = corpus_dir.joinpath("queries")

In [None]:
from pathlib import Path

# Output directory for the figures saved by this notebook. It is created
# together with any missing parents; an existing directory is not an error.
figures_dir = Path("figures")
figures_dir.mkdir(exist_ok=True, parents=True)

In [None]:
def characteristics(query: dict) -> tuple:
    """Return the characteristics of a query record as a tuple.

    Currently the only characteristic is the record's ``"service"`` value,
    so the result is a 1-tuple.

    :param query: Query record; must contain the key ``"service"``.
    :return: ``(service,)``.
    :raises KeyError: If ``query`` has no ``"service"`` key.
    """
    service = query["service"]
    return (service,)

In [None]:
from json import loads

def length_counts():
    """Count query records per (query length, characteristics...) combination.

    Reads the text files below ``queries_dir`` through a ``file://`` URI,
    parses every line as JSON, and discards records whose ``"url_query"`` is
    ``None``. Each remaining record is keyed by the length of its
    ``"url_query"`` followed by the values from ``characteristics(record)``.

    :return: The result of ``countByKey()``: a mapping from each key tuple to
        the number of records with that key.
    """

    def _has_url_query(query):
        # Records without a URL query have no length to count.
        return query["url_query"] is not None

    def _length_key(query):
        # Key layout: (query length, *characteristics).
        return (len(query["url_query"]), *characteristics(query))

    queries = sc.textFile(f"file://{queries_dir}").map(loads)
    keyed_queries = queries.filter(_has_url_query).keyBy(_length_key)
    return keyed_queries.countByKey()

In [None]:
from pandas import DataFrame

# Flatten the counts into one row per (query length, service) combination.
# The key unpacking relies on each key being a (length, service) pair.
length_count_rows = [
    {"query_length": length, "service": service, "count": count}
    for (length, service), count in length_counts().items()
]
df = DataFrame(length_count_rows)
df

In [None]:
from seaborn import histplot, kdeplot
from matplotlib.figure import Figure
from matplotlib.pyplot import subplots

# Histogram of query lengths, weighted by the number of queries of each
# length and colored by service. The figure is saved as a PNG in `figures_dir`.
fig: Figure
fig, ax = subplots()
# fig.tight_layout()
# NOTE(review): only rows with a count of at most 100 are kept, and of those
# only the first 10 rows are plotted -- this looks like a development subset;
# confirm before the final evaluation.
df_vis = df[df["count"] <= 100].iloc[:10]
histplot(
    data=df_vis,
    x="query_length",
    weights="count",
    hue="service",
    # Query lengths are integers, so use unit-width bins centered on each
    # length. The previous `bins=False` is not a valid bin specification
    # (it is interpreted as a bin count of 0, which NumPy rejects).
    discrete=True,
    ax=ax,
)
ax.set_xlabel("query length")
ax.set_ylabel("count")
# fig.savefig(figures_dir / "query-length-histogram.pdf")
fig.savefig(figures_dir / "query-length-histogram.png")
ax