In [14]:
from os import environ
from pathlib import Path
from pyspark import SparkConf
from pyspark.sql import SparkSession

# PYSPARK_PYTHON is the environment variable PySpark reads to choose its
# Python interpreter; here it points at the project's virtualenv.
environ['PYSPARK_PYTHON'] = "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"

# Create (or reuse) a YARN-backed session with three executor instances.
session = (
    SparkSession.builder
    .master("yarn")
    .appName("web-archive-query-log-query-length")
    .config("spark.executor.instances", 3)
    .getOrCreate()
)

In [15]:
# Shortcut to the session's SparkContext; the RDD pipeline below is built on it.
sc = session.sparkContext
# Bare expression so the notebook renders the context as the cell output.
sc

In [16]:
from pathlib import Path

# TODO: For final evaluation, use the full corpus.
# corpus_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/corpus")
# queries_dir = corpus_dir / "queries-2023-02-14"

# Until then, work on the sample corpus; its query records live in "queries".
corpus_dir = Path(
    "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/sample-corpus"
)
queries_dir = corpus_dir.joinpath("queries")

In [17]:
from pathlib import Path

# Output directory for rendered figures (relative to the working directory).
# Created together with any missing parents; an existing directory is reused.
figures_dir = Path("figures")
figures_dir.mkdir(exist_ok=True, parents=True)

In [18]:
from typing import Optional


def detect_language(text: str, min_score: float = 0.5) -> Optional[str]:
    """Guess the language of ``text`` with ``ftlangdetect``.

    Args:
        text: The text to classify. Newlines are replaced by spaces before
            the detector is called.
        min_score: Minimum detector score required to accept the guess.
            Defaults to 0.5.

    Returns:
        The detected language code (the detector's ``"lang"`` value), or
        ``None`` when the detector's score is below ``min_score``.
    """
    # Imported inside the function rather than at cell level -- presumably so
    # the import is resolved wherever the function actually runs (e.g. on the
    # Spark executors); confirm before hoisting it.
    from ftlangdetect import detect

    # NOTE(review): the detector presumably expects single-line input, hence
    # the newline replacement.
    text = text.replace("\n", " ")
    language = detect(text)
    if language["score"] < min_score:
        # Not confident enough: report "unknown" instead of a guess.
        return None
    return language["lang"]

In [19]:
from json import loads
from pandas import DataFrame

def _length_service_language(query):
    """Build the grouping key: (URL query length, service, detected language)."""
    url_query = query["url_query"]
    return len(url_query), query["service"], detect_language(url_query)


# Parse every line of the query files as JSON, drop records without a URL
# query, and count the records per (length, service, language) key.
counts = (
    sc.textFile(f"file://{queries_dir}")
    .map(loads)
    .filter(lambda query: query["url_query"] is not None)
    .keyBy(_length_service_language)
    .countByKey()
)

In [20]:
from numpy import percentile

# One row per (length, service, language) key together with its record count.
records = [
    dict(url_query_length=length, service=service_name, language=lang, count=n)
    for (length, service_name, lang), n in counts.items()
]
df = DataFrame(records)


def _total_count(service):
    """Sum the counts of all rows that belong to ``service``."""
    return df.loc[df["service"] == service, "count"].sum()


# Rank the services by their total count and keep only the top five.
services = sorted(df["service"].unique(), key=_total_count, reverse=True)[:5]
df = df[df["service"].isin(services)]
df

Unnamed: 0,url_query_length,service,language,count
0,14,baidu,,17467
1,13,baidu,,55
2,31,google,en,396
3,41,google,en,181
4,26,google,en,574
...,...,...,...,...
13454,27,youtube,sah,1
13459,82,google,fr,1
13468,115,google,he,1
13476,38,google,ja,1


In [None]:
from seaborn import displot

# Rename the columns first: the display names are what the plot refers to
# for its x variable, weights and hue.
plot_data = df.rename(columns={
    "url_query_length": "Query Length",
    "count": "Count",
    "service": "Service",
})

# Stacked histogram of query lengths with one bin per unit of length; each
# row contributes its count as weight and the bars are coloured by service.
plot = displot(
    data=plot_data,
    x="Query Length",
    weights="Count",
    hue="Service",
    binwidth=1,
    multiple="stack",
    aspect=2,
    linewidth=0,
)
plot.set_axis_labels("Query Length", "Count")

# Save the same figure as both PDF and PNG.
for filename in ("query-length-histogram.pdf", "query-length-histogram.png"):
    plot.savefig(figures_dir / filename)
plot

  baselines.iloc[:, cols] = (curves


In [None]:
# Disabled experiment: copy `df` and clip its counts at the 99th percentile.
# from numpy import percentile

# df2 = df.copy()
# df2["count"] = df2["count"].clip(upper=percentile(df["count"], 99))
# df2

In [None]:
# Disabled experiment: the same histogram drawn from `df2` and saved under
# the "without-outliers" file names.
# from seaborn import displot

# plot = displot(
#     data=df2.rename(columns={
#         "url_query_length": "Query Length",
#         "count": "Count",
#         "service": "Service",
#     }),
#     x="Query Length",
#     weights="Count",
#     hue="Service",
#     binwidth=1,
#     multiple="stack",
#     aspect=2,
#     linewidth=0,
# )
# plot.savefig(figures_dir / "query-length-histogram-without-outliers.pdf")
# plot.savefig(figures_dir / "query-length-histogram-without-outliers.png")
# plot