In [1]:
from os import environ
import json
from pyspark.sql import SparkSession
from pathlib import Path
import pandas as pd
import yaml
import re

# Select the Python interpreter PySpark should use (the project's virtualenv)
# before the session is built.
environ["PYSPARK_PYTHON"] = (
    "/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/venv/bin/python"
)

# Create (or reuse) a Spark session on YARN with three executor instances.
session = (
    SparkSession.builder
    .master("yarn")
    .appName("obscene_queries")
    .config("spark.executor.instances", 3)
    .getOrCreate()
)

In [2]:
# Keep a handle on the session's SparkContext; it is used further down to read
# the query files. The bare name on the last line displays it as the cell output.
sc = session.sparkContext
sc

In [3]:
# TODO: For final evaluation, use the full corpus instead of the sample:
#   corpus_dir = focused_dir / "corpus"
#   queries_dir = corpus_dir / "queries-2023-02-14"
focused_dir = Path("/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused")
corpus_dir = focused_dir / "sample-corpus"
queries_dir = corpus_dir / "queries-2023-02-16"
queries_dir

PosixPath('/mnt/ceph/storage/data-in-progress/data-research/web-search/web-archive-query-log/focused/sample-corpus/queries-2023-02-16')

In [4]:
# Relative output directory; create it (including missing parents) and do not
# fail if it already exists.
figures_dir = Path("figures")
figures_dir.mkdir(parents=True, exist_ok=True)

In [5]:
def adapt_query(query: dict) -> tuple:
    """Normalize a parsed query record into a ``(service, language, text)`` tuple.

    The query text is lower-cased, the separators ``+``, ``-``, ``|`` and ``_``
    are turned into spaces, and every remaining character that is neither a
    word character nor whitespace is removed.

    :param query: Record with the keys ``"url_query"`` (a string),
        ``"service"`` and ``"language"``.
    :return: ``(service, language, normalized_text)``.
    :raises KeyError: If one of the three keys is missing.
    """
    query_text = query["url_query"].lower()
    # Treat common URL/word separators as whitespace.
    query_text = re.sub(r"[+\-|_]", " ", query_text)
    # Strip punctuation. This has to be a regex substitution: str.replace()
    # only searches for the literal pattern text and therefore never matched.
    query_text = re.sub(r"[^\w\s]", "", query_text)
    return query["service"], query["language"], query_text

In [6]:
# Load every query record (one JSON document per line), drop the records that
# have no URL query, and key the rest by query id. Each value is the
# (service, language, normalized text) tuple produced by adapt_query.
queries = (
    sc.textFile(f"file://{queries_dir}")
    .map(json.loads)
    .filter(lambda query: query["url_query"] is not None)
    .keyBy(lambda query: query["id"])
    .mapValues(adapt_query)
)

In [7]:
# Use list of obscene words (https://github.com/LDNOOBW/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words)
# The loaded object is indexed by language code further down (e.g. "en", "zh").
# Decode explicitly as UTF-8: the default text encoding depends on the locale,
# and the file holds words in non-Latin scripts.
with open("obscene_words.json", "r", encoding="utf-8") as file:
    obscene_words = json.load(file)

In [31]:
def check_query(query: str, words: list, lan: str) -> bool:
    """Return whether ``query`` contains any of ``words`` as a whole token.

    A word counts as contained when it occurs in ``query`` with no
    non-whitespace character directly before or after it, i.e. delimited by
    whitespace or by the start/end of the string.

    :param query: Query text to search in.
    :param words: Words or phrases to look for; each is matched literally.
    :param lan: Language code of the word list. Not used by the check itself;
        kept so existing callers keep working.
    :return: ``True`` if at least one word occurs, ``False`` otherwise
        (including for an empty ``words`` list).
    """
    # re.escape() is required because list entries may contain regex
    # metacharacters (".", "+", "*", "$", ...). Unescaped, they would be read
    # as regex syntax and could match unrelated text or raise re.error.
    return any(
        re.search(rf"(?<!\S){re.escape(word)}(?!\S)", query) is not None
        for word in words
    )
        

def check_obscenity(row: tuple, obscene_words: dict) -> tuple:
    """Append an obscenity flag to a keyed query row.

    ``row[1]`` is indexed as ``(service, language, query_text)``. The query is
    first checked against the word list of its own language (an empty list if
    that language has none). Only when nothing matches are the English and
    Chinese lists tried as fallbacks, each one skipped when it is the query's
    own language. The Chinese fallback list is used without its first entry.

    :param row: Keyed row whose second element holds language and query text.
    :param obscene_words: Mapping from language code to a list of words; must
        contain ``"en"`` and ``"zh"`` for the fallback checks.
    :return: The elements of ``row`` followed by the boolean flag.
    """
    details = row[1]
    language, query = details[1], details[2]

    own_words = obscene_words.get(language, [])
    obscene = check_query(query, own_words, language)

    if not obscene:
        # Both fallbacks are evaluated independently before being combined.
        hit_en = language != "en" and check_query(query, obscene_words["en"], "en")
        hit_zh = language != "zh" and check_query(query, obscene_words["zh"][1:], "zh")
        obscene = hit_en or hit_zh

    return (*row, obscene)

In [None]:
# Flag every query for obscenity in its own language (with the English and
# Chinese word lists as fallbacks), then count how often each flagged row occurs.
flagged_rows = queries.map(lambda row: check_obscenity(row, obscene_words))
checked_queries = flagged_rows.countByValue()

In [None]:
# Flatten the counted rows into one record per distinct
# (id, (service, language, query), obscene) key; the id is not carried over.
record_columns = ("service", "language", "query", "obscene", "count")
df = pd.DataFrame([
    dict(zip(record_columns, (*details, obscene, count)))
    for (_, details, obscene), count in checked_queries.items()
])

In [None]:
# Read the list of selected services.
yaml_path = "../data/selected-services.yaml"
with open(yaml_path, "r") as file:
    yaml_list = yaml.safe_load(file)

# Look-up table: service name -> category.
category_dict = {elem["name"]: elem["category"] for elem in yaml_list}

In [None]:
# TODO: Standardize naming
# Some service names in the query data are rewritten before the category
# look-up (presumably to match the names in the services list - verify).
for old_name, new_name in (("search", "search-ch"), ("mail.ru", "mail")):
    df.loc[df["service"] == old_name, "service"] = new_name

# Raises KeyError for any service that has no entry in category_dict.
df["category"] = df["service"].apply(lambda name: category_dict[name])

In [None]:
# Query totals per service, split by the obscenity flag.
obscene_service = df.loc[df["obscene"]].groupby("service")["count"].sum()
non_obscene_service = df.loc[~df["obscene"]].groupby("service")["count"].sum()

# Outer join keeps services that appear on only one side; their missing
# total becomes 0.
service_df = (
    pd.concat(
        [obscene_service.rename("obscene"), non_obscene_service.rename("non-obscene")],
        join="outer",
        axis=1,
    )
    .reset_index()
    .fillna(0)
)

In [None]:
# Query totals per category, split by the obscenity flag.
obscene_category = df.loc[df["obscene"]].groupby("category")["count"].sum()
non_obscene_category = df.loc[~df["obscene"]].groupby("category")["count"].sum()

# Outer join keeps categories that appear on only one side; their missing
# total becomes 0.
category_df = (
    pd.concat(
        [obscene_category.rename("obscene"), non_obscene_category.rename("non-obscene")],
        join="outer",
        axis=1,
    )
    .reset_index()
    .fillna(0)
)

In [None]:
# Share of obscene queries = obscene / (obscene + non-obscene), per service
# and per category.
service_total = service_df["obscene"] + service_df["non-obscene"]
service_df["share"] = service_df["obscene"].div(service_total)

category_total = category_df["obscene"] + category_df["non-obscene"]
category_df["share"] = category_df["obscene"].div(category_total)

# Highest share first.
category_df = category_df.sort_values("share", ascending=False)
service_df = service_df.sort_values("share", ascending=False)

# Attach each service's category; raises KeyError for a service that is
# missing from category_dict.
service_df["category"] = service_df["service"].apply(lambda name: category_dict[name])

In [None]:
# Write both summary tables as CSV.
# NOTE(review): the file names spell it "obsenity"; kept as-is so the output
# paths do not change.
for frame, csv_path in (
    (category_df, "./figures/obsenity_categories.csv"),
    (service_df, "./figures/obsenity_services.csv"),
):
    frame.to_csv(csv_path)

In [None]:
# First rows of the categories that have more than 10 queries in total.
category_queries = category_df["obscene"] + category_df["non-obscene"]
category_df.loc[category_queries > 10].head()

In [None]:
# First rows of the services with more than 50 queries in total, leaving out
# the "pornography" category.
service_queries = service_df["obscene"] + service_df["non-obscene"]
has_enough_queries = service_queries > 50
not_pornography = service_df["category"] != "pornography"
service_df.loc[has_enough_queries & not_pornography].head()

In [None]:
# Overall share of obscene queries across all services.
obs, non_obs = service_df["obscene"].sum(), service_df["non-obscene"].sum()

obs / (obs + non_obs)

In [None]:
# All rows flagged as obscene for the "baidu" service.
is_baidu = df["service"].eq("baidu")
df.loc[df["obscene"] & is_baidu]